Commit b079ca14 authored by Grégory Wisniewski

Split election process into shorter methods to highlight logic.

git-svn-id: https://svn.erp5.org/repos/neo/trunk@1463 71dcc9de-d417-0410-9af5-da40c76e7ee4
parent 651e51b3
...@@ -124,6 +124,7 @@ class Application(object): ...@@ -124,6 +124,7 @@ class Application(object):
# Reelect a new primary master. # Reelect a new primary master.
self.electPrimary(bootstrap = False) self.electPrimary(bootstrap = False)
def electPrimary(self, bootstrap = True): def electPrimary(self, bootstrap = True):
"""Elect a primary master node. """Elect a primary master node.
...@@ -136,24 +137,47 @@ class Application(object): ...@@ -136,24 +137,47 @@ class Application(object):
self.unconnected_master_node_set = set() self.unconnected_master_node_set = set()
self.negotiating_master_node_set = set() self.negotiating_master_node_set = set()
self.listening_conn.setHandler(election.ServerElectionHandler(self)) self.listening_conn.setHandler(election.ServerElectionHandler(self))
client_handler = election.ClientElectionHandler(self)
em = self.em
nm = self.nm
for node in nm.getMasterList(): for node in self.nm.getMasterList():
# For now, believe that every node should be available, # For now, believe that every node should be available,
# since down or broken nodes may be already repaired. # since down or broken nodes may be already repaired.
node.setRunning() node.setRunning()
while 1: try:
t = 0 while True:
self.primary = None
self.primary_master_node = None
for node in nm.getMasterList(): # handle new connected masters
for node in self.nm.getMasterList():
if node.isRunning(): if node.isRunning():
self.unconnected_master_node_set.add(node.getAddress()) self.unconnected_master_node_set.add(node.getAddress())
# start the election process
self.primary = None
self.primary_master_node = None
self._doElection(bootstrap)
self.primary = self.primary is None
if self.primary:
# i'm the primary, send the announcement
self._announcePrimary()
else:
# otherwise, wait for the primary announcement
self._waitForPrimaryAnnouncement()
break
except ElectionFailure, m:
# something went wrong, clean up then restart
self._electionFailed(m)
bootstrap = False
def _doElection(self, bootstrap):
"""
Start the election process:
- Try to connect to any known master node
- Wait at most for the timeout defined by the bootstrap parameter
When done, the current process is defined either as primary or
secondary master node
"""
# Wait at most 20 seconds at bootstrap. Otherwise, wait at most # Wait at most 20 seconds at bootstrap. Otherwise, wait at most
# 10 seconds to avoid stopping the whole cluster for a long time. # 10 seconds to avoid stopping the whole cluster for a long time.
# Note that even if not all master are up in the first 20 seconds # Note that even if not all master are up in the first 20 seconds
...@@ -163,13 +187,13 @@ class Application(object): ...@@ -163,13 +187,13 @@ class Application(object):
expiration = 20 expiration = 20
else: else:
expiration = 10 expiration = 10
client_handler = election.ClientElectionHandler(self)
try: t = 0
while 1: while True:
current_time = time() current_time = time()
if current_time >= t + 1: if current_time >= t + 1:
t = current_time t = current_time
for node in nm.getMasterList(): for node in self.nm.getMasterList():
if node.isTemporarilyDown() \ if node.isTemporarilyDown() \
and node.getLastStateChange() + \ and node.getLastStateChange() + \
expiration < current_time: expiration < current_time:
...@@ -181,64 +205,74 @@ class Application(object): ...@@ -181,64 +205,74 @@ class Application(object):
# Try to connect to master nodes. # Try to connect to master nodes.
if self.unconnected_master_node_set: if self.unconnected_master_node_set:
for addr in list(self.unconnected_master_node_set): for addr in list(self.unconnected_master_node_set):
ClientConnection(em, client_handler, addr=addr, ClientConnection(self.em, client_handler, addr=addr,
connector_handler=self.connector_handler) connector_handler=self.connector_handler)
em.poll(1) self.em.poll(1)
if len(self.unconnected_master_node_set) == 0 \ if len(self.unconnected_master_node_set) == 0 \
and len(self.negotiating_master_node_set) == 0: and len(self.negotiating_master_node_set) == 0:
break break
# Now there are three situations:
# - I am the primary master def _announcePrimary(self):
# - I am secondary but don't know who is primary """
# - I am secondary and know who is primary Broadcast the announce that I'm the primary
if self.primary is None: """
# I am the primary. # I am the primary.
self.primary = True
logging.debug('I am the primary, sending an announcement') logging.debug('I am the primary, sending an announcement')
for conn in em.getClientList(): for conn in self.em.getClientList():
conn.notify(Packets.AnnouncePrimary()) conn.notify(Packets.AnnouncePrimary())
conn.abort() conn.abort()
t = time() t = time()
while em.getClientList(): while self.em.getClientList():
em.poll(1) self.em.poll(1)
if t + 10 < time(): if t + 10 < time():
for conn in em.getClientList(): for conn in self.em.getClientList():
conn.close() conn.close()
break break
else:
def _waitForPrimaryAnnouncement(self):
"""
Wait for the primary announcement as I'm not the primary.
If this is too long, raise ElectionFailure to restart the whole
election process.
"""
# Wait for an announcement. If this is too long, probably # Wait for an announcement. If this is too long, probably
# the primary master is down. # the primary master is down.
t = time() t = time()
while self.primary_master_node is None: while self.primary_master_node is None:
em.poll(1) self.em.poll(1)
if t + 10 < time(): if t + 10 < time():
raise ElectionFailure, 'no primary master elected' # election timeout
raise ElectionFailure("Election timeout")
# Now I need only a connection to the primary master node. # Now I need only a connection to the primary master node.
primary = self.primary_master_node primary = self.primary_master_node
addr = primary.getAddress() addr = primary.getAddress()
for conn in em.getServerList(): for conn in self.em.getServerList():
conn.close() conn.close()
for conn in em.getClientList(): for conn in self.em.getClientList():
if conn.getAddress() != addr: if conn.getAddress() != addr:
conn.close() conn.close()
# But if there is no such connection, something wrong # But if there is no such connection, something wrong
# happened. # happened.
for conn in em.getClientList(): for conn in self.em.getClientList():
if conn.getAddress() == addr: if conn.getAddress() == addr:
# primary master elected and connected
break break
else: else:
raise ElectionFailure, 'no connection remains to ' \ raise ElectionFailure('No connection remains to the primary')
'the primary'
return
except ElectionFailure, m: def _electionFailed(self, m):
logging.error('election failed; %s' % m) """
Ask other masters to reelect a primary after an election failure.
"""
logging.error('election failed: %s', (m, ))
# Ask all connected nodes to reelect a single primary master. # Ask all connected nodes to reelect a single primary master.
for conn in em.getClientList(): for conn in self.em.getClientList():
conn.notify(Packets.ReelectPrimary()) conn.notify(Packets.ReelectPrimary())
conn.abort() conn.abort()
...@@ -246,9 +280,9 @@ class Application(object): ...@@ -246,9 +280,9 @@ class Application(object):
self.primary = None self.primary = None
self.primary_master_node = None self.primary_master_node = None
t = time() t = time()
while em.getClientList(): while self.em.getClientList():
try: try:
em.poll(1) self.em.poll(1)
except ElectionFailure: except ElectionFailure:
pass pass
if time() > t + 10: if time() > t + 10:
...@@ -256,11 +290,11 @@ class Application(object): ...@@ -256,11 +290,11 @@ class Application(object):
break break
# Close all connections. # Close all connections.
for conn in em.getClientList(): for conn in self.em.getClientList():
conn.close() conn.close()
for conn in em.getServerList(): for conn in self.em.getServerList():
conn.close() conn.close()
bootstrap = False
def broadcastNodesInformation(self, node_list): def broadcastNodesInformation(self, node_list):
""" """
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment