Commit b079ca14 authored by Grégory Wisniewski

Split election process into shorter methods to highlight logic.

git-svn-id: https://svn.erp5.org/repos/neo/trunk@1463 71dcc9de-d417-0410-9af5-da40c76e7ee4
parent 651e51b3
......@@ -124,6 +124,7 @@ class Application(object):
# Reelect a new primary master.
self.electPrimary(bootstrap = False)
def electPrimary(self, bootstrap = True):
"""Elect a primary master node.
......@@ -136,24 +137,47 @@ class Application(object):
self.unconnected_master_node_set = set()
self.negotiating_master_node_set = set()
self.listening_conn.setHandler(election.ServerElectionHandler(self))
client_handler = election.ClientElectionHandler(self)
em = self.em
nm = self.nm
for node in nm.getMasterList():
for node in self.nm.getMasterList():
# For now, believe that every node should be available,
# since down or broken nodes may be already repaired.
node.setRunning()
while 1:
t = 0
self.primary = None
self.primary_master_node = None
try:
while True:
for node in nm.getMasterList():
# handle new connected masters
for node in self.nm.getMasterList():
if node.isRunning():
self.unconnected_master_node_set.add(node.getAddress())
# start the election process
self.primary = None
self.primary_master_node = None
self._doElection(bootstrap)
self.primary = self.primary is None
if self.primary:
# i'm the primary, send the announcement
self._announcePrimary()
else:
# otherwise, wait for the primary announcement
self._waitForPrimaryAnnouncement()
break
except ElectionFailure, m:
# something goes wrong, clean then restart
self._electionFailed(m)
bootstrap = False
def _doElection(self, bootstrap):
"""
Start the election process:
- Try to connect to any known master node
- Wait at most for the timeout defined by the bootstrap parameter
When done, the current process is defined either as primary or
secondary master node
"""
# Wait at most 20 seconds at bootstrap. Otherwise, wait at most
# 10 seconds to avoid stopping the whole cluster for a long time.
# Note that even if not all master are up in the first 20 seconds
......@@ -163,13 +187,13 @@ class Application(object):
expiration = 20
else:
expiration = 10
try:
while 1:
client_handler = election.ClientElectionHandler(self)
t = 0
while True:
current_time = time()
if current_time >= t + 1:
t = current_time
for node in nm.getMasterList():
for node in self.nm.getMasterList():
if node.isTemporarilyDown() \
and node.getLastStateChange() + \
expiration < current_time:
......@@ -181,64 +205,74 @@ class Application(object):
# Try to connect to master nodes.
if self.unconnected_master_node_set:
for addr in list(self.unconnected_master_node_set):
ClientConnection(em, client_handler, addr=addr,
ClientConnection(self.em, client_handler, addr=addr,
connector_handler=self.connector_handler)
em.poll(1)
self.em.poll(1)
if len(self.unconnected_master_node_set) == 0 \
and len(self.negotiating_master_node_set) == 0:
break
# Now there are three situations:
# - I am the primary master
# - I am secondary but don't know who is primary
# - I am secondary and know who is primary
if self.primary is None:
def _announcePrimary(self):
"""
Broadcast the announcement that I'm the primary
"""
# I am the primary.
self.primary = True
logging.debug('I am the primary, sending an announcement')
for conn in em.getClientList():
for conn in self.em.getClientList():
conn.notify(Packets.AnnouncePrimary())
conn.abort()
t = time()
while em.getClientList():
em.poll(1)
while self.em.getClientList():
self.em.poll(1)
if t + 10 < time():
for conn in em.getClientList():
for conn in self.em.getClientList():
conn.close()
break
else:
def _waitForPrimaryAnnouncement(self):
"""
Wait for the primary announcement, as I'm not the primary.
If this is too long, raise ElectionFailure to restart the whole
election process.
"""
# Wait for an announcement. If this is too long, probably
# the primary master is down.
t = time()
while self.primary_master_node is None:
em.poll(1)
self.em.poll(1)
if t + 10 < time():
raise ElectionFailure, 'no primary master elected'
# election timeout
raise ElectionFailure("Election timeout")
# Now I need only a connection to the primary master node.
primary = self.primary_master_node
addr = primary.getAddress()
for conn in em.getServerList():
for conn in self.em.getServerList():
conn.close()
for conn in em.getClientList():
for conn in self.em.getClientList():
if conn.getAddress() != addr:
conn.close()
# But if there is no such connection, something wrong
# happened.
for conn in em.getClientList():
for conn in self.em.getClientList():
if conn.getAddress() == addr:
# primary master elected and connected
break
else:
raise ElectionFailure, 'no connection remains to ' \
'the primary'
raise ElectionFailure('No connection remains to the primary')
return
except ElectionFailure, m:
logging.error('election failed; %s' % m)
def _electionFailed(self, m):
"""
Ask other masters to reelect a primary after an election failure.
"""
logging.error('election failed: %s', (m, ))
# Ask all connected nodes to reelect a single primary master.
for conn in em.getClientList():
for conn in self.em.getClientList():
conn.notify(Packets.ReelectPrimary())
conn.abort()
......@@ -246,9 +280,9 @@ class Application(object):
self.primary = None
self.primary_master_node = None
t = time()
while em.getClientList():
while self.em.getClientList():
try:
em.poll(1)
self.em.poll(1)
except ElectionFailure:
pass
if time() > t + 10:
......@@ -256,11 +290,11 @@ class Application(object):
break
# Close all connections.
for conn in em.getClientList():
for conn in self.em.getClientList():
conn.close()
for conn in em.getServerList():
for conn in self.em.getServerList():
conn.close()
bootstrap = False
def broadcastNodesInformation(self, node_list):
"""
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment