New feature: monitoring

This task is done by the admin node, in 2 possible ways: - email notifications, as soon as some state change; - new 'neoctl print summary' command that can be used periodically to check the health of the database. They report the same information. About backup clusters: The admin of the main cluster also monitors selected backup clusters, with the help of their admin nodes. Internally, when a backup master node connects to the upstream master node, it receives the address of the upstream admin node and forwards it to its admin node, which is therefore able to connect to the upstream admin node. So the 2 admin nodes remain connected and communicate in 2 ways: - the backup node notifies upstream about the health of the backup cluster; - the upstream node queries the backup node periodically to check whether replication is not too late. TODO: A few things are hard-coded and we may want to configure them: - backup lateness is checked every 10 min; - backup is expected to never be late. There's also no delay to prevent 2 consecutive emails from having the same Date: (unfortunately, the RFC 5322 does not allow sub-second precision), in which case the MUA can display them in random order. This is mostly confusing when one notification is OK and the other is not, because one may wonder if there's a new problem.

New feature: monitoring
This task is done by the admin node, in 2 possible ways: - email notifications, as soon as some state change; - new 'neoctl print summary' command that can be used periodically to check the health of the database. They report the same information. About backup clusters: The admin of the main cluster also monitors selected backup clusters, with the help of their admin nodes. Internally, when a backup master node connects to the upstream master node, it receives the address of the upstream admin node and forwards it to its admin node, which is therefore able to connect to the upstream admin node. So the 2 admin nodes remain connected and communicate in 2 ways: - the backup node notifies upstream about the health of the backup cluster; - the upstream node queries the backup node periodically to check whether replication is not too late. TODO: A few things are hard-coded and we may want to configure them: - backup lateness is checked every 10 min; - backup is expected to never be late. There's also no delay to prevent 2 consecutive emails from having the same Date: (unfortunately, the RFC 5322 does not allow sub-second precision), in which case the MUA can display them in random order. This is mostly confusing when one notification is OK and the other is not, because one may wonder if there's a new problem.
e434c253 · Julien Muchembled · 82c142c4 · e434c253 · e434c253 · e434c253
Commit e434c253 authored May 31, 2019 by Julien Muchembled
18 changed files
--- a/neo/admin/app.py
+++ b/neo/admin/app.py
+# -*- coding: utf-8 -*-
 #
 # Copyright (C) 2006-2019  Nexedi SA
 #
@@ -14,17 +15,89 @@
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.

+import getpass, os, smtplib
+from collections import Counter
+from email.mime.text import MIMEText
+from email.utils import formataddr, formatdate
+from time import time
+from traceback import format_exc
 from neo.lib import logging
 from neo.lib.app import BaseApplication, buildOptionParser
-from neo.lib.connection import ListeningConnection
+from neo.lib.connection import ClientConnection, ListeningConnection, \
+    ConnectionClosed
 from neo.lib.exception import PrimaryFailure
-from .handler import AdminEventHandler, MasterEventHandler
+from .handler import AdminEventHandler, BackupHandler, MasterEventHandler, \
+    UpstreamAdminHandler, NOT_CONNECTED_MESSAGE
 from neo.lib.bootstrap import BootstrapManager
-from neo.lib.protocol import ClusterStates, Errors, NodeTypes, Packets
+from neo.lib.logger import INF
+from neo.lib.protocol import \
+    CellStates, ClusterStates, Errors, NodeTypes, Packets
 from neo.lib.debug import register as registerLiveDebugger
+from neo.lib.util import add64, datetimeFromTID, dump
+
+class Monitor(object):
+
+    def __init__(self):
+        self.down = 0
+        self.monitor_changed = False
+        self.pt_summary = None
+
+    def askLastIds(self, conn,
+            _askLastTransaction=Packets.AskLastTransaction(),
+            _askRecovery=Packets.AskRecovery()):
+        if self.cluster_state == ClusterStates.BACKINGUP:
+            conn.ask(_askRecovery)
+        conn.ask(_askLastTransaction)
+
+    @property
+    def operational(self):
+        return self.cluster_state in (ClusterStates.BACKINGUP,
+                                      ClusterStates.RUNNING)
+
+    @property
+    def severity(self):
+        return (2 if self.down or not self.operational else
+                1 if list(self.pt_summary) != [CellStates.UP_TO_DATE] or
+                     isinstance(self, Backup) and
+                     self.cluster_state != ClusterStates.BACKINGUP else
+                0)
+
+    def formatSummary(self, upstream=None):
+        summary = self.pt_summary
+        summary = '%s; %s' % (self.cluster_state,
+            ', '.join('%s=%s' % pt for pt in sorted(summary.iteritems()))
+            ) if summary else str(self.cluster_state)
+        if self.down:
+            summary += '; DOWN=%s' % self.down
+        if self.operational:
+            backup = self.cluster_state == ClusterStates.BACKINGUP
+            tid = self.backup_tid if backup else self.ltid
+            x = datetimeFromTID(tid)
+            if upstream and backup:
+                lag = (upstream[0] - x).total_seconds()
+                if lag or tid >= upstream[1]:
+                    lagging = self.max_lag < lag
+                else:
+                    lag = 'ε'
+                    lagging = self.max_lag <= 0
+                extra = '; lag=%s' % lag
+                if self.lagging != lagging:
+                    self.lagging = lagging
+                    self.monitor_changed = True
+            else:
+                extra = ' (%s)' % x
+            return (x, tid), '%s; ltid=%s%s' % (summary, dump(tid), extra)
+        return None, summary
+
+class Backup(Monitor):
+
+    cluster_state = None
+    conn = None
+    lagging = False
+    max_lag = 0

 @buildOptionParser
-class Application(BaseApplication):
+class Application(BaseApplication, Monitor):
    """The storage node application."""

    @classmethod
@@ -33,12 +106,19 @@ class Application(BaseApplication):
        _.description = "NEO Admin node"
        cls.addCommonServerOptions('admin', '127.0.0.1:9999')

+        hint = ' (the option can be repeated)'
        _ = _.group('admin')
+        _('monitor-email', multiple=True,
+            help='recipient email for notifications' + hint)
+        _('monitor-backup', multiple=True,
+            help='name of backup cluster to monitor' + hint)
+        _('smtp', metavar='HOST[:PORT]',
+            help='SMTP for email notifications')
        _.int('i', 'nid',
            help="specify an NID to use for this process (testing purpose)")

    def __init__(self, config):
-        super(Application, self).__init__(
+        BaseApplication.__init__(self,
            config.get('ssl'), config.get('dynamic_master_list'))
        for address in config['masters']:
            self.nm.createMaster(address=address)
@@ -46,6 +126,23 @@ class Application(BaseApplication):
        self.name = config['cluster']
        self.server = config['bind']

+        self.backup_dict = {x: Backup()
+            for x in config.get('monitor_backup', ())}
+        self.email_list = config.get('monitor_email', ())
+        if self.email_list:
+            self.smtp = smtplib.SMTP()
+            self.smtp_host = config.get('smtp') or 'localhost'
+            email_from = os.getenv('EMAIL')
+            if not email_from:
+              try:
+                email_from = getpass.getuser()
+              except Exception:
+                email_from = None
+            self.email_from = formataddr(("NEO " + self.name, email_from))
+        self.smtp_exc = None
+        self.smtp_retry = INF
+        self.notifying = set()
+
        logging.debug('IP address is %s, port is %d', *self.server)

        # The partition table is initialized after getting the number of
@@ -53,8 +150,11 @@ class Application(BaseApplication):
        self.pt = None
        self.uuid = config.get('nid')
        logging.node(self.name, self.uuid)
+        self.backup_handler = BackupHandler(self)
        self.master_event_handler = MasterEventHandler(self)
+        self.upstream_admin_handler = UpstreamAdminHandler(self)
        self.cluster_state = None
+        self.upstream_admin = self.upstream_admin_conn = None
        self.reset()
        registerLiveDebugger(on_log=self.log)

@@ -63,6 +163,8 @@ class Application(BaseApplication):
        super(Application, self).close()

    def reset(self):
+        Monitor.__init__(self)
+        self.asking_monitor_information = []
        self.master_conn = None
        self.master_node = None

@@ -112,13 +214,155 @@ class Application(BaseApplication):
        """
        self.cluster_state = None
        # search, find, connect and identify to the primary master
-        bootstrap = BootstrapManager(self, NodeTypes.ADMIN, self.server)
+        bootstrap = BootstrapManager(self, NodeTypes.ADMIN, self.server,
+                                     backup=list(self.backup_dict))
        self.master_node, self.master_conn = bootstrap.getPrimaryConnection()

        # passive handler
        self.master_conn.setHandler(self.master_event_handler)
        self.master_conn.ask(Packets.AskClusterState())

+    def connectToUpstreamAdmin(self):
+        if self.listening_conn: # if running
+            conn = self.upstream_admin_conn = ClientConnection(
+                self, self.upstream_admin_handler, self.upstream_admin)
+            conn.ask(Packets.RequestIdentification(NodeTypes.ADMIN,
+                None, None, self.name, None, {}))
+
+    def partitionTableUpdated(self):
+        pt = self.pt
+        if pt:
+            down_set = set()
+            pt_summary = Counter()
+            for offset in xrange(pt.np):
+                for cell in pt.getCellList(offset):
+                    node = cell.getNode()
+                    if not node.isRunning():
+                        down_set.add(node)
+                    pt_summary.update((cell.getState(),))
+            self.updateMonitorInformation(None,
+                down=len(down_set), pt_summary=dict(pt_summary))
+
+    def askMonitorInformation(self, conn):
+        asking = self.asking_monitor_information or self.notifying
+        self.asking_monitor_information.append((conn, conn.getPeerId()))
+        if not asking:
+            self._notify(self.operational)
+
+    def updateMonitorInformation(self, name, **kw):
+        monitor = self if name is None else self.backup_dict[name]
+        kw = {k: v for k, v in kw.iteritems() if v != getattr(monitor, k)}
+        if not kw:
+            return
+        monitor.monitor_changed = True
+        monitor.__dict__.update(kw)
+        if name is None and self.upstream_admin_conn:
+            self.upstream_admin_conn.send(Packets.NotifyMonitorInformation(kw))
+        if not self.notifying:
+            self.em.setTimeout(None, None)
+            self._notify(self.operational)
+
+    def _notify(self, ask_ids=True,
+                _askLastTransaction=Packets.AskLastTransaction(),
+                _askRecovery=Packets.AskRecovery()):
+        if ask_ids:
+            self.askLastIds(self.master_conn)
+            self.notifying = notifying = {None}
+            for name, monitor in self.backup_dict.iteritems():
+                if monitor.operational:
+                    monitor.askLastIds(monitor.conn)
+                    notifying.add(name)
+        if self.notifying or self.cluster_state is None is not self.master_conn:
+            return
+        severity = [], [], []
+        my_severity = self.severity
+        severity[my_severity].append(self.name)
+        changed = set()
+        if self.monitor_changed:
+            self.monitor_changed = False
+            changed.add(self.name)
+        if self.master_conn is None:
+            body = NOT_CONNECTED_MESSAGE
+        else:
+            upstream, body = self.formatSummary()
+            body = [body]
+            for name, backup in self.backup_dict.iteritems():
+                body += '', name, '    ' + backup.formatSummary(upstream)[1]
+                severity[backup.severity or backup.lagging].append(name)
+                if backup.monitor_changed:
+                    backup.monitor_changed = False
+                    changed.add(name)
+            body = '\n'.join(body)
+        if changed or self.smtp_retry < time():
+            logging.debug('monitor notification')
+            email_list = self.email_list
+            while email_list: # not a loop
+                msg = MIMEText(body + (self.smtp_exc or ''))
+                msg['Date'] = formatdate()
+                clusters, x = severity[1:]
+                while 1:
+                    if x:
+                        clusters = clusters + x
+                        x = 'PROBLEM'
+                    elif clusters:
+                        x = 'WARNING'
+                    else:
+                        x = 'OK'
+                        break
+                    clusters = changed.intersection(clusters)
+                    if clusters:
+                        x += ' (%s)' % ', '.join(sorted(clusters))
+                    break
+                msg['Subject'] = 'NEO monitoring: ' + x
+                msg['From'] = self.email_from
+                msg['To'] = ', '.join(email_list)
+                s = self.smtp
+                try:
+                    s.connect(self.smtp_host)
+                    s.sendmail(None, email_list, msg.as_string())
+                except Exception:
+                    x = format_exc()
+                    logging.error(x)
+                    if changed or not self.smtp_exc:
+                        self.smtp_exc = (
+                            "\n\nA notification could not be sent at %s:\n\n%s"
+                            % (msg['Date'], x))
+                    retry = self.smtp_retry = time() + 600
+                else:
+                    self.smtp_exc = None
+                    self.smtp_retry = INF
+                    if not (self.operational and any(monitor.operational
+                            for monitor in self.backup_dict.itervalues())):
+                        break
+                    retry = time() + 600
+                finally:
+                    s.close()
+                self.em.setTimeout(retry, self._notify)
+                break
+        neoctl = self.asking_monitor_information
+        if neoctl:
+            del severity[my_severity][0]
+            if self.smtp_exc:
+                my_severity = 2
+                body += self.smtp_exc
+            severity[1].sort()
+            severity[2].sort()
+            severity[my_severity].insert(0, None)
+            p = Packets.AnswerMonitorInformation(severity[1], severity[2], body)
+            for conn, msg_id in neoctl:
+                try:
+                    conn.send(p, msg_id)
+                except ConnectionClosed:
+                    pass
+            del self.asking_monitor_information[:]
+
+    def maybeNotify(self, name):
+        try:
+            self.notifying.remove(name)
+        except KeyError:
+            return
+        self._notify(False)
+
    def sendPartitionTable(self, conn, min_offset, max_offset, uuid):
        pt = self.pt
        if max_offset == 0:

--- a/neo/admin/handler.py
+++ b/neo/admin/handler.py
@@ -14,19 +14,19 @@
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.

-from neo.lib import logging, protocol
+from neo.lib import logging
 from neo.lib.handler import EventHandler
-from neo.lib.protocol import uuid_str, Packets
+from neo.lib.protocol import uuid_str, \
+    NodeTypes, NotReadyError, Packets, ProtocolError
 from neo.lib.pt import PartitionTable
 from neo.lib.exception import PrimaryFailure

+NOT_CONNECTED_MESSAGE = 'Not connected to a primary master.'
+
 def AdminEventHandlerType(name, bases, d):
-    def check_primary_master(func):
-        def wrapper(self, *args, **kw):
-            if self.app.master_conn is not None:
-                return func(self, *args, **kw)
-            raise protocol.NotReadyError('Not connected to a primary master.')
-        return wrapper
+    def check_connection(func):
+        return lambda self, conn, *args, **kw: \
+            self._checkConnection(conn) and func(self, conn, *args, **kw)

    def forward_ask(klass):
        return lambda self, conn, *args: self.app.master_conn.ask(
@@ -47,7 +47,7 @@ def AdminEventHandlerType(name, bases, d):
            Packets.TweakPartitionTable,
        ):
        d[x.handler_method_name] = forward_ask(x)
-    return type(name, bases, {k: v if k[0] == '_' else check_primary_master(v)
+    return type(name, bases, {k: v if k[0] == '_' else check_connection(v)
                              for k, v in d.iteritems()})

 class AdminEventHandler(EventHandler):
@@ -55,6 +55,26 @@ class AdminEventHandler(EventHandler):

    __metaclass__ = AdminEventHandlerType

+    def _checkConnection(self, conn):
+        if self.app.master_conn is None:
+            raise NotReadyError(NOT_CONNECTED_MESSAGE)
+        return True
+
+    def requestIdentification(self, conn, node_type, uuid, address, name, *_):
+        if node_type != NodeTypes.ADMIN:
+            raise ProtocolError("reject non-admin node")
+        app = self.app
+        try:
+            backup = app.backup_dict[name]
+        except KeyError:
+            raise ProtocolError("unknown backup cluster %r" % name)
+        if backup.conn is not None:
+            raise ProtocolError("already connected")
+        backup.conn = conn
+        conn.setHandler(app.backup_handler)
+        conn.answer(Packets.AcceptIdentification(
+            NodeTypes.ADMIN, None, None))
+
    def askPartitionList(self, conn, min_offset, max_offset, uuid):
        logging.info("ask partition list from %s to %s for %s",
                     min_offset, max_offset, uuid_str(uuid))
@@ -83,6 +103,9 @@ class AdminEventHandler(EventHandler):
        self.app.master_conn.send(Packets.FlushLog())
        super(AdminEventHandler, self).flushLog(conn)

+    def askMonitorInformation(self, conn):
+        self.app.askMonitorInformation(conn)
+

 class MasterEventHandler(EventHandler):
    """ This class is just used to dispatch message to right handler"""
@@ -104,13 +127,93 @@ class MasterEventHandler(EventHandler):
            forward.send(packet, kw['msg_id'])

    def answerClusterState(self, conn, state):
-        self.app.cluster_state = state
+        self.app.updateMonitorInformation(None, cluster_state=state)

    notifyClusterInformation = answerClusterState

    def sendPartitionTable(self, conn, ptid, num_replicas, row_list):
-        pt = self.app.pt = object.__new__(PartitionTable)
-        pt.load(ptid, num_replicas, row_list, self.app.nm)
+        app = self.app
+        app.pt = object.__new__(PartitionTable)
+        app.pt.load(ptid, num_replicas, row_list, app.nm)
+        app.partitionTableUpdated()

    def notifyPartitionChanges(self, conn, ptid, num_replicas, cell_list):
-        self.app.pt.update(ptid, num_replicas, cell_list, self.app.nm)
+        app = self.app
+        app.pt.update(ptid, num_replicas, cell_list, app.nm)
+        app.partitionTableUpdated()
+
+    def notifyNodeInformation(self, *args):
+        super(MasterEventHandler, self).notifyNodeInformation(*args)
+        self.app.partitionTableUpdated()
+
+    def notifyUpstreamAdmin(self, conn, addr):
+        app = self.app
+        node = app.upstream_admin
+        if node is None:
+            node = app.upstream_admin = app.nm.createAdmin()
+        elif node.getAddress() == addr:
+            return
+        node.setAddress(addr)
+        if app.upstream_admin_conn:
+            app.upstream_admin_conn.close()
+        else:
+            app.connectToUpstreamAdmin()
+
+    def answerLastTransaction(self, conn, ltid):
+        app = self.app
+        app.ltid = ltid
+        app.maybeNotify(None)
+
+    def answerRecovery(self, name, ptid, backup_tid, truncate_tid):
+        self.app.backup_tid = backup_tid
+
+def monitor(func):
+    def wrapper(self, conn, *args, **kw):
+        for name, backup in self.app.backup_dict.iteritems():
+            if backup.conn is conn:
+                return func(self, name, *args, **kw)
+        raise AssertionError
+    return wrapper
+
+class BackupHandler(EventHandler):
+
+    @monitor
+    def connectionClosed(self, name):
+        app = self.app
+        app.backup_dict[name] = app.backup_dict[name].__class__()
+        app.maybeNotify(name)
+
+    @monitor
+    def notifyMonitorInformation(self, name, info):
+        self.app.updateMonitorInformation(name, **info)
+
+    @monitor
+    def answerRecovery(self, name, ptid, backup_tid, truncate_tid):
+        self.app.backup_dict[name].backup_tid = backup_tid
+
+    @monitor
+    def answerLastTransaction(self, name, ltid):
+        app = self.app
+        app.backup_dict[name].ltid = ltid
+        app.maybeNotify(name)
+
+
+class UpstreamAdminHandler(AdminEventHandler):
+
+    def _checkConnection(self, conn):
+        assert conn is self.app.upstream_admin_conn
+        return super(UpstreamAdminHandler, self)._checkConnection(conn)
+
+    def connectionClosed(self, conn):
+        app = self.app
+        if conn is app.upstream_admin_conn:
+            app.connectToUpstreamAdmin()
+
+    connectionFailed = connectionClosed
+
+    def _acceptIdentification(self, node):
+        node.send(Packets.NotifyMonitorInformation({
+            'cluster_state': self.app.cluster_state,
+            'down': self.app.down,
+            'pt_summary': self.app.pt_summary,
+            }))
--- a/neo/lib/config.py
+++ b/neo/lib/config.py
@@ -18,6 +18,15 @@ import argparse, os, sys
 from functools import wraps
 from ConfigParser import SafeConfigParser

+class _DefaultList(list):
+    """
+    Special list type for default values of 'append' argparse actions,
+    so that the parser restarts from an empty list when the option is
+    used on the command-line.
+    """
+
+    def __copy__(self):
+        return []

 class _Required(object):

@@ -30,6 +39,8 @@ class _Required(object):

 class _Option(object):

+    multiple = False
+
    def __init__(self, *args, **kw):
        if len(args) > 1:
            self.short, self.name = args
@@ -51,7 +62,12 @@ class _Option(object):
            action.required = _Required(option_list, self.name)

    def fromConfigFile(self, cfg, section):
-        return self(cfg.get(section, self.name.replace('-', '_')))
+        value = cfg.get(section, self.name.replace('-', '_'))
+        if self.multiple:
+            return [self(value)
+                for value in value.splitlines()
+                if value]
+        return self(value)

    @staticmethod
    def parse(value):
@@ -81,6 +97,11 @@ class Option(_Option):
                kw[x] = getattr(self, x)
            except AttributeError:
                pass
+        if self.multiple:
+            kw['action'] = 'append'
+            default = kw.get('default')
+            if default:
+              kw['default'] = _DefaultList(default)
        return kw

    @staticmethod
@@ -132,9 +153,6 @@ class OptionGroup(object):

 class Argument(Option):

-    def __init__(self, name, **kw):
-        super(Argument, self).__init__(name, **kw)
-
    def _asArgparse(self, parser, option_list):
        kw = {'help': self.help, 'type': self}
        for x in 'default', 'metavar', 'nargs', 'choices':

--- a/neo/lib/protocol.py
+++ b/neo/lib/protocol.py
@@ -826,6 +826,18 @@ class Packets(dict):
        :nodes: ctl -> A -> M -> *
        """)

+    AskMonitorInformation, AnswerMonitorInformation = request("""
+        :nodes: ctl -> A
+        """)
+
+    NotifyMonitorInformation = notify("""
+        :nodes: A -> A
+        """)
+
+    NotifyUpstreamAdmin = notify("""
+        :nodes: M -> A
+        """)
+
    del notify, request



--- a/neo/lib/util.py
+++ b/neo/lib/util.py
@@ -39,7 +39,8 @@ nextafter()

 TID_LOW_OVERFLOW = 2**32
 TID_LOW_MAX = TID_LOW_OVERFLOW - 1
-SECOND_PER_TID_LOW = 60.0 / TID_LOW_OVERFLOW
+SECOND_FROM_UINT32 = 60. / TID_LOW_OVERFLOW
+MICRO_FROM_UINT32 = 1e6 / TID_LOW_OVERFLOW
 TID_CHUNK_RULES = (
    (-1900, 0),
    (-1, 12),
@@ -52,7 +53,7 @@ def tidFromTime(tm):
    gmt = gmtime(tm)
    return packTID(
        (gmt.tm_year, gmt.tm_mon, gmt.tm_mday, gmt.tm_hour, gmt.tm_min),
-        int((gmt.tm_sec + (tm - int(tm))) / SECOND_PER_TID_LOW))
+        int((gmt.tm_sec + (tm - int(tm))) / SECOND_FROM_UINT32))

 def packTID(higher, lower):
    """
@@ -95,15 +96,10 @@ def unpackTID(ptid):
    higher.reverse()
    return (tuple(higher), lower)

-def timeStringFromTID(ptid):
-    """
-    Return a string in the format "yyyy-mm-dd hh:mm:ss.ssssss" from a TID
-    """
-    higher, lower = unpackTID(ptid)
-    seconds = lower * SECOND_PER_TID_LOW
-
-    return '%04d-%02d-%02d %02d:%02d:%09.6f' % (higher[0], higher[1], higher[2],
-                                                higher[3], higher[4], seconds)
+def datetimeFromTID(tid):
+    higher, lower = unpackTID(tid)
+    seconds, lower = divmod(lower * 60, TID_LOW_OVERFLOW)
+    return datetime(*(higher + (seconds, int(lower * MICRO_FROM_UINT32))))

 def addTID(ptid, offset):
    """

--- a/neo/master/app.py
+++ b/neo/master/app.py
@@ -182,12 +182,15 @@ class Application(BaseApplication):
            self.playPrimaryRole()
            self.playSecondaryRole()

-    def getNodeInformationDict(self, node_list):
+    def getNodeInformationGetter(self, node_list):
        node_dict = defaultdict(list)
+        admin_dict = defaultdict(list)
        # group modified nodes by destination node type
        for node in node_list:
            node_info = node.asTuple()
            if node.isAdmin():
+                for backup in node.extra.get('backup', ()):
+                    admin_dict[backup].append(node_info)
                continue
            node_dict[NodeTypes.ADMIN].append(node_info)
            node_dict[NodeTypes.STORAGE].append(node_info)
@@ -197,18 +200,27 @@ class Application(BaseApplication):
            if node.isStorage():
                continue
            node_dict[NodeTypes.MASTER].append(node_info)
-        return node_dict
+        def getNodeListFor(node):
+            node_list = node_dict.get(node.getType())
+            if node.isClient():
+                admin_list = admin_dict.get(node.extra.get('backup'))
+                if admin_list:
+                    if node_list:
+                        return node_list + admin_list
+                    return admin_list
+            return node_list
+        return getNodeListFor

    def broadcastNodesInformation(self, node_list):
        """
          Broadcast changes for a set a nodes
          Send only one packet per connection to reduce bandwidth
        """
-        node_dict = self.getNodeInformationDict(node_list)
+        getNodeListFor = self.getNodeInformationGetter(node_list)
        now = monotonic_time()
        # send at most one non-empty notification packet per node
        for node in self.nm.getIdentifiedList():
-            node_list = node_dict.get(node.getType())
+            node_list = getNodeListFor(node)
            # We don't skip pending storage nodes because we don't send them
            # the full list of nodes when they're added, and it's also quite
            # useful to notify them about new masters.

--- a/neo/master/backup_app.py
+++ b/neo/master/backup_app.py
@@ -99,7 +99,8 @@ class BackupApplication(object):
        pt = app.pt
        while True:
            app.changeClusterState(ClusterStates.STARTING_BACKUP)
-            bootstrap = BootstrapManager(self, NodeTypes.CLIENT)
+            bootstrap = BootstrapManager(self, NodeTypes.CLIENT,
+                                         backup=app.name)
            # {offset -> node}
            self.primary_partition_dict = {}
            # [[tid]]
@@ -367,3 +368,9 @@ class BackupApplication(object):
                    uuid_str(cell.getUUID()), offset,
                    dump(tid), uuid_str(node.getUUID()))
                cell.getNode().send(p)
+
+    def notifyUpstreamAdmin(self, addr):
+        node_list = self.app.nm.getAdminList(only_identified=True)
+        if node_list:
+            min(node_list, key=lambda node: node.getUUID()).send(
+                Packets.NotifyUpstreamAdmin(addr))
--- a/neo/master/handlers/__init__.py
+++ b/neo/master/handlers/__init__.py
@@ -52,7 +52,7 @@ class MasterHandler(EventHandler):
        node_list = app.nm.getList()
        node_list.remove(node)
        node_list = ([node.asTuple()] # for id_timestamp
-            + app.getNodeInformationDict(node_list)[node.getType()])
+            + app.getNodeInformationGetter(node_list)(node))
        conn.send(Packets.NotifyNodeInformation(monotonic_time(), node_list))

    def handlerSwitched(self, conn, new):

--- a/neo/master/handlers/administration.py
+++ b/neo/master/handlers/administration.py
@@ -58,6 +58,12 @@ class AdministrationHandler(MasterHandler):
    def handlerSwitched(self, conn, new):
        assert new
        super(AdministrationHandler, self).handlerSwitched(conn, new)
+        app = self.app.backup_app
+        if app is not None:
+            for node in app.nm.getAdminList():
+                if node.isRunning():
+                    app.notifyUpstreamAdmin(node.getAddress())
+                    break

    def connectionLost(self, conn, new_state):
        node = self.app.nm.getByUUID(conn.getUUID())

--- a/neo/master/handlers/backup.py
+++ b/neo/master/handlers/backup.py
@@ -16,7 +16,7 @@

 from neo.lib.exception import PrimaryFailure
 from neo.lib.handler import EventHandler
-from neo.lib.protocol import ZERO_TID
+from neo.lib.protocol import NodeTypes, NodeStates, Packets, ZERO_TID
 from neo.lib.pt import PartitionTable

 class BackupHandler(EventHandler):
@@ -36,6 +36,13 @@ class BackupHandler(EventHandler):
    def notifyPartitionChanges(self, conn, ptid, num_replicas, cell_list):
        self.app.pt.update(ptid, num_replicas, cell_list, self.app.nm)

+    def notifyNodeInformation(self, conn, timestamp, node_list):
+        super(BackupHandler, self).notifyNodeInformation(
+            conn, timestamp, node_list)
+        for node_type, addr, _, state, _ in node_list:
+            if node_type == NodeTypes.ADMIN and state == NodeStates.RUNNING:
+                self.app.notifyUpstreamAdmin(addr)
+
    def answerLastTransaction(self, conn, tid):
        app = self.app
        prev_tid = app.app.getLastTransaction()

--- a/neo/neoctl/app.py
+++ b/neo/neoctl/app.py
@@ -14,11 +14,11 @@
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.

-import sys
+import json, sys
 from .neoctl import NeoCTL, NotReadyException
 from neo.lib.node import NodeManager
 from neo.lib.pt import PartitionTable
-from neo.lib.util import p64, u64, tidFromTime, timeStringFromTID
+from neo.lib.util import p64, u64, datetimeFromTID, tidFromTime
 from neo.lib.protocol import uuid_str, formatNodeList, \
    ClusterStates, NodeStates, NodeTypes, UUID_NAMESPACES, ZERO_TID

@@ -29,6 +29,7 @@ action_dict = {
        'node': 'getNodeList',
        'cluster': 'getClusterState',
        'primary': 'getPrimary',
+        'summary': 'getSummary',
    },
    'set': {
        'cluster': 'setClusterState',
@@ -100,12 +101,12 @@ class TerminalNeoCTL(object):
        if backup_tid:
            ltid = self.neoctl.getLastTransaction()
            r = "backup_tid = 0x%x (%s)" % (u64(backup_tid),
-                                            timeStringFromTID(backup_tid))
+                                            datetimeFromTID(backup_tid))
        else:
            loid, ltid = self.neoctl.getLastIds()
            r = "last_oid = 0x%x" % (u64(loid))
        return r + "\nlast_tid = 0x%x (%s)\nlast_ptid = %s" % \
-                                    (u64(ltid), timeStringFromTID(ltid), ptid)
+                                    (u64(ltid), datetimeFromTID(ltid), ptid)

    def getPartitionRowList(self, params):
        """
@@ -159,6 +160,21 @@ class TerminalNeoCTL(object):
        assert len(params) == 1
        return self.neoctl.setClusterState(self.asClusterState(params[0]))

+    def getSummary(self, params):
+        """
+          Get a summary of the health of this cluster and backups.
+          The first line reports severities: it is a commented json dump of
+            {severity: [backup_name | null]}
+          where severity is either "warning" or "problem"
+            and null refers to this cluster
+        """
+        assert len(params) == 0
+        warning, problem, summary = self.neoctl.getMonitorInformation()
+        return "# %s\n%s" % (json.dumps({k: v for k, v in zip(
+            ('warning', 'problem'),
+            (warning, problem),
+            ) if v}), summary)
+
    def setNumReplicas(self, params):
        """
          Set number of replicas.

--- a/neo/neoctl/handler.py
+++ b/neo/neoctl/handler.py
@@ -64,3 +64,4 @@ class CommandEventHandler(EventHandler):
    answerLastTransaction = __answer(Packets.AnswerLastTransaction)
    answerRecovery = __answer(Packets.AnswerRecovery)
    answerTweakPartitionTable = __answer(Packets.AnswerTweakPartitionTable)
+    answerMonitorInformation = __answer(Packets.AnswerMonitorInformation)
--- a/neo/neoctl/neoctl.py
+++ b/neo/neoctl/neoctl.py
@@ -216,3 +216,9 @@ class NeoCTL(BaseApplication):
        conn.send(Packets.FlushLog())
        while conn.pending():
            self.em.poll(1)
+
+    def getMonitorInformation(self):
+        response = self.__ask(Packets.AskMonitorInformation())
+        if response[0] != Packets.AnswerMonitorInformation:
+            raise RuntimeError(response)
+        return response[1:]
--- a/neo/tests/functional/testCluster.py
+++ b/neo/tests/functional/testCluster.py
@@ -14,12 +14,21 @@
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.

+from functools import partial
 import unittest
 import transaction
 from neo.lib.protocol import NodeStates
-
+from neo.neoctl.app import TerminalNeoCTL
 from . import NEOCluster, NEOFunctionalTest

+class TerminalNeoCTL(TerminalNeoCTL):
+
+    def __init__(self, cluster):
+        self.neoctl = cluster.neoctl
+
+    def __del__(self):
+        pass
+
 class ClusterTests(NEOFunctionalTest):

    def _tearDown(self, success):
@@ -118,12 +127,20 @@ class ClusterTests(NEOFunctionalTest):
        self.neo.start()
        self.neo.expectClusterRunning()
        self.neo.expectOudatedCells(0)
+        # check neoctl cli
+        getSummary = partial(TerminalNeoCTL(self.neo).getSummary, ())
+        ok_empty = '# {}\nRUNNING;' \
+            ' UP_TO_DATE=1; ltid=0000000000000000 (1900-01-01 00:00:00)'
+        self.assertEqual(getSummary(), ok_empty)
        # connect a client a check it's known
        db, conn = self.neo.getZODBConnection()
        self.assertEqual(len(self.neo.getClientlist()), 1)
        # drop the storage, the cluster is no more operational...
        self.neo.getStorageProcessList()[0].stop()
        self.neo.expectClusterRecovering()
+        # check severity returned by the cli
+        self.assertEqual(getSummary(),
+            '# {"problem": [null]}\nRECOVERING; UP_TO_DATE=1; DOWN=1')
        # ...and the client gets disconnected
        self.assertEqual(len(self.neo.getClientlist()), 0)
        # restart storage so that the cluster is operational again
@@ -134,6 +151,9 @@ class ClusterTests(NEOFunctionalTest):
        conn.root()['plop'] = 1
        transaction.commit()
        self.assertEqual(len(self.neo.getClientlist()), 1)
+        summary = getSummary()
+        self.assertTrue(summary.startswith('# {}\nRUNNING;'), summary)
+        self.assertNotEqual(summary, ok_empty)

    def testStorageLostDuringRecovery(self):
        """

--- a/neo/tests/protocol
+++ b/neo/tests/protocol
@@ -16,6 +16,7 @@ AnswerInformationLocked(p64)
 AnswerLastIDs(?p64,?p64)
 AnswerLastTransaction(p64)
 AnswerLockedTransactions({p64:?p64})
+AnswerMonitorInformation([?bin],[?bin],bin)
 AnswerNewOIDs([p64])
 AnswerNodeList([(NodeTypes,?(bin,int),?int,NodeStates,?float)])
 AnswerObject(p64,p64,?p64,?int,bin,bin,?p64)
@@ -50,6 +51,7 @@ AskLastIDs()
 AskLastTransaction()
 AskLockInformation(p64,p64)
 AskLockedTransactions()
+AskMonitorInformation()
 AskNewOIDs(int)
 AskNodeList(NodeTypes)
 AskObject(p64,?p64,?p64)
@@ -77,6 +79,7 @@ InvalidateObjects(p64,[p64])
 NotPrimaryMaster(?int,[(bin,int)])
 NotifyClusterInformation(ClusterStates)
 NotifyDeadlock(p64,p64)
+NotifyMonitorInformation({bin:any})
 NotifyNodeInformation(float,[(NodeTypes,?(bin,int),?int,NodeStates,?float)])
 NotifyPartitionChanges(int,int,[(int,int,CellStates)])
 NotifyPartitionCorrupted(int,[int])
@@ -85,6 +88,7 @@ NotifyRepair(bool)
 NotifyReplicationDone(int,p64)
 NotifyTransactionFinished(p64,p64)
 NotifyUnlockInformation(p64)
+NotifyUpstreamAdmin((bin,int))
 Ping()
 Pong()
 Repair([int],bool)

--- a/neo/tests/threaded/__init__.py
+++ b/neo/tests/threaded/__init__.py
@@ -20,6 +20,7 @@ import os, random, select, socket, sys, tempfile
 import thread, threading, time, traceback, weakref
 from collections import deque
 from contextlib import contextmanager
+from email import message_from_string
 from itertools import count
 from functools import partial, wraps
 from zlib import decompress
@@ -301,6 +302,14 @@ class TestSerialized(Serialized):
        return self._epoll.poll(timeout)


+class FakeSMTP(list):
+
+    close = connect = lambda *_: None
+
+    def sendmail(self, *args):
+        self.append(args)
+
+
 class Node(object):

    def getConnectionList(self, *peers):
@@ -421,7 +430,11 @@ class ServerNode(Node):
        self.em.wakeup(thread.exit)

 class AdminApplication(ServerNode, neo.admin.app.Application):
-    pass
+
+    def __setattr__(self, name, value):
+        if name == 'smtp':
+            value = FakeSMTP()
+        super(AdminApplication, self).__setattr__(name, value)

 class MasterApplication(ServerNode, neo.master.app.Application):
    pass
@@ -691,6 +704,9 @@ class NEOCluster(object):
        self._resource_dict[result] = self
        return result[1]

+    def _allocateName(self, _new=lambda: random.randint(0, 100)):
+        return 'neo_%s' % self._allocate('name', _new)
+
    @staticmethod
    def _patch():
        cls = NEOCluster
@@ -717,10 +733,10 @@ class NEOCluster(object):
    def __init__(self, master_count=1, partitions=1, replicas=0, upstream=None,
                       adapter=os.getenv('NEO_TESTS_ADAPTER', 'SQLite'),
                       storage_count=None, db_list=None, clear_databases=True,
-                       compress=True,
+                       compress=True, backup_count=0,
                       importer=None, autostart=None, dedup=False, name=None):
-        self.name = name or 'neo_%s' % self._allocate('name',
-            lambda: random.randint(0, 100))
+        self.name = name or self._allocateName()
+        self.backup_list = [self._allocateName() for x in xrange(backup_count)]
        self.compress = compress
        self.num_partitions = partitions
        master_list = [MasterApplication.newAddress()
@@ -759,6 +775,9 @@ class NEOCluster(object):
        kw['wait'] = 0
        self.storage_list = [StorageApplication(database=db(x), **kw)
                             for x in db_list]
+        kw['monitor_email'] = self.name,
+        if backup_count:
+            kw['monitor_backup'] = self.backup_list
        self.admin_list = [AdminApplication(**kw)]

    def __repr__(self):
@@ -1133,6 +1152,23 @@ class NEOThreadedTest(NeoTestBase):
        ob._p_activate()
        ob._p_jar.readCurrent(ob)

+    def assertNoMonitorInformation(self, cluster):
+        self.assertFalse(cluster.admin.smtp)
+
+    def assertMonitor(self, cluster, severity, summary, *backups):
+        msg = message_from_string(cluster.admin.smtp.pop(0)[2])
+        self.assertIn(('OK', 'WARNING', 'PROBLEM')[severity], msg['subject'])
+        msg = msg.get_payload().splitlines()
+        def assertStartsWith(a, b):
+            self.assertTrue(a.startswith(b), (a, b))
+        assertStartsWith(msg.pop(0), summary)
+        expected = {k.name: v for k, v in backups}
+        while msg:
+            self.assertFalse(msg.pop(0))
+            x = expected.pop(msg.pop(0))
+            assertStartsWith(msg.pop(0), '    %s' % x)
+        self.assertFalse(expected)
+

 class ThreadId(list):


--- a/neo/tests/threaded/testReplication.py
+++ b/neo/tests/threaded/testReplication.py
+# -*- coding: utf-8 -*-
 #
 # Copyright (C) 2012-2019  Nexedi SA
 #
@@ -41,10 +42,14 @@ from .test import PCounter, PCounterWithResolution # XXX
 def backup_test(partitions=1, upstream_kw={}, backup_kw={}):
    def decorator(wrapped):
        def wrapper(self):
-            with NEOCluster(partitions=partitions, **upstream_kw) as upstream:
+            with NEOCluster(partitions=partitions, backup_count=1,
+                            **upstream_kw) as upstream:
                upstream.start()
+                name, = upstream.backup_list
                with NEOCluster(partitions=partitions, upstream=upstream,
-                                **backup_kw) as backup:
+                                name=name, **backup_kw) as backup:
+                    self.assertMonitor(upstream, 2, 'RECOVERING',
+                                       (backup, None))
                    backup.start()
                    backup.neoctl.setClusterState(ClusterStates.STARTING_BACKUP)
                    self.tic()
@@ -321,6 +326,10 @@ class ReplicationTests(NEOThreadedTest):
            delay = f.delayNotifyUnlockInformation()
            t1.commit()
            self.tic()
+            warning, problem, msg = upstream.neoctl.getMonitorInformation()
+            self.assertEqual(warning, (backup.name,))
+            self.assertFalse(problem)
+            self.assertTrue(msg.endswith('lag=ε'), msg)
            def storeObject(orig, *args, **kw):
                p.revert()
                f.remove(delay)
@@ -331,6 +340,10 @@ class ReplicationTests(NEOThreadedTest):
        t1.begin()
        self.assertEqual(5, ob.value)
        self.assertEqual(1, self.checkBackup(backup))
+        warning, problem, msg = upstream.neoctl.getMonitorInformation()
+        self.assertFalse(warning)
+        self.assertFalse(problem)
+        self.assertTrue(msg.endswith('lag=0.0'), msg)

    @with_cluster()
    def testBackupEarlyInvalidation(self, upstream):
@@ -761,6 +774,22 @@ class ReplicationTests(NEOThreadedTest):
    @backup_test(2, backup_kw=dict(replicas=1))
    def testResumingBackupReplication(self, backup):
        upstream = backup.upstream
+        for monitor in 'RECOVERING', 'VERIFYING', 'RUNNING':
+            monitor += '; UP_TO_DATE=2'
+            self.assertMonitor(upstream, 2, monitor, (backup, None))
+        self.assertMonitor(upstream, 0, monitor,
+                           (backup, 'BACKINGUP; UP_TO_DATE=4;'))
+        def checkMonitor():
+            self.assertMonitor(upstream, 2, monitor,
+                (backup, 'BACKINGUP; OUT_OF_DATE=2, UP_TO_DATE=2; DOWN=1;'))
+            self.assertNoMonitorInformation(upstream)
+            warning, problem, _ = upstream.neoctl.getMonitorInformation()
+            self.assertFalse(warning)
+            self.assertEqual(problem, (backup.name,))
+            warning, problem, _ = backup.neoctl.getMonitorInformation()
+            self.assertFalse(warning)
+            self.assertEqual(problem, (None,))
+
        t, c = upstream.getTransaction()
        r = c.root()
        r[1] = PCounter()
@@ -789,11 +818,18 @@ class ReplicationTests(NEOThreadedTest):
                return x.pop(conn.getUUID(), 1)
            newTransaction()
            self.assertEqual(getBackupTid(), tids[1])
+            self.assertNoMonitorInformation(upstream)
            primary.stop()
            backup.join((primary,))
            primary.resetNode()
+            checkMonitor()
            primary.start()
            self.tic()
+            self.assertMonitor(upstream, 1, monitor,
+                (backup, 'BACKINGUP; OUT_OF_DATE=2, UP_TO_DATE=2; ltid='))
+            warning, problem, _ = backup.neoctl.getMonitorInformation()
+            self.assertEqual(warning, (None,))
+            self.assertFalse(problem)
            primary, slave = slave, primary
            self.assertEqual(tids, getTIDList(slave))
            self.assertEqual(tids[:1], getTIDList(primary))
@@ -803,6 +839,11 @@ class ReplicationTests(NEOThreadedTest):
        self.assertEqual(4, self.checkBackup(backup))
        self.assertEqual(getBackupTid(min), tids[1])

+        self.assertMonitor(upstream, 1, monitor,
+            (backup, 'BACKINGUP; OUT_OF_DATE=1, UP_TO_DATE=3; ltid='))
+        self.assertMonitor(upstream, 0, monitor,
+                           (backup, 'BACKINGUP; UP_TO_DATE=4;'))
+
        # Check that replication resumes from the maximum possible tid
        # (for UP_TO_DATE cells of a backup cluster). More precisely:
        # - cells are handled independently (done here by blocking replication
@@ -811,6 +852,7 @@ class ReplicationTests(NEOThreadedTest):
        #   we interrupt replication of obj in the middle of a transaction)
        slave.stop()
        backup.join((slave,))
+        checkMonitor()
        ask = []
        def delayReplicate(conn, packet):
            if isinstance(packet, Packets.AskFetchObjects):
@@ -820,16 +862,28 @@ class ReplicationTests(NEOThreadedTest):
                return
            ask.append(packet._args)
        conn, = upstream.master.getConnectionList(backup.master)
+        admins = upstream.admin, backup.admin
        with ConnectionFilter() as f, Patch(replicator.Replicator,
                _nextPartitionSortKey=lambda orig, self, offset: offset):
            f.add(delayReplicate)
-            delayReconnect = f.delayAskLastTransaction()
+            delayReconnect = f.delayAskLastTransaction(lambda conn:
+                self.getConnectionApp(conn) not in admins)
+            # Without the following delay, the upstream admin may be notified
+            # that the backup is back in BACKINGUP state before getting the
+            # last tid (from the upstream master); note that in such case,
+            # we would have 2 consecutive identical notifications.
+            delayMonitor = f.delayNotifyMonitorInformation(
+                lambda _, x=iter((0,)): next(x, 1))
            conn.close()
            newTransaction()
+            self.assertMonitor(upstream, 2, monitor, (backup,
+                'STARTING_BACKUP; OUT_OF_DATE=2, UP_TO_DATE=2; DOWN=1'))
+            f.remove(delayMonitor)
            newTransaction()
+            checkMonitor()
            newTransaction()
            self.assertFalse(ask)
-            self.assertEqual(f.filtered_count, 1)
+            self.assertEqual(f.filtered_count, 2)
            with Patch(replicator, FETCH_COUNT=1):
                f.remove(delayReconnect)
                self.tic()
@@ -859,6 +913,7 @@ class ReplicationTests(NEOThreadedTest):
            ])
        self.tic()
        self.assertEqual(2, self.checkBackup(backup))
+        checkMonitor()

    @with_cluster(start_cluster=0, replicas=1)
    def testStoppingDuringReplication(self, cluster):

--- a/tools/stress
+++ b/tools/stress
@@ -17,7 +17,7 @@ from neo.lib.connector import SocketConnector
 from neo.lib.debug import PdbSocket
 from neo.lib.node import Node
 from neo.lib.protocol import NodeTypes
-from neo.lib.util import timeStringFromTID, p64, u64
+from neo.lib.util import datetimeFromTID, p64, u64
 from neo.storage.app import DATABASE_MANAGER_DICT, \
    Application as StorageApplication
 from neo.tests import getTempDirectory, mysql_pool
@@ -533,7 +533,7 @@ class Application(StressApplication):
        ltid = self.ltid
        stdscr.addstr(y, 0,
            'last oid: 0x%x\nlast tid: 0x%x (%s)\nclients: '
-            % (u64(self.loid), u64(ltid), timeStringFromTID(ltid)))
+            % (u64(self.loid), u64(ltid), datetimeFromTID(ltid)))
        before = after = 0
        for i, p in enumerate(self.cluster.process_dict[Client]):
            if i:
@@ -708,7 +708,7 @@ def main():
                        ok = tid
                    finally:
                        conn.close()
-                print('bad: 0x%x (%s)' % (u64(bad), timeStringFromTID(bad)))
+                print('bad: 0x%x (%s)' % (u64(bad), datetimeFromTID(bad)))
        finally:
            db.close()
    finally: