Commit b2da69e2 authored by Kirill Smelkov's avatar Kirill Smelkov

go/neo/neonet: Demonstrate problem in handshake with NEO/py

Levin Zimmerman discovered that sometimes NEO/py accepts our handshake
hello with encoding 'M', then replies its owh handshake ehlo with
encoding 'N' and then further terminates the connection. In other words
it looks like that the handshake went successful, but it actually did
not and NEO/py terminates the link after some time.

This manifests itself e.g. as infrequent TestLoad failures on t branch
with the following output:

=== RUN   TestLoad/py/!ssl
I: runneo.py: /tmp/neo776618506/1 !ssl: started master(s): 127.0.0.1:21151
=== RUN   TestLoad/py/!ssl/enc=N(dialTryOrder=N,M)
    client_test.go:598: skip: does not excercise client redial
=== RUN   TestLoad/py/!ssl/enc=N(dialTryOrder=M,N)
    xtesting.go:330: load 0285cbac258bf266:0000000000000000: returned err unexpected:
        have: neo://127.0.0.1:21151/1: load 0285cbac258bf266:0000000000000000: dial S1: dial 127.0.0.1:40345 (STORAGE): 127.0.0.1:56678 - 127.0.0.1:40345: request identification: 127.0.0.1:56678 - 127.0.0.1:40345 .1: recv: EOF
        want: nil
    xtesting.go:330: load 0285cbac258bf266:0000000000000000: returned tid unexpected: 0000000000000000  ; want: 0285cbac258bf266
    xtesting.go:330: load 0285cbac258bf266:0000000000000000: returned buf = nil
    xtesting.go:339: load 0285cbac258bf265:0000000000000000: returned err unexpected:
        have: neo://127.0.0.1:21151/1: load 0285cbac258bf265:0000000000000000: dial S1: dial 127.0.0.1:40345 (STORAGE): 127.0.0.1:56688 - 127.0.0.1:40345: request identification: 127.0.0.1:56688 - 127.0.0.1:40345 .1: recv: EOF
        want: neo://127.0.0.1:21151/1: load 0285cbac258bf265:0000000000000000: 0000000000000000: object was not yet created
    ...
    client_test.go:588: NEO log tail:

        log file 'storage_0.log' tail:

        2023-07-17 17:21:57.1519 DEBUG     S1         connection completed for <ServerConnection(nid=None, address=127.0.0.1:51230, handler=IdentificationHandler, fd=20, server) at 7f3583fd4f50> (from 127.0.0.1:40345)
        2023-07-17 17:21:57.1537 WARNING   S1         Protocol version mismatch with <ServerConnection(nid=None, address=127.0.0.1:51230, handler=IdentificationHandler, fd=20, server) at 7f3583fd4f50>
        2023-07-17 17:21:57.1548 DEBUG     S1         connection closed for <ServerConnection(nid=None, address=127.0.0.1:51230, handler=IdentificationHandler, closed, server) at 7f3583fd4f50>
        2023-07-17 17:21:57.1551 WARNING   S1         A connection was lost during identification
        2023-07-17 17:22:00.1582 DEBUG     S1         accepted a connection from 127.0.0.1:51236
        2023-07-17 17:22:00.1585 DEBUG     S1         connection completed for <ServerConnection(nid=None, address=127.0.0.1:51236, handler=IdentificationHandler, fd=20, server) at 7f3583fd4e90> (from 127.0.0.1:40345)
        2023-07-17 17:22:00.1604 WARNING   S1         Protocol version mismatch with <ServerConnection(nid=None, address=127.0.0.1:51236, handler=IdentificationHandler, fd=20, server) at 7f3583fd4e90>
        2023-07-17 17:22:00.1622 DEBUG     S1         connection closed for <ServerConnection(nid=None, address=127.0.0.1:51236, handler=IdentificationHandler, closed, server) at 7f3583fd4e90>
        2023-07-17 17:22:00.1625 WARNING   S1         A connection was lost during identification
        2023-07-17 17:22:03.1663 DEBUG     S1         accepted a connection from 127.0.0.1:51238
        2023-07-17 17:22:03.1666 DEBUG     S1         connection completed for <ServerConnection(nid=None, address=127.0.0.1:51238, handler=IdentificationHandler, fd=20, server) at 7f3583fd4d10> (from 127.0.0.1:40345)
        2023-07-17 17:22:03.1674 WARNING   S1         Protocol version mismatch with <ServerConnection(nid=None, address=127.0.0.1:51238, handler=IdentificationHandler, fd=20, server) at 7f3583fd4d10>
        2023-07-17 17:22:03.1688 DEBUG     S1         connection closed for <ServerConnection(nid=None, address=127.0.0.1:51238, handler=IdentificationHandler, closed, server) at 7f3583fd4d10>
        2023-07-17 17:22:03.1691 WARNING   S1         A connection was lost during identification
        2023-07-17 17:22:06.1714 DEBUG     S1         accepted a connection from 127.0.0.1:57072
        2023-07-17 17:22:06.1719 DEBUG     S1         connection completed for <ServerConnection(nid=None, address=127.0.0.1:57072, handler=IdentificationHandler, fd=20, server) at 7f3583fd4b50> (from 127.0.0.1:40345)
        2023-07-17 17:22:06.1727 WARNING   S1         Protocol version mismatch with <ServerConnection(nid=None, address=127.0.0.1:57072, handler=IdentificationHandler, fd=20, server) at 7f3583fd4b50>
        2023-07-17 17:22:06.1738 DEBUG     S1         connection closed for <ServerConnection(nid=None, address=127.0.0.1:57072, handler=IdentificationHandler, closed, server) at 7f3583fd4b50>
        2023-07-17 17:22:06.1738 WARNING   S1         A connection was lost during identification

        log file 'master_0.log' tail:

        2023-07-17 17:21:21.0799 PACKET    M1         #0x0012 NotifyNodeInformation          > A1 (127.0.0.1:37906)
        2023-07-17 17:21:21.0799 PACKET    M1          ! C0 | CLIENT |  | RUNNING | 2023-07-17 14:21:21.079838
        2023-07-17 17:21:21.0800 PACKET    M1         #0x0102 NotifyNodeInformation          > S1 (127.0.0.1:37918)
        2023-07-17 17:21:21.0800 PACKET    M1          ! C0 | CLIENT |  | RUNNING | 2023-07-17 14:21:21.079838
        2023-07-17 17:21:21.0801 DEBUG     M1         Handler changed on <ServerConnection(nid=None, address=127.0.0.1:37966, handler=ClientServiceHandler, fd=18, server) at 7f3584245910>
        2023-07-17 17:21:21.0802 PACKET    M1         #0x0001 AnswerRequestIdentification    > C0 (127.0.0.1:37966)
        2023-07-17 17:21:21.0804 PACKET    M1         #0x0000 NotifyNodeInformation          > C0 (127.0.0.1:37966)
        2023-07-17 17:21:21.0804 PACKET    M1          ! C0 | CLIENT  |                 | RUNNING | 2023-07-17 14:21:21.079838
        2023-07-17 17:21:21.0804 PACKET    M1          ! M1 | MASTER  | 127.0.0.1:21151 | RUNNING | None
        2023-07-17 17:21:21.0804 PACKET    M1          ! S1 | STORAGE | 127.0.0.1:40345 | RUNNING | 2023-07-17 14:21:18.737469
        2023-07-17 17:21:21.0805 PACKET    M1         #0x0002 NotifyPartitionTable           > C0 (127.0.0.1:37966)
        2023-07-17 17:21:21.0810 PACKET    M1         #0x0003 LastTransaction                < C0 (127.0.0.1:37966)
        2023-07-17 17:21:21.0811 PACKET    M1         #0x0003 AnswerLastTransaction          > C0 (127.0.0.1:37966)
        2023-07-17 17:22:06.2053 DEBUG     M1         <SocketConnectorIPv4 at 0x7f3584252d10 fileno 18 ('127.0.0.1', 21151), opened from ('127.0.0.1', 37966)> closed in recv
        2023-07-17 17:22:06.2056 DEBUG     M1         connection closed for <ServerConnection(nid=C0, address=127.0.0.1:37966, handler=ClientServiceHandler, closed, server) at 7f3584245910>
        2023-07-17 17:22:06.2058 PACKET    M1         #0x0014 NotifyNodeInformation          > A1 (127.0.0.1:37906)
        2023-07-17 17:22:06.2058 PACKET    M1          ! C0 | CLIENT |  | UNKNOWN | 2023-07-17 14:21:21.079838
        2023-07-17 17:22:06.2059 PACKET    M1         #0x0104 NotifyNodeInformation          > S1 (127.0.0.1:37918)
        2023-07-17 17:22:06.2059 PACKET    M1          ! C0 | CLIENT |  | UNKNOWN | 2023-07-17 14:21:21.079838

The problem is due to that my analysis from e407f725 (go/neo/neonet:
Rework handshake to differentiate client and server parts) turned out to
be incorrect. Quoting that patch:

    -> Rework handshake so that client always sends its hello first, and
    only then the server side replies. This matches actual NEO/py behaviour:

    https://lab.nexedi.com/nexedi/neoppod/blob/v1.12-67-g261dd4b4/neo/lib/connector.py#L293-294

    even though the "NEO protocol" states that

    	Handshake transmissions are not ordered with respect to each other and can go in parallel.

    	( https://neo.nexedi.com/P-NEO-Protocol.Specification.2019?portal_skin=CI_slideshow#/9/2 )

    If I recall correctly that sentence was authored by me in 2018 based on
    previous understanding of should-be full symmetry in-between client and
    server.

so here "This matches actual NEO/py behaviour" was wrong: even though
https://lab.nexedi.com/nexedi/neoppod/blob/v1.12-67-g261dd4b4/neo/lib/connector.py#L293-294
indeed says that

    #       The NEO protocol is such that a client connection is always the
    #       first to send a packet, as soon as the connection is established,

in reality it is not the case as SocketConnector always queues handshake
hello upon its creation before receiving anything from remote side:
https://lab.nexedi.com/nexedi/neoppod/blob/v1.12-93-gfd87e153/neo/lib/connector.py#L77-78 .
In practice this leads to that in non-SSL case NEO/py server might be
fast enough to send its prepared hello before receiving hello from us.
Levin also explains at kirr/neo!5 (comment 187429):

    I think what happens is this:

    the NEO protocol doesn't specify in which order handshakes happen after
    initial dial. If the peer sends a handshake before receiving our
    handshake and if this peers handshake is received by us, 'DialLink'
    assumes everything is fine (no err is returned), it breaks the loop and
    returns the link. But then, very little time later, when the peer
    finally receives our handshake, this looks strange for the peer and it
    closes the connection.

    So in my understanding this should be fixed by explicitly comparing the
    encodings between our expected one and what the peer provided us. If
    encodings don't match we should retry with a new encoding in order to
    prevent the peer from closing the connection.

    For me this also explains why sometimes the tests passed and sometimes
    didn't: it depended on which node was faster ('race condition').

-> In this patch we add correspondig handshake test that demonstrates
this problem. It currently fails as

--- FAIL: TestHandshake (0.01s)
    --- FAIL: TestHandshake/enc=N (0.00s)
        newlink_test.go:154: handshake encoding mismatch: client: unexpected error:
            have: <nil>
                 "<nil>"
            want: &neonet._HandshakeError{LocalRole:1, LocalAddr:net.pipeAddr{}, RemoteAddr:net.pipeAddr{}, Err:(*neonet._EncodingMismatchError)(0xc0000a4190)}
                 "pipe - pipe: handshake (client): protocol encoding mismatch: peer = 'M'  ; our side = 'N'"
    --- FAIL: TestHandshake/enc=M (0.00s)
        newlink_test.go:154: handshake encoding mismatch: client: unexpected error:
            have: <nil>
                 "<nil>"
            want: &neonet._HandshakeError{LocalRole:1, LocalAddr:net.pipeAddr{}, RemoteAddr:net.pipeAddr{}, Err:(*neonet._EncodingMismatchError)(0xc0001a22cc)}
                 "pipe - pipe: handshake (client): protocol encoding mismatch: peer = 'N'  ; our side = 'M'"

We will fix it in the next patch.

/reported-by @levin.zimmermann
/reported-on kirr/neo!5
parent c100a0c9
...@@ -67,6 +67,12 @@ type _VersionMismatchError struct { ...@@ -67,6 +67,12 @@ type _VersionMismatchError struct {
RemoteVer uint32 RemoteVer uint32
} }
// _EncodingMismatchError is reported as cause when handshake detects that peer does not accept our encoding.
type _EncodingMismatchError struct {
LocalEnc proto.Encoding
RemoteEnc proto.Encoding
}
func (e *_HandshakeError) Error() string { func (e *_HandshakeError) Error() string {
role := "" role := ""
switch e.LocalRole { switch e.LocalRole {
...@@ -83,6 +89,9 @@ func (e *_HandshakeError) Unwrap() error { return e.Err } ...@@ -83,6 +89,9 @@ func (e *_HandshakeError) Unwrap() error { return e.Err }
func (e *_VersionMismatchError) Error() string { func (e *_VersionMismatchError) Error() string {
return fmt.Sprintf("protocol version mismatch: peer = %08x ; our side = %08x", e.RemoteVer, e.LocalVer) return fmt.Sprintf("protocol version mismatch: peer = %08x ; our side = %08x", e.RemoteVer, e.LocalVer)
} }
func (e *_EncodingMismatchError) Error() string {
return fmt.Sprintf("protocol encoding mismatch: peer = %q ; our side = %q", e.RemoteEnc, e.LocalEnc)
}
// handshakeClient implements client-side NEO protocol handshake just after raw // handshakeClient implements client-side NEO protocol handshake just after raw
// connection between 2 nodes was established. // connection between 2 nodes was established.
......
...@@ -110,9 +110,53 @@ func _TestHandshake(t *T) { ...@@ -110,9 +110,53 @@ func _TestHandshake(t *T) {
err2, estr(err2), e2ok, e2ok.Error()) err2, estr(err2), e2ok, e2ok.Error())
} }
// tx & rx problem (client) // encoding mismatch (mimic behaviour of NEO/py server who does not accept client-proposed encoding)
p1, p2 = net.Pipe() p1, p2 = net.Pipe()
var err error var err error
var srvEnc proto.Encoding
switch t.enc {
case 'N': srvEnc = 'M'
case 'M': srvEnc = 'N'
default: panic("bug")
}
wg = xsync.NewWorkGroup(bg)
gox(wg, func(ctx context.Context) {
_, _, err = _handshakeClient(ctx, p1, 1, t.enc)
})
gox(wg, func(ctx context.Context) {
wg := xsync.NewWorkGroup(ctx)
gox(wg, func(ctx context.Context) {
err := txHello("tx hello", p2, 1, srvEnc)
exc.Raiseif(err)
})
gox(wg, func(ctx context.Context) {
rxbuf := newXBufReader(p2, 1024)
_, _, err := rxHello("rx hello", rxbuf)
exc.Raiseif(err)
})
xwait(wg)
})
xwait(wg)
xclose(p1)
xclose(p2)
eok := &_HandshakeError{
LocalRole: _LinkClient,
LocalAddr: p1.LocalAddr(),
RemoteAddr: p1.RemoteAddr(),
Err: &_EncodingMismatchError{
LocalEnc: t.enc,
RemoteEnc: srvEnc,
},
}
if !reflect.DeepEqual(err, eok) {
t.Errorf("handshake encoding mismatch: client: unexpected error:\nhave: %#v\n %q\nwant: %#v\n %q",
err, estr(err), eok, eok.Error())
}
// tx & rx problem (client)
p1, p2 = net.Pipe()
wg = xsync.NewWorkGroup(bg) wg = xsync.NewWorkGroup(bg)
gox(wg, func(ctx context.Context) { gox(wg, func(ctx context.Context) {
_, _, err = _handshakeClient(ctx, p1, 1, t.enc) _, _, err = _handshakeClient(ctx, p1, 1, t.enc)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment