Commit a046d57d authored by Ursula Braun's avatar Ursula Braun Committed by David S. Miller

smc: CLC handshake (incl. preparation steps)

* CLC (Connection Layer Control) handshake
Signed-off-by: default avatarUrsula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parent 6812baab
obj-$(CONFIG_SMC) += smc.o obj-$(CONFIG_SMC) += smc.o
smc-y := af_smc.o smc_pnet.o smc_ib.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o
...@@ -6,6 +6,13 @@ ...@@ -6,6 +6,13 @@
* offers an alternative communication option for TCP-protocol sockets * offers an alternative communication option for TCP-protocol sockets
* applicable with RoCE-cards only * applicable with RoCE-cards only
* *
* Initial restrictions:
* - non-blocking connect postponed
* - IPv6 support postponed
* - support for alternate links postponed
* - partial support for non-blocking sockets only
* - support for urgent data postponed
*
* Copyright IBM Corp. 2016 * Copyright IBM Corp. 2016
* *
* Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
...@@ -17,12 +24,18 @@ ...@@ -17,12 +24,18 @@
#include <linux/module.h> #include <linux/module.h>
#include <linux/socket.h> #include <linux/socket.h>
#include <linux/inetdevice.h>
#include <linux/workqueue.h>
#include <net/sock.h> #include <net/sock.h>
#include <net/tcp.h>
#include "smc.h" #include "smc.h"
#include "smc_clc.h"
#include "smc_ib.h" #include "smc_ib.h"
#include "smc_pnet.h" #include "smc_pnet.h"
static void smc_tcp_listen_work(struct work_struct *);
static void smc_set_keepalive(struct sock *sk, int val) static void smc_set_keepalive(struct sock *sk, int val)
{ {
struct smc_sock *smc = smc_sk(sk); struct smc_sock *smc = smc_sk(sk);
...@@ -88,9 +101,11 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock) ...@@ -88,9 +101,11 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock)
sk->sk_state = SMC_INIT; sk->sk_state = SMC_INIT;
sk->sk_destruct = smc_destruct; sk->sk_destruct = smc_destruct;
sk->sk_protocol = SMCPROTO_SMC; sk->sk_protocol = SMCPROTO_SMC;
sk_refcnt_debug_inc(sk);
smc = smc_sk(sk); smc = smc_sk(sk);
INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
INIT_LIST_HEAD(&smc->accept_q);
spin_lock_init(&smc->accept_q_lock);
sk_refcnt_debug_inc(sk);
return sk; return sk;
} }
...@@ -184,6 +199,119 @@ static void smc_copy_sock_settings_to_smc(struct smc_sock *smc) ...@@ -184,6 +199,119 @@ static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC); smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
} }
/* determine subnet and mask of internal TCP socket */
int smc_netinfo_by_tcpsk(struct socket *clcsock,
__be32 *subnet, u8 *prefix_len)
{
struct dst_entry *dst = sk_dst_get(clcsock->sk);
struct sockaddr_in addr;
int rc = -ENOENT;
int len;
if (!dst) {
rc = -ENOTCONN;
goto out;
}
if (!dst->dev) {
rc = -ENODEV;
goto out_rel;
}
/* get address to which the internal TCP socket is bound */
kernel_getsockname(clcsock, (struct sockaddr *)&addr, &len);
/* analyze IPv4 specific data of net_device belonging to TCP socket */
for_ifa(dst->dev->ip_ptr) {
if (ifa->ifa_address != addr.sin_addr.s_addr)
continue;
*prefix_len = inet_mask_len(ifa->ifa_mask);
*subnet = ifa->ifa_address & ifa->ifa_mask;
rc = 0;
break;
} endfor_ifa(dst->dev->ip_ptr);
out_rel:
dst_release(dst);
out:
return rc;
}
/* setup for RDMA connection of client */
static int smc_connect_rdma(struct smc_sock *smc)
{
struct smc_clc_msg_accept_confirm aclc;
struct smc_ib_device *smcibdev;
int reason_code = 0;
int rc = 0;
u8 ibport;
/* IPSec connections opt out of SMC-R optimizations */
if (using_ipsec(smc)) {
reason_code = SMC_CLC_DECL_IPSEC;
goto decline_rdma;
}
/* PNET table look up: search active ib_device and port
* within same PNETID that also contains the ethernet device
* used for the internal TCP socket
*/
smc_pnet_find_roce_resource(smc->clcsock->sk, &smcibdev, &ibport);
if (!smcibdev) {
reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
goto decline_rdma;
}
/* do inband token exchange */
reason_code = smc_clc_send_proposal(smc, smcibdev, ibport);
if (reason_code < 0) {
rc = reason_code;
goto out_err;
}
if (reason_code > 0) /* configuration error */
goto decline_rdma;
/* receive SMC Accept CLC message */
reason_code = smc_clc_wait_msg(smc, &aclc, sizeof(aclc),
SMC_CLC_ACCEPT);
if (reason_code < 0) {
rc = reason_code;
goto out_err;
}
if (reason_code > 0)
goto decline_rdma;
/* tbd in follow-on patch: more steps to setup RDMA communcication,
* create connection, link group, link
*/
/* tbd in follow-on patch: more steps to setup RDMA communcication,
* create rmbs, map rmbs, rtoken_handling, modify_qp
*/
rc = smc_clc_send_confirm(smc);
if (rc)
goto out_err;
/* tbd in follow-on patch: llc_confirm */
out_connected:
smc_copy_sock_settings_to_clc(smc);
smc->sk.sk_state = SMC_ACTIVE;
return rc;
decline_rdma:
/* RDMA setup failed, switch back to TCP */
smc->use_fallback = true;
if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
rc = smc_clc_send_decline(smc, reason_code, 0);
if (rc < sizeof(struct smc_clc_msg_decline))
goto out_err;
}
goto out_connected;
out_err:
return rc;
}
static int smc_connect(struct socket *sock, struct sockaddr *addr, static int smc_connect(struct socket *sock, struct sockaddr *addr,
int alen, int flags) int alen, int flags)
{ {
...@@ -198,6 +326,7 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr, ...@@ -198,6 +326,7 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr,
goto out_err; goto out_err;
if (addr->sa_family != AF_INET) if (addr->sa_family != AF_INET)
goto out_err; goto out_err;
smc->addr = addr; /* needed for nonblocking connect */
lock_sock(sk); lock_sock(sk);
switch (sk->sk_state) { switch (sk->sk_state) {
...@@ -216,12 +345,12 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr, ...@@ -216,12 +345,12 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr,
if (rc) if (rc)
goto out; goto out;
sk->sk_state = SMC_ACTIVE; /* setup RDMA connection */
rc = smc_connect_rdma(smc);
/* always use TCP fallback as transport mechanism for now; if (rc < 0)
* This will change once RDMA transport is implemented goto out;
*/ else
smc->use_fallback = true; rc = 0; /* success cases including fallback */
out: out:
release_sock(sk); release_sock(sk);
...@@ -236,17 +365,32 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) ...@@ -236,17 +365,32 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
struct sock *new_sk; struct sock *new_sk;
int rc; int rc;
release_sock(&lsmc->sk);
new_sk = smc_sock_alloc(sock_net(sk), NULL); new_sk = smc_sock_alloc(sock_net(sk), NULL);
if (!new_sk) { if (!new_sk) {
rc = -ENOMEM; rc = -ENOMEM;
lsmc->sk.sk_err = ENOMEM; lsmc->sk.sk_err = ENOMEM;
*new_smc = NULL; *new_smc = NULL;
lock_sock(&lsmc->sk);
goto out; goto out;
} }
*new_smc = smc_sk(new_sk); *new_smc = smc_sk(new_sk);
rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0); rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
if (rc) { lock_sock(&lsmc->sk);
if (rc < 0) {
lsmc->sk.sk_err = -rc;
new_sk->sk_state = SMC_CLOSED;
sock_set_flag(new_sk, SOCK_DEAD);
sock_put(new_sk);
*new_smc = NULL;
goto out;
}
if (lsmc->sk.sk_state == SMC_CLOSED) {
if (new_clcsock)
sock_release(new_clcsock);
new_sk->sk_state = SMC_CLOSED;
sock_set_flag(new_sk, SOCK_DEAD);
sock_put(new_sk); sock_put(new_sk);
*new_smc = NULL; *new_smc = NULL;
goto out; goto out;
...@@ -257,6 +401,216 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) ...@@ -257,6 +401,216 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
return rc; return rc;
} }
/* add a just created sock to the accept queue of the listen sock as
* candidate for a following socket accept call from user space
*/
static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
{
struct smc_sock *par = smc_sk(parent);
sock_hold(sk);
spin_lock(&par->accept_q_lock);
list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
spin_unlock(&par->accept_q_lock);
sk_acceptq_added(parent);
}
/* remove a socket from the accept queue of its parental listening socket */
static void smc_accept_unlink(struct sock *sk)
{
struct smc_sock *par = smc_sk(sk)->listen_smc;
spin_lock(&par->accept_q_lock);
list_del_init(&smc_sk(sk)->accept_q);
spin_unlock(&par->accept_q_lock);
sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
sock_put(sk);
}
/* remove a sock from the accept queue to bind it to a new socket created
* for a socket accept call from user space
*/
static struct sock *smc_accept_dequeue(struct sock *parent,
struct socket *new_sock)
{
struct smc_sock *isk, *n;
struct sock *new_sk;
list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
new_sk = (struct sock *)isk;
smc_accept_unlink(new_sk);
if (new_sk->sk_state == SMC_CLOSED) {
/* tbd in follow-on patch: close this sock */
continue;
}
if (new_sock)
sock_graft(new_sk, new_sock);
return new_sk;
}
return NULL;
}
/* clean up for a created but never accepted sock */
static void smc_close_non_accepted(struct sock *sk)
{
struct smc_sock *smc = smc_sk(sk);
sock_hold(sk);
if (smc->clcsock) {
struct socket *tcp;
tcp = smc->clcsock;
smc->clcsock = NULL;
sock_release(tcp);
}
/* more closing stuff to be added with socket closing patch */
sock_put(sk);
}
/* setup for RDMA connection of server */
static void smc_listen_work(struct work_struct *work)
{
struct smc_sock *new_smc = container_of(work, struct smc_sock,
smc_listen_work);
struct socket *newclcsock = new_smc->clcsock;
struct smc_sock *lsmc = new_smc->listen_smc;
struct smc_clc_msg_accept_confirm cclc;
struct sock *newsmcsk = &new_smc->sk;
struct smc_clc_msg_proposal pclc;
struct smc_ib_device *smcibdev;
struct sockaddr_in peeraddr;
int reason_code = 0;
int rc = 0, len;
__be32 subnet;
u8 prefix_len;
u8 ibport;
/* do inband token exchange -
*wait for and receive SMC Proposal CLC message
*/
reason_code = smc_clc_wait_msg(new_smc, &pclc, sizeof(pclc),
SMC_CLC_PROPOSAL);
if (reason_code < 0)
goto out_err;
if (reason_code > 0)
goto decline_rdma;
/* IPSec connections opt out of SMC-R optimizations */
if (using_ipsec(new_smc)) {
reason_code = SMC_CLC_DECL_IPSEC;
goto decline_rdma;
}
/* PNET table look up: search active ib_device and port
* within same PNETID that also contains the ethernet device
* used for the internal TCP socket
*/
smc_pnet_find_roce_resource(newclcsock->sk, &smcibdev, &ibport);
if (!smcibdev) {
reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
goto decline_rdma;
}
/* determine subnet and mask from internal TCP socket */
rc = smc_netinfo_by_tcpsk(newclcsock, &subnet, &prefix_len);
if (rc) {
reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
goto decline_rdma;
}
if ((pclc.outgoing_subnet != subnet) ||
(pclc.prefix_len != prefix_len)) {
reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
goto decline_rdma;
}
/* get address of the peer connected to the internal TCP socket */
kernel_getpeername(newclcsock, (struct sockaddr *)&peeraddr, &len);
/* tbd in follow-on patch: more steps to setup RDMA communcication,
* create connection, link_group, link
*/
/* tbd in follow-on patch: more steps to setup RDMA communcication,
* create rmbs, map rmbs
*/
rc = smc_clc_send_accept(new_smc);
if (rc)
goto out_err;
/* receive SMC Confirm CLC message */
reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
SMC_CLC_CONFIRM);
if (reason_code < 0)
goto out_err;
if (reason_code > 0)
goto decline_rdma;
/* tbd in follow-on patch: more steps to setup RDMA communcication,
* rtoken_handling, modify_qp
*/
out_connected:
sk_refcnt_debug_inc(newsmcsk);
newsmcsk->sk_state = SMC_ACTIVE;
enqueue:
lock_sock(&lsmc->sk);
if (lsmc->sk.sk_state == SMC_LISTEN) {
smc_accept_enqueue(&lsmc->sk, newsmcsk);
} else { /* no longer listening */
smc_close_non_accepted(newsmcsk);
}
release_sock(&lsmc->sk);
/* Wake up accept */
lsmc->sk.sk_data_ready(&lsmc->sk);
sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
return;
decline_rdma:
/* RDMA setup failed, switch back to TCP */
new_smc->use_fallback = true;
if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
rc = smc_clc_send_decline(new_smc, reason_code, 0);
if (rc < sizeof(struct smc_clc_msg_decline))
goto out_err;
}
goto out_connected;
out_err:
newsmcsk->sk_state = SMC_CLOSED;
goto enqueue; /* queue new sock with sk_err set */
}
static void smc_tcp_listen_work(struct work_struct *work)
{
struct smc_sock *lsmc = container_of(work, struct smc_sock,
tcp_listen_work);
struct smc_sock *new_smc;
int rc = 0;
lock_sock(&lsmc->sk);
while (lsmc->sk.sk_state == SMC_LISTEN) {
rc = smc_clcsock_accept(lsmc, &new_smc);
if (rc)
goto out;
if (!new_smc)
continue;
new_smc->listen_smc = lsmc;
new_smc->use_fallback = false; /* assume rdma capability first*/
sock_hold(&lsmc->sk); /* sock_put in smc_listen_work */
INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
smc_copy_sock_settings_to_smc(new_smc);
schedule_work(&new_smc->smc_listen_work);
}
out:
release_sock(&lsmc->sk);
lsmc->sk.sk_data_ready(&lsmc->sk); /* no more listening, wake accept */
}
static int smc_listen(struct socket *sock, int backlog) static int smc_listen(struct socket *sock, int backlog)
{ {
struct sock *sk = sock->sk; struct sock *sk = sock->sk;
...@@ -286,6 +640,8 @@ static int smc_listen(struct socket *sock, int backlog) ...@@ -286,6 +640,8 @@ static int smc_listen(struct socket *sock, int backlog)
sk->sk_max_ack_backlog = backlog; sk->sk_max_ack_backlog = backlog;
sk->sk_ack_backlog = 0; sk->sk_ack_backlog = 0;
sk->sk_state = SMC_LISTEN; sk->sk_state = SMC_LISTEN;
INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
schedule_work(&smc->tcp_listen_work);
out: out:
release_sock(sk); release_sock(sk);
...@@ -295,10 +651,11 @@ static int smc_listen(struct socket *sock, int backlog) ...@@ -295,10 +651,11 @@ static int smc_listen(struct socket *sock, int backlog)
static int smc_accept(struct socket *sock, struct socket *new_sock, static int smc_accept(struct socket *sock, struct socket *new_sock,
int flags) int flags)
{ {
struct smc_sock *new_smc; struct sock *sk = sock->sk, *nsk;
struct sock *sk = sock->sk; DECLARE_WAITQUEUE(wait, current);
struct smc_sock *lsmc; struct smc_sock *lsmc;
int rc; long timeo;
int rc = 0;
lsmc = smc_sk(sk); lsmc = smc_sk(sk);
lock_sock(sk); lock_sock(sk);
...@@ -308,18 +665,30 @@ static int smc_accept(struct socket *sock, struct socket *new_sock, ...@@ -308,18 +665,30 @@ static int smc_accept(struct socket *sock, struct socket *new_sock,
goto out; goto out;
} }
rc = smc_clcsock_accept(lsmc, &new_smc); /* Wait for an incoming connection */
if (rc) timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
goto out; add_wait_queue_exclusive(sk_sleep(sk), &wait);
sock_graft(&new_smc->sk, new_sock); while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
new_smc->sk.sk_state = SMC_ACTIVE; set_current_state(TASK_INTERRUPTIBLE);
if (!timeo) {
smc_copy_sock_settings_to_smc(new_smc); rc = -EAGAIN;
break;
}
release_sock(sk);
timeo = schedule_timeout(timeo);
/* wakeup by sk_data_ready in smc_listen_work() */
sched_annotate_sleep();
lock_sock(sk);
if (signal_pending(current)) {
rc = sock_intr_errno(timeo);
break;
}
}
set_current_state(TASK_RUNNING);
remove_wait_queue(sk_sleep(sk), &wait);
/* always use TCP fallback as transport mechanism for now; if (!rc)
* This will change once RDMA transport is implemented rc = sock_error(nsk);
*/
new_smc->use_fallback = true;
out: out:
release_sock(sk); release_sock(sk);
...@@ -379,29 +748,61 @@ static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, ...@@ -379,29 +748,61 @@ static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
return rc; return rc;
} }
static unsigned int smc_accept_poll(struct sock *parent)
{
struct smc_sock *isk;
struct sock *sk;
lock_sock(parent);
list_for_each_entry(isk, &smc_sk(parent)->accept_q, accept_q) {
sk = (struct sock *)isk;
if (sk->sk_state == SMC_ACTIVE) {
release_sock(parent);
return POLLIN | POLLRDNORM;
}
}
release_sock(parent);
return 0;
}
static unsigned int smc_poll(struct file *file, struct socket *sock, static unsigned int smc_poll(struct file *file, struct socket *sock,
poll_table *wait) poll_table *wait)
{ {
struct sock *sk = sock->sk; struct sock *sk = sock->sk;
unsigned int mask = 0; unsigned int mask = 0;
struct smc_sock *smc; struct smc_sock *smc;
int rc;
smc = smc_sk(sock->sk); smc = smc_sk(sock->sk);
if ((sk->sk_state == SMC_INIT) || (sk->sk_state == SMC_LISTEN) || if ((sk->sk_state == SMC_INIT) || smc->use_fallback) {
smc->use_fallback) { /* delegate to CLC child sock */
mask = smc->clcsock->ops->poll(file, smc->clcsock, wait); mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
/* if non-blocking connect finished ... */ /* if non-blocking connect finished ... */
lock_sock(sk); lock_sock(sk);
if ((sk->sk_state == SMC_INIT) && (mask & POLLOUT)) { if ((sk->sk_state == SMC_INIT) && (mask & POLLOUT)) {
sk->sk_state = SMC_ACTIVE; sk->sk_err = smc->clcsock->sk->sk_err;
/* always use TCP fallback as transport mechanism; if (sk->sk_err) {
* This will change once RDMA transport is implemented mask |= POLLERR;
*/ } else {
smc->use_fallback = true; rc = smc_connect_rdma(smc);
if (rc < 0)
mask |= POLLERR;
else
/* success cases including fallback */
mask |= POLLOUT | POLLWRNORM;
}
} }
release_sock(sk); release_sock(sk);
} else { } else {
mask = sock_no_poll(file, sock, wait); sock_poll_wait(file, sk_sleep(sk), wait);
if (sk->sk_state == SMC_LISTEN)
/* woken up by sk_data_ready in smc_listen_work() */
mask |= smc_accept_poll(sk);
if (sk->sk_err)
mask |= POLLERR;
/* for now - to be enhanced in follow-on patch */
} }
return mask; return mask;
...@@ -568,6 +969,7 @@ static int smc_create(struct net *net, struct socket *sock, int protocol, ...@@ -568,6 +969,7 @@ static int smc_create(struct net *net, struct socket *sock, int protocol,
/* create internal TCP socket for CLC handshake and fallback */ /* create internal TCP socket for CLC handshake and fallback */
smc = smc_sk(sk); smc = smc_sk(sk);
smc->use_fallback = false; /* assume rdma capability first */
rc = sock_create_kern(net, PF_INET, SOCK_STREAM, rc = sock_create_kern(net, PF_INET, SOCK_STREAM,
IPPROTO_TCP, &smc->clcsock); IPPROTO_TCP, &smc->clcsock);
if (rc) if (rc)
......
...@@ -28,6 +28,12 @@ enum smc_state { /* possible states of an SMC socket */ ...@@ -28,6 +28,12 @@ enum smc_state { /* possible states of an SMC socket */
struct smc_sock { /* smc sock container */ struct smc_sock { /* smc sock container */
struct sock sk; struct sock sk;
struct socket *clcsock; /* internal tcp socket */ struct socket *clcsock; /* internal tcp socket */
struct sockaddr *addr; /* inet connect address */
struct smc_sock *listen_smc; /* listen parent */
struct work_struct tcp_listen_work;/* handle tcp socket accepts */
struct work_struct smc_listen_work;/* prepare new accept socket */
struct list_head accept_q; /* sockets to be accepted */
spinlock_t accept_q_lock; /* protects accept_q */
bool use_fallback; /* fallback to tcp */ bool use_fallback; /* fallback to tcp */
}; };
...@@ -40,4 +46,20 @@ static inline struct smc_sock *smc_sk(const struct sock *sk) ...@@ -40,4 +46,20 @@ static inline struct smc_sock *smc_sk(const struct sock *sk)
extern u8 local_systemid[SMC_SYSTEMID_LEN]; /* unique system identifier */ extern u8 local_systemid[SMC_SYSTEMID_LEN]; /* unique system identifier */
#ifdef CONFIG_XFRM
static inline bool using_ipsec(struct smc_sock *smc)
{
return (smc->clcsock->sk->sk_policy[0] ||
smc->clcsock->sk->sk_policy[1]) ? 1 : 0;
}
#else
static inline bool using_ipsec(struct smc_sock *smc)
{
return 0;
}
#endif
int smc_netinfo_by_tcpsk(struct socket *clcsock, __be32 *subnet,
u8 *prefix_len);
#endif /* __SMC_H */ #endif /* __SMC_H */
/*
* Shared Memory Communications over RDMA (SMC-R) and RoCE
*
* CLC (connection layer control) handshake over initial TCP socket to
* prepare for RDMA traffic
*
* Copyright IBM Corp. 2016
*
* Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
*/
#include <linux/in.h>
#include <net/sock.h>
#include <net/tcp.h>
#include "smc.h"
#include "smc_clc.h"
#include "smc_ib.h"
/* Wait for data on the tcp-socket, analyze received data
* Returns:
* 0 if success and it was not a decline that we received.
* SMC_CLC_DECL_REPLY if decline received for fallback w/o another decl send.
* clcsock error, -EINTR, -ECONNRESET, -EPROTO otherwise.
*/
int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
u8 expected_type)
{
struct sock *clc_sk = smc->clcsock->sk;
struct smc_clc_msg_hdr *clcm = buf;
struct msghdr msg = {NULL, 0};
int reason_code = 0;
struct kvec vec;
int len, datlen;
int krflags;
/* peek the first few bytes to determine length of data to receive
* so we don't consume any subsequent CLC message or payload data
* in the TCP byte stream
*/
vec.iov_base = buf;
vec.iov_len = buflen;
krflags = MSG_PEEK | MSG_WAITALL;
smc->clcsock->sk->sk_rcvtimeo = CLC_WAIT_TIME;
len = kernel_recvmsg(smc->clcsock, &msg, &vec, 1,
sizeof(struct smc_clc_msg_hdr), krflags);
if (signal_pending(current)) {
reason_code = -EINTR;
clc_sk->sk_err = EINTR;
smc->sk.sk_err = EINTR;
goto out;
}
if (clc_sk->sk_err) {
reason_code = -clc_sk->sk_err;
smc->sk.sk_err = clc_sk->sk_err;
goto out;
}
if (!len) { /* peer has performed orderly shutdown */
smc->sk.sk_err = ECONNRESET;
reason_code = -ECONNRESET;
goto out;
}
if (len < 0) {
smc->sk.sk_err = -len;
reason_code = len;
goto out;
}
datlen = ntohs(clcm->length);
if ((len < sizeof(struct smc_clc_msg_hdr)) ||
(datlen < sizeof(struct smc_clc_msg_decline)) ||
(datlen > sizeof(struct smc_clc_msg_accept_confirm)) ||
memcmp(clcm->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)) ||
((clcm->type != SMC_CLC_DECLINE) &&
(clcm->type != expected_type))) {
smc->sk.sk_err = EPROTO;
reason_code = -EPROTO;
goto out;
}
/* receive the complete CLC message */
vec.iov_base = buf;
vec.iov_len = buflen;
memset(&msg, 0, sizeof(struct msghdr));
krflags = MSG_WAITALL;
smc->clcsock->sk->sk_rcvtimeo = CLC_WAIT_TIME;
len = kernel_recvmsg(smc->clcsock, &msg, &vec, 1, datlen, krflags);
if (len < datlen) {
smc->sk.sk_err = EPROTO;
reason_code = -EPROTO;
goto out;
}
if (clcm->type == SMC_CLC_DECLINE)
reason_code = SMC_CLC_DECL_REPLY;
out:
return reason_code;
}
/* send CLC DECLINE message across internal TCP socket */
int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info,
u8 out_of_sync)
{
struct smc_clc_msg_decline dclc;
struct msghdr msg;
struct kvec vec;
int len;
memset(&dclc, 0, sizeof(dclc));
memcpy(dclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
dclc.hdr.type = SMC_CLC_DECLINE;
dclc.hdr.length = htons(sizeof(struct smc_clc_msg_decline));
dclc.hdr.version = SMC_CLC_V1;
dclc.hdr.flag = out_of_sync ? 1 : 0;
memcpy(dclc.id_for_peer, local_systemid, sizeof(local_systemid));
dclc.peer_diagnosis = htonl(peer_diag_info);
memcpy(dclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
memset(&msg, 0, sizeof(msg));
vec.iov_base = &dclc;
vec.iov_len = sizeof(struct smc_clc_msg_decline);
len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1,
sizeof(struct smc_clc_msg_decline));
if (len < sizeof(struct smc_clc_msg_decline))
smc->sk.sk_err = EPROTO;
if (len < 0)
smc->sk.sk_err = -len;
return len;
}
/* send CLC PROPOSAL message across internal TCP socket */
int smc_clc_send_proposal(struct smc_sock *smc,
struct smc_ib_device *smcibdev,
u8 ibport)
{
struct smc_clc_msg_proposal pclc;
int reason_code = 0;
struct msghdr msg;
struct kvec vec;
int len, rc;
/* send SMC Proposal CLC message */
memset(&pclc, 0, sizeof(pclc));
memcpy(pclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
pclc.hdr.type = SMC_CLC_PROPOSAL;
pclc.hdr.length = htons(sizeof(pclc));
pclc.hdr.version = SMC_CLC_V1; /* SMC version */
memcpy(pclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
memcpy(&pclc.lcl.gid, &smcibdev->gid[ibport - 1], SMC_GID_SIZE);
memcpy(&pclc.lcl.mac, &smcibdev->mac[ibport - 1],
sizeof(smcibdev->mac[ibport - 1]));
/* determine subnet and mask from internal TCP socket */
rc = smc_netinfo_by_tcpsk(smc->clcsock, &pclc.outgoing_subnet,
&pclc.prefix_len);
if (rc)
return SMC_CLC_DECL_CNFERR; /* configuration error */
memcpy(pclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
memset(&msg, 0, sizeof(msg));
vec.iov_base = &pclc;
vec.iov_len = sizeof(pclc);
/* due to the few bytes needed for clc-handshake this cannot block */
len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, sizeof(pclc));
if (len < sizeof(pclc)) {
if (len >= 0) {
reason_code = -ENETUNREACH;
smc->sk.sk_err = -reason_code;
} else {
smc->sk.sk_err = smc->clcsock->sk->sk_err;
reason_code = -smc->sk.sk_err;
}
}
return reason_code;
}
/* send CLC CONFIRM message across internal TCP socket */
int smc_clc_send_confirm(struct smc_sock *smc)
{
struct smc_clc_msg_accept_confirm cclc;
int reason_code = 0;
struct msghdr msg;
struct kvec vec;
int len;
/* send SMC Confirm CLC msg */
memset(&cclc, 0, sizeof(cclc));
memcpy(cclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
cclc.hdr.type = SMC_CLC_CONFIRM;
cclc.hdr.length = htons(sizeof(cclc));
cclc.hdr.version = SMC_CLC_V1; /* SMC version */
memcpy(cclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
/* tbd in follow-on patch: fill in link-related values */
/* tbd in follow-on patch: fill in rmb-related values */
cclc.conn_idx = 1; /* for now: 1 RMB = 1 RMBE */
memcpy(cclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
memset(&msg, 0, sizeof(msg));
vec.iov_base = &cclc;
vec.iov_len = sizeof(cclc);
len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, sizeof(cclc));
if (len < sizeof(cclc)) {
if (len >= 0) {
reason_code = -ENETUNREACH;
smc->sk.sk_err = -reason_code;
} else {
smc->sk.sk_err = smc->clcsock->sk->sk_err;
reason_code = -smc->sk.sk_err;
}
}
return reason_code;
}
/* send CLC ACCEPT message across internal TCP socket */
int smc_clc_send_accept(struct smc_sock *new_smc)
{
struct smc_clc_msg_accept_confirm aclc;
struct msghdr msg;
struct kvec vec;
int rc = 0;
int len;
memset(&aclc, 0, sizeof(aclc));
memcpy(aclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
aclc.hdr.type = SMC_CLC_ACCEPT;
aclc.hdr.length = htons(sizeof(aclc));
aclc.hdr.version = SMC_CLC_V1; /* SMC version */
memcpy(aclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
/* tbd in follow-on patch: fill in link-related values */
/* tbd in follow-on patch: fill in rmb-related values */
aclc.conn_idx = 1; /* as long as 1 RMB = 1 RMBE */
memcpy(aclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
memset(&msg, 0, sizeof(msg));
vec.iov_base = &aclc;
vec.iov_len = sizeof(aclc);
len = kernel_sendmsg(new_smc->clcsock, &msg, &vec, 1, sizeof(aclc));
if (len < sizeof(aclc)) {
if (len >= 0)
new_smc->sk.sk_err = EPROTO;
else
new_smc->sk.sk_err = new_smc->clcsock->sk->sk_err;
rc = sock_error(&new_smc->sk);
}
return rc;
}
/*
* Shared Memory Communications over RDMA (SMC-R) and RoCE
*
* CLC (connection layer control) handshake over initial TCP socket to
* prepare for RDMA traffic
*
* Copyright IBM Corp. 2016
*
* Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
*/
#ifndef _SMC_CLC_H
#define _SMC_CLC_H
#include <rdma/ib_verbs.h>
#include "smc.h"
#define SMC_CLC_PROPOSAL 0x01
#define SMC_CLC_ACCEPT 0x02
#define SMC_CLC_CONFIRM 0x03
#define SMC_CLC_DECLINE 0x04
/* eye catcher "SMCR" EBCDIC for CLC messages */
static const char SMC_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xd9'};
#define SMC_CLC_V1 0x1 /* SMC version */
#define CLC_WAIT_TIME (6 * HZ) /* max. wait time on clcsock */
#define SMC_CLC_DECL_MEM 0x01010000 /* insufficient memory resources */
#define SMC_CLC_DECL_TIMEOUT 0x02000000 /* timeout */
#define SMC_CLC_DECL_CNFERR 0x03000000 /* configuration error */
#define SMC_CLC_DECL_IPSEC 0x03030000 /* IPsec usage */
#define SMC_CLC_DECL_SYNCERR 0x04000000 /* synchronization error */
#define SMC_CLC_DECL_REPLY 0x06000000 /* reply to a received decline */
#define SMC_CLC_DECL_INTERR 0x99990000 /* internal error */
struct smc_clc_msg_hdr { /* header1 of clc messages */
u8 eyecatcher[4]; /* eye catcher */
u8 type; /* proposal / accept / confirm / decline */
__be16 length;
#if defined(__BIG_ENDIAN_BITFIELD)
u8 version : 4,
flag : 1,
rsvd : 3;
#elif defined(__LITTLE_ENDIAN_BITFIELD)
u8 rsvd : 3,
flag : 1,
version : 4;
#endif
} __packed; /* format defined in RFC7609 */
struct smc_clc_msg_trail { /* trailer of clc messages */
u8 eyecatcher[4];
};
struct smc_clc_msg_local { /* header2 of clc messages */
u8 id_for_peer[SMC_SYSTEMID_LEN]; /* unique system id */
u8 gid[16]; /* gid of ib_device port */
u8 mac[6]; /* mac of ib_device port */
};
struct smc_clc_msg_proposal { /* clc proposal message */
struct smc_clc_msg_hdr hdr;
struct smc_clc_msg_local lcl;
__be16 iparea_offset; /* offset to IP address information area */
__be32 outgoing_subnet; /* subnet mask */
u8 prefix_len; /* number of significant bits in mask */
u8 reserved[2];
u8 ipv6_prefixes_cnt; /* number of IPv6 prefixes in prefix array */
struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */
} __aligned(4);
struct smc_clc_msg_accept_confirm { /* clc accept / confirm message */
struct smc_clc_msg_hdr hdr;
struct smc_clc_msg_local lcl;
u8 qpn[3]; /* QP number */
__be32 rmb_rkey; /* RMB rkey */
u8 conn_idx; /* Connection index, which RMBE in RMB */
__be32 rmbe_alert_token;/* unique connection id */
#if defined(__BIG_ENDIAN_BITFIELD)
u8 rmbe_size : 4, /* RMBE buf size (compressed notation) */
qp_mtu : 4; /* QP mtu */
#elif defined(__LITTLE_ENDIAN_BITFIELD)
u8 qp_mtu : 4,
rmbe_size : 4;
#endif
u8 reserved;
__be64 rmb_dma_addr; /* RMB virtual address */
u8 reserved2;
u8 psn[3]; /* initial packet sequence number */
struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */
} __packed; /* format defined in RFC7609 */
struct smc_clc_msg_decline { /* clc decline message */
struct smc_clc_msg_hdr hdr;
u8 id_for_peer[SMC_SYSTEMID_LEN]; /* sender peer_id */
__be32 peer_diagnosis; /* diagnosis information */
u8 reserved2[4];
struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */
} __aligned(4);
struct smc_sock;
struct smc_ib_device;
int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
u8 expected_type);
int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info,
u8 out_of_sync);
int smc_clc_send_proposal(struct smc_sock *smc, struct smc_ib_device *smcibdev,
u8 ibport);
int smc_clc_send_confirm(struct smc_sock *smc);
int smc_clc_send_accept(struct smc_sock *smc);
#endif
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment