Commit cf0ac2b8 authored by David S. Miller

Merge branch 'for-davem' of git://oss.oracle.com/git/agrover/linux-2.6

parents f27e21a8 905d64c8
...@@ -302,6 +302,7 @@ header-y += quota.h ...@@ -302,6 +302,7 @@ header-y += quota.h
header-y += radeonfb.h header-y += radeonfb.h
header-y += random.h header-y += random.h
header-y += raw.h header-y += raw.h
header-y += rds.h
header-y += reboot.h header-y += reboot.h
header-y += reiserfs_fs.h header-y += reiserfs_fs.h
header-y += reiserfs_xattr.h header-y += reiserfs_xattr.h
......
...@@ -73,6 +73,10 @@ ...@@ -73,6 +73,10 @@
#define RDS_CMSG_RDMA_MAP 3 #define RDS_CMSG_RDMA_MAP 3
#define RDS_CMSG_RDMA_STATUS 4 #define RDS_CMSG_RDMA_STATUS 4
#define RDS_CMSG_CONG_UPDATE 5 #define RDS_CMSG_CONG_UPDATE 5
#define RDS_CMSG_ATOMIC_FADD 6
#define RDS_CMSG_ATOMIC_CSWP 7
#define RDS_CMSG_MASKED_ATOMIC_FADD 8
#define RDS_CMSG_MASKED_ATOMIC_CSWP 9
#define RDS_INFO_FIRST 10000 #define RDS_INFO_FIRST 10000
#define RDS_INFO_COUNTERS 10000 #define RDS_INFO_COUNTERS 10000
...@@ -89,9 +93,9 @@ ...@@ -89,9 +93,9 @@
#define RDS_INFO_LAST 10010 #define RDS_INFO_LAST 10010
struct rds_info_counter { struct rds_info_counter {
u_int8_t name[32]; uint8_t name[32];
u_int64_t value; uint64_t value;
} __packed; } __attribute__((packed));
#define RDS_INFO_CONNECTION_FLAG_SENDING 0x01 #define RDS_INFO_CONNECTION_FLAG_SENDING 0x01
#define RDS_INFO_CONNECTION_FLAG_CONNECTING 0x02 #define RDS_INFO_CONNECTION_FLAG_CONNECTING 0x02
...@@ -100,56 +104,48 @@ struct rds_info_counter { ...@@ -100,56 +104,48 @@ struct rds_info_counter {
#define TRANSNAMSIZ 16 #define TRANSNAMSIZ 16
struct rds_info_connection { struct rds_info_connection {
u_int64_t next_tx_seq; uint64_t next_tx_seq;
u_int64_t next_rx_seq; uint64_t next_rx_seq;
__be32 laddr; __be32 laddr;
__be32 faddr; __be32 faddr;
u_int8_t transport[TRANSNAMSIZ]; /* null term ascii */ uint8_t transport[TRANSNAMSIZ]; /* null term ascii */
u_int8_t flags; uint8_t flags;
} __packed; } __attribute__((packed));
struct rds_info_flow {
__be32 laddr;
__be32 faddr;
u_int32_t bytes;
__be16 lport;
__be16 fport;
} __packed;
#define RDS_INFO_MESSAGE_FLAG_ACK 0x01 #define RDS_INFO_MESSAGE_FLAG_ACK 0x01
#define RDS_INFO_MESSAGE_FLAG_FAST_ACK 0x02 #define RDS_INFO_MESSAGE_FLAG_FAST_ACK 0x02
struct rds_info_message { struct rds_info_message {
u_int64_t seq; uint64_t seq;
u_int32_t len; uint32_t len;
__be32 laddr; __be32 laddr;
__be32 faddr; __be32 faddr;
__be16 lport; __be16 lport;
__be16 fport; __be16 fport;
u_int8_t flags; uint8_t flags;
} __packed; } __attribute__((packed));
struct rds_info_socket { struct rds_info_socket {
u_int32_t sndbuf; uint32_t sndbuf;
__be32 bound_addr; __be32 bound_addr;
__be32 connected_addr; __be32 connected_addr;
__be16 bound_port; __be16 bound_port;
__be16 connected_port; __be16 connected_port;
u_int32_t rcvbuf; uint32_t rcvbuf;
u_int64_t inum; uint64_t inum;
} __packed; } __attribute__((packed));
struct rds_info_tcp_socket { struct rds_info_tcp_socket {
__be32 local_addr; __be32 local_addr;
__be16 local_port; __be16 local_port;
__be32 peer_addr; __be32 peer_addr;
__be16 peer_port; __be16 peer_port;
u_int64_t hdr_rem; uint64_t hdr_rem;
u_int64_t data_rem; uint64_t data_rem;
u_int32_t last_sent_nxt; uint32_t last_sent_nxt;
u_int32_t last_expected_una; uint32_t last_expected_una;
u_int32_t last_seen_una; uint32_t last_seen_una;
} __packed; } __attribute__((packed));
#define RDS_IB_GID_LEN 16 #define RDS_IB_GID_LEN 16
struct rds_info_rdma_connection { struct rds_info_rdma_connection {
...@@ -203,42 +199,69 @@ struct rds_info_rdma_connection { ...@@ -203,42 +199,69 @@ struct rds_info_rdma_connection {
* (so that the application does not have to worry about * (so that the application does not have to worry about
* alignment). * alignment).
*/ */
typedef u_int64_t rds_rdma_cookie_t; typedef uint64_t rds_rdma_cookie_t;
struct rds_iovec { struct rds_iovec {
u_int64_t addr; uint64_t addr;
u_int64_t bytes; uint64_t bytes;
}; };
struct rds_get_mr_args { struct rds_get_mr_args {
struct rds_iovec vec; struct rds_iovec vec;
u_int64_t cookie_addr; uint64_t cookie_addr;
uint64_t flags; uint64_t flags;
}; };
struct rds_get_mr_for_dest_args { struct rds_get_mr_for_dest_args {
struct sockaddr_storage dest_addr; struct sockaddr_storage dest_addr;
struct rds_iovec vec; struct rds_iovec vec;
u_int64_t cookie_addr; uint64_t cookie_addr;
uint64_t flags; uint64_t flags;
}; };
struct rds_free_mr_args { struct rds_free_mr_args {
rds_rdma_cookie_t cookie; rds_rdma_cookie_t cookie;
u_int64_t flags; uint64_t flags;
}; };
struct rds_rdma_args { struct rds_rdma_args {
rds_rdma_cookie_t cookie; rds_rdma_cookie_t cookie;
struct rds_iovec remote_vec; struct rds_iovec remote_vec;
u_int64_t local_vec_addr; uint64_t local_vec_addr;
u_int64_t nr_local; uint64_t nr_local;
u_int64_t flags; uint64_t flags;
u_int64_t user_token; uint64_t user_token;
};
struct rds_atomic_args {
rds_rdma_cookie_t cookie;
uint64_t local_addr;
uint64_t remote_addr;
union {
struct {
uint64_t compare;
uint64_t swap;
} cswp;
struct {
uint64_t add;
} fadd;
struct {
uint64_t compare;
uint64_t swap;
uint64_t compare_mask;
uint64_t swap_mask;
} m_cswp;
struct {
uint64_t add;
uint64_t nocarry_mask;
} m_fadd;
};
uint64_t flags;
uint64_t user_token;
}; };
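A minimal userspace sketch of how the new atomic control messages above might be driven, under assumptions not shown in this hunk: SOL_RDS is the RDS cmsg level from the same header, the cookie comes from an earlier RDS_GET_MR registration covering the remote word, and rds_cswp() and its arguments are hypothetical names. Whether a data payload must accompany the control message is left aside here.

#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/rds.h>

/* Hypothetical helper: compare-and-swap a remote 64-bit word over RDS. */
static int rds_cswp(int sock, struct sockaddr_in *dest,
		    rds_rdma_cookie_t cookie, uint64_t remote_addr,
		    uint64_t *old_value, uint64_t expect, uint64_t newval)
{
	struct rds_atomic_args args;
	char cbuf[CMSG_SPACE(sizeof(args))];
	struct msghdr msg;
	struct cmsghdr *cmsg;

	memset(&args, 0, sizeof(args));
	args.cookie = cookie;			/* from RDS_GET_MR (assumed) */
	args.remote_addr = remote_addr;		/* where the CAS happens */
	args.local_addr = (uint64_t)(unsigned long)old_value; /* prior value lands here */
	args.cswp.compare = expect;
	args.cswp.swap = newval;
	args.flags = RDS_RDMA_NOTIFY_ME;	/* completion via RDS_CMSG_RDMA_STATUS */
	args.user_token = 0;

	memset(cbuf, 0, sizeof(cbuf));
	memset(&msg, 0, sizeof(msg));
	msg.msg_name = dest;
	msg.msg_namelen = sizeof(*dest);
	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof(cbuf);

	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_RDS;
	cmsg->cmsg_type = RDS_CMSG_ATOMIC_CSWP;
	cmsg->cmsg_len = CMSG_LEN(sizeof(args));
	memcpy(CMSG_DATA(cmsg), &args, sizeof(args));

	return sendmsg(sock, &msg, 0);
}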
struct rds_rdma_notify { struct rds_rdma_notify {
u_int64_t user_token; uint64_t user_token;
int32_t status; int32_t status;
}; };
...@@ -257,5 +280,6 @@ struct rds_rdma_notify { ...@@ -257,5 +280,6 @@ struct rds_rdma_notify {
#define RDS_RDMA_USE_ONCE 0x0008 /* free MR after use */ #define RDS_RDMA_USE_ONCE 0x0008 /* free MR after use */
#define RDS_RDMA_DONTWAIT 0x0010 /* Don't wait in SET_BARRIER */ #define RDS_RDMA_DONTWAIT 0x0010 /* Don't wait in SET_BARRIER */
#define RDS_RDMA_NOTIFY_ME 0x0020 /* Notify when operation completes */ #define RDS_RDMA_NOTIFY_ME 0x0020 /* Notify when operation completes */
#define RDS_RDMA_SILENT 0x0040 /* Do not interrupt remote */
#endif /* IB_RDS_H */ #endif /* IB_RDS_H */
...@@ -39,7 +39,15 @@ ...@@ -39,7 +39,15 @@
#include <net/sock.h> #include <net/sock.h>
#include "rds.h" #include "rds.h"
#include "rdma.h"
char *rds_str_array(char **array, size_t elements, size_t index)
{
if ((index < elements) && array[index])
return array[index];
else
return "unknown";
}
EXPORT_SYMBOL(rds_str_array);
/* this is just used for stats gathering :/ */ /* this is just used for stats gathering :/ */
static DEFINE_SPINLOCK(rds_sock_lock); static DEFINE_SPINLOCK(rds_sock_lock);
...@@ -62,7 +70,7 @@ static int rds_release(struct socket *sock) ...@@ -62,7 +70,7 @@ static int rds_release(struct socket *sock)
struct rds_sock *rs; struct rds_sock *rs;
unsigned long flags; unsigned long flags;
if (sk == NULL) if (!sk)
goto out; goto out;
rs = rds_sk_to_rs(sk); rs = rds_sk_to_rs(sk);
...@@ -73,7 +81,15 @@ static int rds_release(struct socket *sock) ...@@ -73,7 +81,15 @@ static int rds_release(struct socket *sock)
* with the socket. */ * with the socket. */
rds_clear_recv_queue(rs); rds_clear_recv_queue(rs);
rds_cong_remove_socket(rs); rds_cong_remove_socket(rs);
/*
* the binding lookup hash uses rcu, we need to
* make sure we synchronize_rcu before we free our
* entry
*/
rds_remove_bound(rs); rds_remove_bound(rs);
synchronize_rcu();
rds_send_drop_to(rs, NULL); rds_send_drop_to(rs, NULL);
rds_rdma_drop_keys(rs); rds_rdma_drop_keys(rs);
rds_notify_queue_get(rs, NULL); rds_notify_queue_get(rs, NULL);
...@@ -83,6 +99,8 @@ static int rds_release(struct socket *sock) ...@@ -83,6 +99,8 @@ static int rds_release(struct socket *sock)
rds_sock_count--; rds_sock_count--;
spin_unlock_irqrestore(&rds_sock_lock, flags); spin_unlock_irqrestore(&rds_sock_lock, flags);
rds_trans_put(rs->rs_transport);
sock->sk = NULL; sock->sk = NULL;
sock_put(sk); sock_put(sk);
out: out:
...@@ -514,7 +532,7 @@ static void rds_sock_info(struct socket *sock, unsigned int len, ...@@ -514,7 +532,7 @@ static void rds_sock_info(struct socket *sock, unsigned int len,
spin_unlock_irqrestore(&rds_sock_lock, flags); spin_unlock_irqrestore(&rds_sock_lock, flags);
} }
static void __exit rds_exit(void) static void rds_exit(void)
{ {
sock_unregister(rds_family_ops.family); sock_unregister(rds_family_ops.family);
proto_unregister(&rds_proto); proto_unregister(&rds_proto);
...@@ -529,7 +547,7 @@ static void __exit rds_exit(void) ...@@ -529,7 +547,7 @@ static void __exit rds_exit(void)
} }
module_exit(rds_exit); module_exit(rds_exit);
static int __init rds_init(void) static int rds_init(void)
{ {
int ret; int ret;
......
...@@ -34,45 +34,52 @@ ...@@ -34,45 +34,52 @@
#include <net/sock.h> #include <net/sock.h>
#include <linux/in.h> #include <linux/in.h>
#include <linux/if_arp.h> #include <linux/if_arp.h>
#include <linux/jhash.h>
#include "rds.h" #include "rds.h"
/* #define BIND_HASH_SIZE 1024
* XXX this probably still needs more work.. no INADDR_ANY, and rbtrees aren't static struct hlist_head bind_hash_table[BIND_HASH_SIZE];
* particularly zippy.
*
* This is now called for every incoming frame so we arguably care much more
* about it than we used to.
*/
static DEFINE_SPINLOCK(rds_bind_lock); static DEFINE_SPINLOCK(rds_bind_lock);
static struct rb_root rds_bind_tree = RB_ROOT;
static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port, static struct hlist_head *hash_to_bucket(__be32 addr, __be16 port)
struct rds_sock *insert) {
return bind_hash_table + (jhash_2words((u32)addr, (u32)port, 0) &
(BIND_HASH_SIZE - 1));
}
static struct rds_sock *rds_bind_lookup(__be32 addr, __be16 port,
struct rds_sock *insert)
{ {
struct rb_node **p = &rds_bind_tree.rb_node;
struct rb_node *parent = NULL;
struct rds_sock *rs; struct rds_sock *rs;
struct hlist_node *node;
struct hlist_head *head = hash_to_bucket(addr, port);
u64 cmp; u64 cmp;
u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port); u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port);
while (*p) { rcu_read_lock();
parent = *p; hlist_for_each_entry_rcu(rs, node, head, rs_bound_node) {
rs = rb_entry(parent, struct rds_sock, rs_bound_node);
cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) | cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) |
be16_to_cpu(rs->rs_bound_port); be16_to_cpu(rs->rs_bound_port);
if (needle < cmp) if (cmp == needle) {
p = &(*p)->rb_left; rcu_read_unlock();
else if (needle > cmp)
p = &(*p)->rb_right;
else
return rs; return rs;
}
} }
rcu_read_unlock();
if (insert) { if (insert) {
rb_link_node(&insert->rs_bound_node, parent, p); /*
rb_insert_color(&insert->rs_bound_node, &rds_bind_tree); * make sure our addr and port are set before
* we are added to the list, other people
* in rcu will find us as soon as the
* hlist_add_head_rcu is done
*/
insert->rs_bound_addr = addr;
insert->rs_bound_port = port;
rds_sock_addref(insert);
hlist_add_head_rcu(&insert->rs_bound_node, head);
} }
return NULL; return NULL;
} }
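The comment in the insert path above is the whole trick: fields must be written before hlist_add_head_rcu() publishes the node. A stripped-down sketch of that same publish/lookup pattern (hypothetical "item" type, not RDS code) for reference; the write barrier lives inside hlist_add_head_rcu(), so readers that find the node see the fields written before it.

#include <linux/rculist.h>
#include <linux/types.h>

struct item {
	struct hlist_node node;
	u32 key;
};

static void publish_item(struct hlist_head *head, struct item *it, u32 key)
{
	it->key = key;				/* initialize everything readers touch ... */
	hlist_add_head_rcu(&it->node, head);	/* ... then publish (barrier is in here) */
}

static struct item *lookup_item(struct hlist_head *head, u32 key)
{
	struct item *it;
	struct hlist_node *pos;

	rcu_read_lock();
	hlist_for_each_entry_rcu(it, pos, head, node) {
		if (it->key == key) {
			rcu_read_unlock();
			return it;
		}
	}
	rcu_read_unlock();
	return NULL;
}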
...@@ -86,15 +93,13 @@ static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port, ...@@ -86,15 +93,13 @@ static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port,
struct rds_sock *rds_find_bound(__be32 addr, __be16 port) struct rds_sock *rds_find_bound(__be32 addr, __be16 port)
{ {
struct rds_sock *rs; struct rds_sock *rs;
unsigned long flags;
spin_lock_irqsave(&rds_bind_lock, flags); rs = rds_bind_lookup(addr, port, NULL);
rs = rds_bind_tree_walk(addr, port, NULL);
if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD)) if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
rds_sock_addref(rs); rds_sock_addref(rs);
else else
rs = NULL; rs = NULL;
spin_unlock_irqrestore(&rds_bind_lock, flags);
rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr, rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr,
ntohs(port)); ntohs(port));
...@@ -121,22 +126,15 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) ...@@ -121,22 +126,15 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
do { do {
if (rover == 0) if (rover == 0)
rover++; rover++;
if (rds_bind_tree_walk(addr, cpu_to_be16(rover), rs) == NULL) { if (!rds_bind_lookup(addr, cpu_to_be16(rover), rs)) {
*port = cpu_to_be16(rover); *port = rs->rs_bound_port;
ret = 0; ret = 0;
rdsdebug("rs %p binding to %pI4:%d\n",
rs, &addr, (int)ntohs(*port));
break; break;
} }
} while (rover++ != last); } while (rover++ != last);
if (ret == 0) {
rs->rs_bound_addr = addr;
rs->rs_bound_port = *port;
rds_sock_addref(rs);
rdsdebug("rs %p binding to %pI4:%d\n",
rs, &addr, (int)ntohs(*port));
}
spin_unlock_irqrestore(&rds_bind_lock, flags); spin_unlock_irqrestore(&rds_bind_lock, flags);
return ret; return ret;
...@@ -153,7 +151,7 @@ void rds_remove_bound(struct rds_sock *rs) ...@@ -153,7 +151,7 @@ void rds_remove_bound(struct rds_sock *rs)
rs, &rs->rs_bound_addr, rs, &rs->rs_bound_addr,
ntohs(rs->rs_bound_port)); ntohs(rs->rs_bound_port));
rb_erase(&rs->rs_bound_node, &rds_bind_tree); hlist_del_init_rcu(&rs->rs_bound_node);
rds_sock_put(rs); rds_sock_put(rs);
rs->rs_bound_addr = 0; rs->rs_bound_addr = 0;
} }
...@@ -184,7 +182,7 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) ...@@ -184,7 +182,7 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
goto out; goto out;
trans = rds_trans_get_preferred(sin->sin_addr.s_addr); trans = rds_trans_get_preferred(sin->sin_addr.s_addr);
if (trans == NULL) { if (!trans) {
ret = -EADDRNOTAVAIL; ret = -EADDRNOTAVAIL;
rds_remove_bound(rs); rds_remove_bound(rs);
if (printk_ratelimit()) if (printk_ratelimit())
...@@ -198,5 +196,9 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) ...@@ -198,5 +196,9 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
out: out:
release_sock(sk); release_sock(sk);
/* we might have called rds_remove_bound on error */
if (ret)
synchronize_rcu();
return ret; return ret;
} }
...@@ -141,7 +141,7 @@ static struct rds_cong_map *rds_cong_from_addr(__be32 addr) ...@@ -141,7 +141,7 @@ static struct rds_cong_map *rds_cong_from_addr(__be32 addr)
unsigned long flags; unsigned long flags;
map = kzalloc(sizeof(struct rds_cong_map), GFP_KERNEL); map = kzalloc(sizeof(struct rds_cong_map), GFP_KERNEL);
if (map == NULL) if (!map)
return NULL; return NULL;
map->m_addr = addr; map->m_addr = addr;
...@@ -159,7 +159,7 @@ static struct rds_cong_map *rds_cong_from_addr(__be32 addr) ...@@ -159,7 +159,7 @@ static struct rds_cong_map *rds_cong_from_addr(__be32 addr)
ret = rds_cong_tree_walk(addr, map); ret = rds_cong_tree_walk(addr, map);
spin_unlock_irqrestore(&rds_cong_lock, flags); spin_unlock_irqrestore(&rds_cong_lock, flags);
if (ret == NULL) { if (!ret) {
ret = map; ret = map;
map = NULL; map = NULL;
} }
...@@ -205,7 +205,7 @@ int rds_cong_get_maps(struct rds_connection *conn) ...@@ -205,7 +205,7 @@ int rds_cong_get_maps(struct rds_connection *conn)
conn->c_lcong = rds_cong_from_addr(conn->c_laddr); conn->c_lcong = rds_cong_from_addr(conn->c_laddr);
conn->c_fcong = rds_cong_from_addr(conn->c_faddr); conn->c_fcong = rds_cong_from_addr(conn->c_faddr);
if (conn->c_lcong == NULL || conn->c_fcong == NULL) if (!(conn->c_lcong && conn->c_fcong))
return -ENOMEM; return -ENOMEM;
return 0; return 0;
...@@ -221,7 +221,7 @@ void rds_cong_queue_updates(struct rds_cong_map *map) ...@@ -221,7 +221,7 @@ void rds_cong_queue_updates(struct rds_cong_map *map)
list_for_each_entry(conn, &map->m_conn_list, c_map_item) { list_for_each_entry(conn, &map->m_conn_list, c_map_item) {
if (!test_and_set_bit(0, &conn->c_map_queued)) { if (!test_and_set_bit(0, &conn->c_map_queued)) {
rds_stats_inc(s_cong_update_queued); rds_stats_inc(s_cong_update_queued);
queue_delayed_work(rds_wq, &conn->c_send_w, 0); rds_send_xmit(conn);
} }
} }
......
...@@ -37,7 +37,6 @@ ...@@ -37,7 +37,6 @@
#include "rds.h" #include "rds.h"
#include "loop.h" #include "loop.h"
#include "rdma.h"
#define RDS_CONNECTION_HASH_BITS 12 #define RDS_CONNECTION_HASH_BITS 12
#define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS) #define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS)
...@@ -63,18 +62,7 @@ static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr) ...@@ -63,18 +62,7 @@ static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr)
var |= RDS_INFO_CONNECTION_FLAG_##suffix; \ var |= RDS_INFO_CONNECTION_FLAG_##suffix; \
} while (0) } while (0)
static inline int rds_conn_is_sending(struct rds_connection *conn) /* rcu read lock must be held or the connection spinlock */
{
int ret = 0;
if (!mutex_trylock(&conn->c_send_lock))
ret = 1;
else
mutex_unlock(&conn->c_send_lock);
return ret;
}
static struct rds_connection *rds_conn_lookup(struct hlist_head *head, static struct rds_connection *rds_conn_lookup(struct hlist_head *head,
__be32 laddr, __be32 faddr, __be32 laddr, __be32 faddr,
struct rds_transport *trans) struct rds_transport *trans)
...@@ -82,7 +70,7 @@ static struct rds_connection *rds_conn_lookup(struct hlist_head *head, ...@@ -82,7 +70,7 @@ static struct rds_connection *rds_conn_lookup(struct hlist_head *head,
struct rds_connection *conn, *ret = NULL; struct rds_connection *conn, *ret = NULL;
struct hlist_node *pos; struct hlist_node *pos;
hlist_for_each_entry(conn, pos, head, c_hash_node) { hlist_for_each_entry_rcu(conn, pos, head, c_hash_node) {
if (conn->c_faddr == faddr && conn->c_laddr == laddr && if (conn->c_faddr == faddr && conn->c_laddr == laddr &&
conn->c_trans == trans) { conn->c_trans == trans) {
ret = conn; ret = conn;
...@@ -129,10 +117,11 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, ...@@ -129,10 +117,11 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
{ {
struct rds_connection *conn, *parent = NULL; struct rds_connection *conn, *parent = NULL;
struct hlist_head *head = rds_conn_bucket(laddr, faddr); struct hlist_head *head = rds_conn_bucket(laddr, faddr);
struct rds_transport *loop_trans;
unsigned long flags; unsigned long flags;
int ret; int ret;
spin_lock_irqsave(&rds_conn_lock, flags); rcu_read_lock();
conn = rds_conn_lookup(head, laddr, faddr, trans); conn = rds_conn_lookup(head, laddr, faddr, trans);
if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport && if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport &&
!is_outgoing) { !is_outgoing) {
...@@ -143,12 +132,12 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, ...@@ -143,12 +132,12 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
parent = conn; parent = conn;
conn = parent->c_passive; conn = parent->c_passive;
} }
spin_unlock_irqrestore(&rds_conn_lock, flags); rcu_read_unlock();
if (conn) if (conn)
goto out; goto out;
conn = kmem_cache_zalloc(rds_conn_slab, gfp); conn = kmem_cache_zalloc(rds_conn_slab, gfp);
if (conn == NULL) { if (!conn) {
conn = ERR_PTR(-ENOMEM); conn = ERR_PTR(-ENOMEM);
goto out; goto out;
} }
...@@ -159,7 +148,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, ...@@ -159,7 +148,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
spin_lock_init(&conn->c_lock); spin_lock_init(&conn->c_lock);
conn->c_next_tx_seq = 1; conn->c_next_tx_seq = 1;
mutex_init(&conn->c_send_lock); init_waitqueue_head(&conn->c_waitq);
INIT_LIST_HEAD(&conn->c_send_queue); INIT_LIST_HEAD(&conn->c_send_queue);
INIT_LIST_HEAD(&conn->c_retrans); INIT_LIST_HEAD(&conn->c_retrans);
...@@ -175,7 +164,9 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, ...@@ -175,7 +164,9 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
* can bind to the destination address then we'd rather the messages * can bind to the destination address then we'd rather the messages
* flow through loopback rather than either transport. * flow through loopback rather than either transport.
*/ */
if (rds_trans_get_preferred(faddr)) { loop_trans = rds_trans_get_preferred(faddr);
if (loop_trans) {
rds_trans_put(loop_trans);
conn->c_loopback = 1; conn->c_loopback = 1;
if (is_outgoing && trans->t_prefer_loopback) { if (is_outgoing && trans->t_prefer_loopback) {
/* "outgoing" connection - and the transport /* "outgoing" connection - and the transport
...@@ -238,7 +229,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, ...@@ -238,7 +229,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
kmem_cache_free(rds_conn_slab, conn); kmem_cache_free(rds_conn_slab, conn);
conn = found; conn = found;
} else { } else {
hlist_add_head(&conn->c_hash_node, head); hlist_add_head_rcu(&conn->c_hash_node, head);
rds_cong_add_conn(conn); rds_cong_add_conn(conn);
rds_conn_count++; rds_conn_count++;
} }
...@@ -263,21 +254,91 @@ struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr, ...@@ -263,21 +254,91 @@ struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
} }
EXPORT_SYMBOL_GPL(rds_conn_create_outgoing); EXPORT_SYMBOL_GPL(rds_conn_create_outgoing);
void rds_conn_shutdown(struct rds_connection *conn)
{
/* shut it down unless it's down already */
if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) {
/*
* Quiesce the connection mgmt handlers before we start tearing
* things down. We don't hold the mutex for the entire
* duration of the shutdown operation, else we may be
* deadlocking with the CM handler. Instead, the CM event
* handler is supposed to check for state DISCONNECTING
*/
mutex_lock(&conn->c_cm_lock);
if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING)
&& !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) {
rds_conn_error(conn, "shutdown called in state %d\n",
atomic_read(&conn->c_state));
mutex_unlock(&conn->c_cm_lock);
return;
}
mutex_unlock(&conn->c_cm_lock);
wait_event(conn->c_waitq,
!test_bit(RDS_IN_XMIT, &conn->c_flags));
conn->c_trans->conn_shutdown(conn);
rds_conn_reset(conn);
if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) {
/* This can happen - eg when we're in the middle of tearing
* down the connection, and someone unloads the rds module.
* Quite reproducible with loopback connections.
* Mostly harmless.
*/
rds_conn_error(conn,
"%s: failed to transition to state DOWN, "
"current state is %d\n",
__func__,
atomic_read(&conn->c_state));
return;
}
}
/* Then reconnect if it's still live.
* The passive side of an IB loopback connection is never added
* to the conn hash, so we never trigger a reconnect on this
* conn - the reconnect is always triggered by the active peer. */
cancel_delayed_work_sync(&conn->c_conn_w);
rcu_read_lock();
if (!hlist_unhashed(&conn->c_hash_node)) {
rcu_read_unlock();
rds_queue_reconnect(conn);
} else {
rcu_read_unlock();
}
}
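The shutdown path above gates every step on rds_conn_transition(), which this diff does not show. A sketch of the helper as it is assumed to look in net/rds/rds.h: a single cmpxchg decides who wins the UP -> DISCONNECTING race, so only one caller proceeds to tear the connection down.

static inline int rds_conn_transition(struct rds_connection *conn,
				      int old, int new)
{
	/* succeeds only if c_state was 'old'; a losing caller changes nothing */
	return atomic_cmpxchg(&conn->c_state, old, new) == old;
}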
/*
* Stop and free a connection.
*
* This can only be used in very limited circumstances. It assumes that once
* the conn has been shutdown that no one else is referencing the connection.
* We can only ensure this in the rmmod path in the current code.
*/
void rds_conn_destroy(struct rds_connection *conn) void rds_conn_destroy(struct rds_connection *conn)
{ {
struct rds_message *rm, *rtmp; struct rds_message *rm, *rtmp;
unsigned long flags;
rdsdebug("freeing conn %p for %pI4 -> " rdsdebug("freeing conn %p for %pI4 -> "
"%pI4\n", conn, &conn->c_laddr, "%pI4\n", conn, &conn->c_laddr,
&conn->c_faddr); &conn->c_faddr);
hlist_del_init(&conn->c_hash_node); /* Ensure conn will not be scheduled for reconnect */
spin_lock_irq(&rds_conn_lock);
hlist_del_init_rcu(&conn->c_hash_node);
spin_unlock_irq(&rds_conn_lock);
synchronize_rcu();
/* wait for the rds thread to shut it down */ /* shut the connection down */
atomic_set(&conn->c_state, RDS_CONN_ERROR); rds_conn_drop(conn);
cancel_delayed_work(&conn->c_conn_w); flush_work(&conn->c_down_w);
queue_work(rds_wq, &conn->c_down_w);
flush_workqueue(rds_wq); /* make sure lingering queued work won't try to ref the conn */
cancel_delayed_work_sync(&conn->c_send_w);
cancel_delayed_work_sync(&conn->c_recv_w);
/* tear down queued messages */ /* tear down queued messages */
list_for_each_entry_safe(rm, rtmp, list_for_each_entry_safe(rm, rtmp,
...@@ -302,7 +363,9 @@ void rds_conn_destroy(struct rds_connection *conn) ...@@ -302,7 +363,9 @@ void rds_conn_destroy(struct rds_connection *conn)
BUG_ON(!list_empty(&conn->c_retrans)); BUG_ON(!list_empty(&conn->c_retrans));
kmem_cache_free(rds_conn_slab, conn); kmem_cache_free(rds_conn_slab, conn);
spin_lock_irqsave(&rds_conn_lock, flags);
rds_conn_count--; rds_conn_count--;
spin_unlock_irqrestore(&rds_conn_lock, flags);
} }
EXPORT_SYMBOL_GPL(rds_conn_destroy); EXPORT_SYMBOL_GPL(rds_conn_destroy);
...@@ -316,23 +379,23 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len, ...@@ -316,23 +379,23 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
struct list_head *list; struct list_head *list;
struct rds_connection *conn; struct rds_connection *conn;
struct rds_message *rm; struct rds_message *rm;
unsigned long flags;
unsigned int total = 0; unsigned int total = 0;
unsigned long flags;
size_t i; size_t i;
len /= sizeof(struct rds_info_message); len /= sizeof(struct rds_info_message);
spin_lock_irqsave(&rds_conn_lock, flags); rcu_read_lock();
for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
i++, head++) { i++, head++) {
hlist_for_each_entry(conn, pos, head, c_hash_node) { hlist_for_each_entry_rcu(conn, pos, head, c_hash_node) {
if (want_send) if (want_send)
list = &conn->c_send_queue; list = &conn->c_send_queue;
else else
list = &conn->c_retrans; list = &conn->c_retrans;
spin_lock(&conn->c_lock); spin_lock_irqsave(&conn->c_lock, flags);
/* XXX too lazy to maintain counts.. */ /* XXX too lazy to maintain counts.. */
list_for_each_entry(rm, list, m_conn_item) { list_for_each_entry(rm, list, m_conn_item) {
...@@ -343,11 +406,10 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len, ...@@ -343,11 +406,10 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
conn->c_faddr, 0); conn->c_faddr, 0);
} }
spin_unlock(&conn->c_lock); spin_unlock_irqrestore(&conn->c_lock, flags);
} }
} }
rcu_read_unlock();
spin_unlock_irqrestore(&rds_conn_lock, flags);
lens->nr = total; lens->nr = total;
lens->each = sizeof(struct rds_info_message); lens->each = sizeof(struct rds_info_message);
...@@ -377,19 +439,17 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len, ...@@ -377,19 +439,17 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len,
uint64_t buffer[(item_len + 7) / 8]; uint64_t buffer[(item_len + 7) / 8];
struct hlist_head *head; struct hlist_head *head;
struct hlist_node *pos; struct hlist_node *pos;
struct hlist_node *tmp;
struct rds_connection *conn; struct rds_connection *conn;
unsigned long flags;
size_t i; size_t i;
spin_lock_irqsave(&rds_conn_lock, flags); rcu_read_lock();
lens->nr = 0; lens->nr = 0;
lens->each = item_len; lens->each = item_len;
for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
i++, head++) { i++, head++) {
hlist_for_each_entry_safe(conn, pos, tmp, head, c_hash_node) { hlist_for_each_entry_rcu(conn, pos, head, c_hash_node) {
/* XXX no c_lock usage.. */ /* XXX no c_lock usage.. */
if (!visitor(conn, buffer)) if (!visitor(conn, buffer))
...@@ -405,8 +465,7 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len, ...@@ -405,8 +465,7 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len,
lens->nr++; lens->nr++;
} }
} }
rcu_read_unlock();
spin_unlock_irqrestore(&rds_conn_lock, flags);
} }
EXPORT_SYMBOL_GPL(rds_for_each_conn_info); EXPORT_SYMBOL_GPL(rds_for_each_conn_info);
...@@ -423,8 +482,8 @@ static int rds_conn_info_visitor(struct rds_connection *conn, ...@@ -423,8 +482,8 @@ static int rds_conn_info_visitor(struct rds_connection *conn,
sizeof(cinfo->transport)); sizeof(cinfo->transport));
cinfo->flags = 0; cinfo->flags = 0;
rds_conn_info_set(cinfo->flags, rds_conn_info_set(cinfo->flags, test_bit(RDS_IN_XMIT, &conn->c_flags),
rds_conn_is_sending(conn), SENDING); SENDING);
/* XXX Future: return the state rather than these funky bits */ /* XXX Future: return the state rather than these funky bits */
rds_conn_info_set(cinfo->flags, rds_conn_info_set(cinfo->flags,
atomic_read(&conn->c_state) == RDS_CONN_CONNECTING, atomic_read(&conn->c_state) == RDS_CONN_CONNECTING,
...@@ -444,12 +503,12 @@ static void rds_conn_info(struct socket *sock, unsigned int len, ...@@ -444,12 +503,12 @@ static void rds_conn_info(struct socket *sock, unsigned int len,
sizeof(struct rds_info_connection)); sizeof(struct rds_info_connection));
} }
int __init rds_conn_init(void) int rds_conn_init(void)
{ {
rds_conn_slab = kmem_cache_create("rds_connection", rds_conn_slab = kmem_cache_create("rds_connection",
sizeof(struct rds_connection), sizeof(struct rds_connection),
0, 0, NULL); 0, 0, NULL);
if (rds_conn_slab == NULL) if (!rds_conn_slab)
return -ENOMEM; return -ENOMEM;
rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info); rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info);
...@@ -486,6 +545,18 @@ void rds_conn_drop(struct rds_connection *conn) ...@@ -486,6 +545,18 @@ void rds_conn_drop(struct rds_connection *conn)
} }
EXPORT_SYMBOL_GPL(rds_conn_drop); EXPORT_SYMBOL_GPL(rds_conn_drop);
/*
* If the connection is down, trigger a connect. We may have scheduled a
* delayed reconnect however - in this case we should not interfere.
*/
void rds_conn_connect_if_down(struct rds_connection *conn)
{
if (rds_conn_state(conn) == RDS_CONN_DOWN &&
!test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
}
EXPORT_SYMBOL_GPL(rds_conn_connect_if_down);
/* /*
* An error occurred on the connection * An error occurred on the connection
*/ */
......
...@@ -53,12 +53,71 @@ MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer"); ...@@ -53,12 +53,71 @@ MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer");
module_param(rds_ib_retry_count, int, 0444); module_param(rds_ib_retry_count, int, 0444);
MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error"); MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error");
/*
* we have a clumsy combination of RCU and a rwsem protecting this list
* because it is used both in the get_mr fast path and while blocking in
* the FMR flushing path.
*/
DECLARE_RWSEM(rds_ib_devices_lock);
struct list_head rds_ib_devices; struct list_head rds_ib_devices;
/* NOTE: if also grabbing ibdev lock, grab this first */ /* NOTE: if also grabbing ibdev lock, grab this first */
DEFINE_SPINLOCK(ib_nodev_conns_lock); DEFINE_SPINLOCK(ib_nodev_conns_lock);
LIST_HEAD(ib_nodev_conns); LIST_HEAD(ib_nodev_conns);
void rds_ib_nodev_connect(void)
{
struct rds_ib_connection *ic;
spin_lock(&ib_nodev_conns_lock);
list_for_each_entry(ic, &ib_nodev_conns, ib_node)
rds_conn_connect_if_down(ic->conn);
spin_unlock(&ib_nodev_conns_lock);
}
void rds_ib_dev_shutdown(struct rds_ib_device *rds_ibdev)
{
struct rds_ib_connection *ic;
unsigned long flags;
spin_lock_irqsave(&rds_ibdev->spinlock, flags);
list_for_each_entry(ic, &rds_ibdev->conn_list, ib_node)
rds_conn_drop(ic->conn);
spin_unlock_irqrestore(&rds_ibdev->spinlock, flags);
}
/*
* rds_ib_destroy_mr_pool() blocks on a few things and mrs drop references
* from interrupt context so we push freeing off into a work struct in krdsd.
*/
static void rds_ib_dev_free(struct work_struct *work)
{
struct rds_ib_ipaddr *i_ipaddr, *i_next;
struct rds_ib_device *rds_ibdev = container_of(work,
struct rds_ib_device, free_work);
if (rds_ibdev->mr_pool)
rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
if (rds_ibdev->mr)
ib_dereg_mr(rds_ibdev->mr);
if (rds_ibdev->pd)
ib_dealloc_pd(rds_ibdev->pd);
list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) {
list_del(&i_ipaddr->list);
kfree(i_ipaddr);
}
kfree(rds_ibdev);
}
void rds_ib_dev_put(struct rds_ib_device *rds_ibdev)
{
BUG_ON(atomic_read(&rds_ibdev->refcount) <= 0);
if (atomic_dec_and_test(&rds_ibdev->refcount))
queue_work(rds_wq, &rds_ibdev->free_work);
}
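A sketch of how the get/put pair above is meant to be used (the caller below is hypothetical); the point of pushing the free into krdsd is that the final rds_ib_dev_put() may legitimately happen from atomic context.

static void example_use_device(struct ib_device *device)
{
	struct rds_ib_device *rds_ibdev;

	rds_ibdev = rds_ib_get_client_data(device);	/* takes a reference, or NULL */
	if (!rds_ibdev)
		return;					/* device is being removed */

	/* ... use rds_ibdev->pd, rds_ibdev->mr_pool, ... */

	rds_ib_dev_put(rds_ibdev);	/* last put queues rds_ib_dev_free() on rds_wq */
}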
void rds_ib_add_one(struct ib_device *device) void rds_ib_add_one(struct ib_device *device)
{ {
struct rds_ib_device *rds_ibdev; struct rds_ib_device *rds_ibdev;
...@@ -77,11 +136,14 @@ void rds_ib_add_one(struct ib_device *device) ...@@ -77,11 +136,14 @@ void rds_ib_add_one(struct ib_device *device)
goto free_attr; goto free_attr;
} }
rds_ibdev = kmalloc(sizeof *rds_ibdev, GFP_KERNEL); rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL,
ibdev_to_node(device));
if (!rds_ibdev) if (!rds_ibdev)
goto free_attr; goto free_attr;
spin_lock_init(&rds_ibdev->spinlock); spin_lock_init(&rds_ibdev->spinlock);
atomic_set(&rds_ibdev->refcount, 1);
INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free);
rds_ibdev->max_wrs = dev_attr->max_qp_wr; rds_ibdev->max_wrs = dev_attr->max_qp_wr;
rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE); rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE);
...@@ -91,68 +153,107 @@ void rds_ib_add_one(struct ib_device *device) ...@@ -91,68 +153,107 @@ void rds_ib_add_one(struct ib_device *device)
min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) : min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) :
fmr_pool_size; fmr_pool_size;
rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom;
rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom;
rds_ibdev->dev = device; rds_ibdev->dev = device;
rds_ibdev->pd = ib_alloc_pd(device); rds_ibdev->pd = ib_alloc_pd(device);
if (IS_ERR(rds_ibdev->pd)) if (IS_ERR(rds_ibdev->pd)) {
goto free_dev; rds_ibdev->pd = NULL;
goto put_dev;
}
rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, IB_ACCESS_LOCAL_WRITE);
IB_ACCESS_LOCAL_WRITE); if (IS_ERR(rds_ibdev->mr)) {
if (IS_ERR(rds_ibdev->mr)) rds_ibdev->mr = NULL;
goto err_pd; goto put_dev;
}
rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev); rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev);
if (IS_ERR(rds_ibdev->mr_pool)) { if (IS_ERR(rds_ibdev->mr_pool)) {
rds_ibdev->mr_pool = NULL; rds_ibdev->mr_pool = NULL;
goto err_mr; goto put_dev;
} }
INIT_LIST_HEAD(&rds_ibdev->ipaddr_list); INIT_LIST_HEAD(&rds_ibdev->ipaddr_list);
INIT_LIST_HEAD(&rds_ibdev->conn_list); INIT_LIST_HEAD(&rds_ibdev->conn_list);
list_add_tail(&rds_ibdev->list, &rds_ib_devices);
down_write(&rds_ib_devices_lock);
list_add_tail_rcu(&rds_ibdev->list, &rds_ib_devices);
up_write(&rds_ib_devices_lock);
atomic_inc(&rds_ibdev->refcount);
ib_set_client_data(device, &rds_ib_client, rds_ibdev); ib_set_client_data(device, &rds_ib_client, rds_ibdev);
atomic_inc(&rds_ibdev->refcount);
goto free_attr; rds_ib_nodev_connect();
err_mr: put_dev:
ib_dereg_mr(rds_ibdev->mr); rds_ib_dev_put(rds_ibdev);
err_pd:
ib_dealloc_pd(rds_ibdev->pd);
free_dev:
kfree(rds_ibdev);
free_attr: free_attr:
kfree(dev_attr); kfree(dev_attr);
} }
/*
* New connections use this to find the device to associate with the
* connection. It's not in the fast path so we're not concerned about the
* performance of the IB call. (As of this writing, it uses an interrupt
* blocking spinlock to serialize walking a per-device list of all registered
* clients.)
*
* RCU is used to handle incoming connections racing with device teardown.
* Rather than use a lock to serialize removal from the client_data and
* getting a new reference, we use an RCU grace period. The destruction
* path removes the device from client_data and then waits for all RCU
* readers to finish.
*
* A new connection can get NULL from this if it's arriving on a
* device that is in the process of being removed.
*/
struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device)
{
struct rds_ib_device *rds_ibdev;
rcu_read_lock();
rds_ibdev = ib_get_client_data(device, &rds_ib_client);
if (rds_ibdev)
atomic_inc(&rds_ibdev->refcount);
rcu_read_unlock();
return rds_ibdev;
}
/*
* The IB stack is letting us know that a device is going away. This can
* happen if the underlying HCA driver is removed or if PCI hotplug is removing
* the pci function, for example.
*
* This can be called at any time and can be racing with any other RDS path.
*/
void rds_ib_remove_one(struct ib_device *device) void rds_ib_remove_one(struct ib_device *device)
{ {
struct rds_ib_device *rds_ibdev; struct rds_ib_device *rds_ibdev;
struct rds_ib_ipaddr *i_ipaddr, *i_next;
rds_ibdev = ib_get_client_data(device, &rds_ib_client); rds_ibdev = ib_get_client_data(device, &rds_ib_client);
if (!rds_ibdev) if (!rds_ibdev)
return; return;
list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) { rds_ib_dev_shutdown(rds_ibdev);
list_del(&i_ipaddr->list);
kfree(i_ipaddr);
}
rds_ib_destroy_conns(rds_ibdev); /* stop connection attempts from getting a reference to this device. */
ib_set_client_data(device, &rds_ib_client, NULL);
if (rds_ibdev->mr_pool) down_write(&rds_ib_devices_lock);
rds_ib_destroy_mr_pool(rds_ibdev->mr_pool); list_del_rcu(&rds_ibdev->list);
up_write(&rds_ib_devices_lock);
ib_dereg_mr(rds_ibdev->mr);
while (ib_dealloc_pd(rds_ibdev->pd)) {
rdsdebug("Failed to dealloc pd %p\n", rds_ibdev->pd);
msleep(1);
}
list_del(&rds_ibdev->list); /*
kfree(rds_ibdev); * This synchronize rcu is waiting for readers of both the ib
* client data and the devices list to finish before we drop
* both of those references.
*/
synchronize_rcu();
rds_ib_dev_put(rds_ibdev);
rds_ib_dev_put(rds_ibdev);
} }
struct ib_client rds_ib_client = { struct ib_client rds_ib_client = {
...@@ -186,7 +287,7 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn, ...@@ -186,7 +287,7 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid); rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid); rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); rds_ibdev = ic->rds_ibdev;
iinfo->max_send_wr = ic->i_send_ring.w_nr; iinfo->max_send_wr = ic->i_send_ring.w_nr;
iinfo->max_recv_wr = ic->i_recv_ring.w_nr; iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
iinfo->max_send_sge = rds_ibdev->max_sge; iinfo->max_send_sge = rds_ibdev->max_sge;
...@@ -248,29 +349,36 @@ static int rds_ib_laddr_check(__be32 addr) ...@@ -248,29 +349,36 @@ static int rds_ib_laddr_check(__be32 addr)
return ret; return ret;
} }
static void rds_ib_unregister_client(void)
{
ib_unregister_client(&rds_ib_client);
/* wait for rds_ib_dev_free() to complete */
flush_workqueue(rds_wq);
}
void rds_ib_exit(void) void rds_ib_exit(void)
{ {
rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
rds_ib_unregister_client();
rds_ib_destroy_nodev_conns(); rds_ib_destroy_nodev_conns();
ib_unregister_client(&rds_ib_client);
rds_ib_sysctl_exit(); rds_ib_sysctl_exit();
rds_ib_recv_exit(); rds_ib_recv_exit();
rds_trans_unregister(&rds_ib_transport); rds_trans_unregister(&rds_ib_transport);
rds_ib_fmr_exit();
} }
struct rds_transport rds_ib_transport = { struct rds_transport rds_ib_transport = {
.laddr_check = rds_ib_laddr_check, .laddr_check = rds_ib_laddr_check,
.xmit_complete = rds_ib_xmit_complete, .xmit_complete = rds_ib_xmit_complete,
.xmit = rds_ib_xmit, .xmit = rds_ib_xmit,
.xmit_cong_map = NULL,
.xmit_rdma = rds_ib_xmit_rdma, .xmit_rdma = rds_ib_xmit_rdma,
.xmit_atomic = rds_ib_xmit_atomic,
.recv = rds_ib_recv, .recv = rds_ib_recv,
.conn_alloc = rds_ib_conn_alloc, .conn_alloc = rds_ib_conn_alloc,
.conn_free = rds_ib_conn_free, .conn_free = rds_ib_conn_free,
.conn_connect = rds_ib_conn_connect, .conn_connect = rds_ib_conn_connect,
.conn_shutdown = rds_ib_conn_shutdown, .conn_shutdown = rds_ib_conn_shutdown,
.inc_copy_to_user = rds_ib_inc_copy_to_user, .inc_copy_to_user = rds_ib_inc_copy_to_user,
.inc_purge = rds_ib_inc_purge,
.inc_free = rds_ib_inc_free, .inc_free = rds_ib_inc_free,
.cm_initiate_connect = rds_ib_cm_initiate_connect, .cm_initiate_connect = rds_ib_cm_initiate_connect,
.cm_handle_connect = rds_ib_cm_handle_connect, .cm_handle_connect = rds_ib_cm_handle_connect,
...@@ -286,16 +394,20 @@ struct rds_transport rds_ib_transport = { ...@@ -286,16 +394,20 @@ struct rds_transport rds_ib_transport = {
.t_type = RDS_TRANS_IB .t_type = RDS_TRANS_IB
}; };
int __init rds_ib_init(void) int rds_ib_init(void)
{ {
int ret; int ret;
INIT_LIST_HEAD(&rds_ib_devices); INIT_LIST_HEAD(&rds_ib_devices);
ret = ib_register_client(&rds_ib_client); ret = rds_ib_fmr_init();
if (ret) if (ret)
goto out; goto out;
ret = ib_register_client(&rds_ib_client);
if (ret)
goto out_fmr_exit;
ret = rds_ib_sysctl_init(); ret = rds_ib_sysctl_init();
if (ret) if (ret)
goto out_ibreg; goto out_ibreg;
...@@ -317,7 +429,9 @@ int __init rds_ib_init(void) ...@@ -317,7 +429,9 @@ int __init rds_ib_init(void)
out_sysctl: out_sysctl:
rds_ib_sysctl_exit(); rds_ib_sysctl_exit();
out_ibreg: out_ibreg:
ib_unregister_client(&rds_ib_client); rds_ib_unregister_client();
out_fmr_exit:
rds_ib_fmr_exit();
out: out:
return ret; return ret;
} }
......
...@@ -3,11 +3,13 @@ ...@@ -3,11 +3,13 @@
#include <rdma/ib_verbs.h> #include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h> #include <rdma/rdma_cm.h>
#include <linux/pci.h>
#include <linux/slab.h>
#include "rds.h" #include "rds.h"
#include "rdma_transport.h" #include "rdma_transport.h"
#define RDS_FMR_SIZE 256 #define RDS_FMR_SIZE 256
#define RDS_FMR_POOL_SIZE 4096 #define RDS_FMR_POOL_SIZE 8192
#define RDS_IB_MAX_SGE 8 #define RDS_IB_MAX_SGE 8
#define RDS_IB_RECV_SGE 2 #define RDS_IB_RECV_SGE 2
...@@ -19,6 +21,9 @@ ...@@ -19,6 +21,9 @@
#define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */ #define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */
#define RDS_IB_RECYCLE_BATCH_COUNT 32
extern struct rw_semaphore rds_ib_devices_lock;
extern struct list_head rds_ib_devices; extern struct list_head rds_ib_devices;
/* /*
...@@ -26,20 +31,29 @@ extern struct list_head rds_ib_devices; ...@@ -26,20 +31,29 @@ extern struct list_head rds_ib_devices;
* try and minimize the amount of memory tied up both the device and * try and minimize the amount of memory tied up both the device and
* socket receive queues. * socket receive queues.
*/ */
/* page offset of the final full frag that fits in the page */
#define RDS_PAGE_LAST_OFF (((PAGE_SIZE / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE)
struct rds_page_frag { struct rds_page_frag {
struct list_head f_item; struct list_head f_item;
struct page *f_page; struct list_head f_cache_entry;
unsigned long f_offset; struct scatterlist f_sg;
dma_addr_t f_mapped;
}; };
struct rds_ib_incoming { struct rds_ib_incoming {
struct list_head ii_frags; struct list_head ii_frags;
struct list_head ii_cache_entry;
struct rds_incoming ii_inc; struct rds_incoming ii_inc;
}; };
struct rds_ib_cache_head {
struct list_head *first;
unsigned long count;
};
struct rds_ib_refill_cache {
struct rds_ib_cache_head *percpu;
struct list_head *xfer;
struct list_head *ready;
};
struct rds_ib_connect_private { struct rds_ib_connect_private {
/* Add new fields at the end, and don't permute existing fields. */ /* Add new fields at the end, and don't permute existing fields. */
__be32 dp_saddr; __be32 dp_saddr;
...@@ -53,8 +67,7 @@ struct rds_ib_connect_private { ...@@ -53,8 +67,7 @@ struct rds_ib_connect_private {
}; };
struct rds_ib_send_work { struct rds_ib_send_work {
struct rds_message *s_rm; void *s_op;
struct rds_rdma_op *s_op;
struct ib_send_wr s_wr; struct ib_send_wr s_wr;
struct ib_sge s_sge[RDS_IB_MAX_SGE]; struct ib_sge s_sge[RDS_IB_MAX_SGE];
unsigned long s_queued; unsigned long s_queued;
...@@ -92,10 +105,11 @@ struct rds_ib_connection { ...@@ -92,10 +105,11 @@ struct rds_ib_connection {
/* tx */ /* tx */
struct rds_ib_work_ring i_send_ring; struct rds_ib_work_ring i_send_ring;
struct rds_message *i_rm; struct rm_data_op *i_data_op;
struct rds_header *i_send_hdrs; struct rds_header *i_send_hdrs;
u64 i_send_hdrs_dma; u64 i_send_hdrs_dma;
struct rds_ib_send_work *i_sends; struct rds_ib_send_work *i_sends;
atomic_t i_signaled_sends;
/* rx */ /* rx */
struct tasklet_struct i_recv_tasklet; struct tasklet_struct i_recv_tasklet;
...@@ -106,8 +120,9 @@ struct rds_ib_connection { ...@@ -106,8 +120,9 @@ struct rds_ib_connection {
struct rds_header *i_recv_hdrs; struct rds_header *i_recv_hdrs;
u64 i_recv_hdrs_dma; u64 i_recv_hdrs_dma;
struct rds_ib_recv_work *i_recvs; struct rds_ib_recv_work *i_recvs;
struct rds_page_frag i_frag;
u64 i_ack_recv; /* last ACK received */ u64 i_ack_recv; /* last ACK received */
struct rds_ib_refill_cache i_cache_incs;
struct rds_ib_refill_cache i_cache_frags;
/* sending acks */ /* sending acks */
unsigned long i_ack_flags; unsigned long i_ack_flags;
...@@ -138,7 +153,6 @@ struct rds_ib_connection { ...@@ -138,7 +153,6 @@ struct rds_ib_connection {
/* Batched completions */ /* Batched completions */
unsigned int i_unsignaled_wrs; unsigned int i_unsignaled_wrs;
long i_unsignaled_bytes;
}; };
/* This assumes that atomic_t is at least 32 bits */ /* This assumes that atomic_t is at least 32 bits */
...@@ -164,9 +178,17 @@ struct rds_ib_device { ...@@ -164,9 +178,17 @@ struct rds_ib_device {
unsigned int max_fmrs; unsigned int max_fmrs;
int max_sge; int max_sge;
unsigned int max_wrs; unsigned int max_wrs;
unsigned int max_initiator_depth;
unsigned int max_responder_resources;
spinlock_t spinlock; /* protect the above */ spinlock_t spinlock; /* protect the above */
atomic_t refcount;
struct work_struct free_work;
}; };
#define pcidev_to_node(pcidev) pcibus_to_node(pcidev->bus)
#define ibdev_to_node(ibdev) pcidev_to_node(to_pci_dev(ibdev->dma_device))
#define rdsibdev_to_node(rdsibdev) ibdev_to_node(rdsibdev->dev)
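These node helpers let allocations follow the HCA's NUMA node, as the kzalloc_node() call in rds_ib_add_one() above already does. A hypothetical further use, allocating a per-connection send ring on the device's node (needs linux/vmalloc.h):

static struct rds_ib_send_work *
alloc_send_ring(struct rds_ib_device *rds_ibdev, u32 nr)
{
	/* keep the ring on the same node as the device that DMAs into it */
	return vmalloc_node(nr * sizeof(struct rds_ib_send_work),
			    rdsibdev_to_node(rds_ibdev));
}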
/* bits for i_ack_flags */ /* bits for i_ack_flags */
#define IB_ACK_IN_FLIGHT 0 #define IB_ACK_IN_FLIGHT 0
#define IB_ACK_REQUESTED 1 #define IB_ACK_REQUESTED 1
...@@ -202,6 +224,8 @@ struct rds_ib_statistics { ...@@ -202,6 +224,8 @@ struct rds_ib_statistics {
uint64_t s_ib_rdma_mr_pool_flush; uint64_t s_ib_rdma_mr_pool_flush;
uint64_t s_ib_rdma_mr_pool_wait; uint64_t s_ib_rdma_mr_pool_wait;
uint64_t s_ib_rdma_mr_pool_depleted; uint64_t s_ib_rdma_mr_pool_depleted;
uint64_t s_ib_atomic_cswp;
uint64_t s_ib_atomic_fadd;
}; };
extern struct workqueue_struct *rds_ib_wq; extern struct workqueue_struct *rds_ib_wq;
...@@ -243,6 +267,8 @@ static inline void rds_ib_dma_sync_sg_for_device(struct ib_device *dev, ...@@ -243,6 +267,8 @@ static inline void rds_ib_dma_sync_sg_for_device(struct ib_device *dev,
extern struct rds_transport rds_ib_transport; extern struct rds_transport rds_ib_transport;
extern void rds_ib_add_one(struct ib_device *device); extern void rds_ib_add_one(struct ib_device *device);
extern void rds_ib_remove_one(struct ib_device *device); extern void rds_ib_remove_one(struct ib_device *device);
struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device);
void rds_ib_dev_put(struct rds_ib_device *rds_ibdev);
extern struct ib_client rds_ib_client; extern struct ib_client rds_ib_client;
extern unsigned int fmr_pool_size; extern unsigned int fmr_pool_size;
...@@ -258,7 +284,7 @@ void rds_ib_conn_free(void *arg); ...@@ -258,7 +284,7 @@ void rds_ib_conn_free(void *arg);
int rds_ib_conn_connect(struct rds_connection *conn); int rds_ib_conn_connect(struct rds_connection *conn);
void rds_ib_conn_shutdown(struct rds_connection *conn); void rds_ib_conn_shutdown(struct rds_connection *conn);
void rds_ib_state_change(struct sock *sk); void rds_ib_state_change(struct sock *sk);
int __init rds_ib_listen_init(void); int rds_ib_listen_init(void);
void rds_ib_listen_stop(void); void rds_ib_listen_stop(void);
void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...); void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...);
int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
...@@ -275,15 +301,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, ...@@ -275,15 +301,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn,
int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr); int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr);
void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
void __rds_ib_destroy_conns(struct list_head *list, spinlock_t *list_lock); void rds_ib_destroy_nodev_conns(void);
static inline void rds_ib_destroy_nodev_conns(void)
{
__rds_ib_destroy_conns(&ib_nodev_conns, &ib_nodev_conns_lock);
}
static inline void rds_ib_destroy_conns(struct rds_ib_device *rds_ibdev)
{
__rds_ib_destroy_conns(&rds_ibdev->conn_list, &rds_ibdev->spinlock);
}
struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *); struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *);
void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo); void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo);
void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *); void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
...@@ -292,14 +310,16 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, ...@@ -292,14 +310,16 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
void rds_ib_sync_mr(void *trans_private, int dir); void rds_ib_sync_mr(void *trans_private, int dir);
void rds_ib_free_mr(void *trans_private, int invalidate); void rds_ib_free_mr(void *trans_private, int invalidate);
void rds_ib_flush_mrs(void); void rds_ib_flush_mrs(void);
int rds_ib_fmr_init(void);
void rds_ib_fmr_exit(void);
/* ib_recv.c */ /* ib_recv.c */
int __init rds_ib_recv_init(void); int rds_ib_recv_init(void);
void rds_ib_recv_exit(void); void rds_ib_recv_exit(void);
int rds_ib_recv(struct rds_connection *conn); int rds_ib_recv(struct rds_connection *conn);
int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic);
gfp_t page_gfp, int prefill); void rds_ib_recv_free_caches(struct rds_ib_connection *ic);
void rds_ib_inc_purge(struct rds_incoming *inc); void rds_ib_recv_refill(struct rds_connection *conn, int prefill);
void rds_ib_inc_free(struct rds_incoming *inc); void rds_ib_inc_free(struct rds_incoming *inc);
int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
size_t size); size_t size);
...@@ -325,17 +345,19 @@ u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest); ...@@ -325,17 +345,19 @@ u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest);
extern wait_queue_head_t rds_ib_ring_empty_wait; extern wait_queue_head_t rds_ib_ring_empty_wait;
/* ib_send.c */ /* ib_send.c */
char *rds_ib_wc_status_str(enum ib_wc_status status);
void rds_ib_xmit_complete(struct rds_connection *conn); void rds_ib_xmit_complete(struct rds_connection *conn);
int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
unsigned int hdr_off, unsigned int sg, unsigned int off); unsigned int hdr_off, unsigned int sg, unsigned int off);
void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context); void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context);
void rds_ib_send_init_ring(struct rds_ib_connection *ic); void rds_ib_send_init_ring(struct rds_ib_connection *ic);
void rds_ib_send_clear_ring(struct rds_ib_connection *ic); void rds_ib_send_clear_ring(struct rds_ib_connection *ic);
int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op); int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op);
void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits); void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits);
void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted); void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted);
int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted, int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted,
u32 *adv_credits, int need_posted, int max_posted); u32 *adv_credits, int need_posted, int max_posted);
int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op);
/* ib_stats.c */ /* ib_stats.c */
DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats); DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats);
...@@ -344,7 +366,7 @@ unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter, ...@@ -344,7 +366,7 @@ unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
unsigned int avail); unsigned int avail);
/* ib_sysctl.c */ /* ib_sysctl.c */
int __init rds_ib_sysctl_init(void); int rds_ib_sysctl_init(void);
void rds_ib_sysctl_exit(void); void rds_ib_sysctl_exit(void);
extern unsigned long rds_ib_sysctl_max_send_wr; extern unsigned long rds_ib_sysctl_max_send_wr;
extern unsigned long rds_ib_sysctl_max_recv_wr; extern unsigned long rds_ib_sysctl_max_recv_wr;
...@@ -354,28 +376,4 @@ extern unsigned long rds_ib_sysctl_max_recv_allocation; ...@@ -354,28 +376,4 @@ extern unsigned long rds_ib_sysctl_max_recv_allocation;
extern unsigned int rds_ib_sysctl_flow_control; extern unsigned int rds_ib_sysctl_flow_control;
extern ctl_table rds_ib_sysctl_table[]; extern ctl_table rds_ib_sysctl_table[];
/*
* Helper functions for getting/setting the header and data SGEs in
* RDS packets (not RDMA)
*
* From version 3.1 onwards, header is in front of data in the sge.
*/
static inline struct ib_sge *
rds_ib_header_sge(struct rds_ib_connection *ic, struct ib_sge *sge)
{
if (ic->conn->c_version > RDS_PROTOCOL_3_0)
return &sge[0];
else
return &sge[1];
}
static inline struct ib_sge *
rds_ib_data_sge(struct rds_ib_connection *ic, struct ib_sge *sge)
{
if (ic->conn->c_version > RDS_PROTOCOL_3_0)
return &sge[1];
else
return &sge[0];
}
#endif #endif
...@@ -38,6 +38,36 @@ ...@@ -38,6 +38,36 @@
#include "rds.h" #include "rds.h"
#include "ib.h" #include "ib.h"
static char *rds_ib_event_type_strings[] = {
#define RDS_IB_EVENT_STRING(foo) \
[IB_EVENT_##foo] = __stringify(IB_EVENT_##foo)
RDS_IB_EVENT_STRING(CQ_ERR),
RDS_IB_EVENT_STRING(QP_FATAL),
RDS_IB_EVENT_STRING(QP_REQ_ERR),
RDS_IB_EVENT_STRING(QP_ACCESS_ERR),
RDS_IB_EVENT_STRING(COMM_EST),
RDS_IB_EVENT_STRING(SQ_DRAINED),
RDS_IB_EVENT_STRING(PATH_MIG),
RDS_IB_EVENT_STRING(PATH_MIG_ERR),
RDS_IB_EVENT_STRING(DEVICE_FATAL),
RDS_IB_EVENT_STRING(PORT_ACTIVE),
RDS_IB_EVENT_STRING(PORT_ERR),
RDS_IB_EVENT_STRING(LID_CHANGE),
RDS_IB_EVENT_STRING(PKEY_CHANGE),
RDS_IB_EVENT_STRING(SM_CHANGE),
RDS_IB_EVENT_STRING(SRQ_ERR),
RDS_IB_EVENT_STRING(SRQ_LIMIT_REACHED),
RDS_IB_EVENT_STRING(QP_LAST_WQE_REACHED),
RDS_IB_EVENT_STRING(CLIENT_REREGISTER),
#undef RDS_IB_EVENT_STRING
};
static char *rds_ib_event_str(enum ib_event_type type)
{
return rds_str_array(rds_ib_event_type_strings,
ARRAY_SIZE(rds_ib_event_type_strings), type);
};
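A small usage example (hypothetical handler) of the table above together with the rds_str_array() helper added in af_rds.c; out-of-range or unnamed event values fall back to "unknown":

static void example_qp_event_handler(struct ib_event *event, void *data)
{
	struct rds_connection *conn = data;

	printk(KERN_WARNING "RDS/IB: QP event %u (%s) on connection %p\n",
	       event->event, rds_ib_event_str(event->event), conn);
}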
/* /*
* Set the selected protocol version * Set the selected protocol version
*/ */
...@@ -95,7 +125,6 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even ...@@ -95,7 +125,6 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
{ {
const struct rds_ib_connect_private *dp = NULL; const struct rds_ib_connect_private *dp = NULL;
struct rds_ib_connection *ic = conn->c_transport_data; struct rds_ib_connection *ic = conn->c_transport_data;
struct rds_ib_device *rds_ibdev;
struct ib_qp_attr qp_attr; struct ib_qp_attr qp_attr;
int err; int err;
...@@ -111,11 +140,21 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even ...@@ -111,11 +140,21 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
} }
} }
printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n", if (conn->c_version < RDS_PROTOCOL(3,1)) {
&conn->c_faddr, printk(KERN_NOTICE "RDS/IB: Connection to %pI4 version %u.%u failed,"
RDS_PROTOCOL_MAJOR(conn->c_version), " no longer supported\n",
RDS_PROTOCOL_MINOR(conn->c_version), &conn->c_faddr,
ic->i_flowctl ? ", flow control" : ""); RDS_PROTOCOL_MAJOR(conn->c_version),
RDS_PROTOCOL_MINOR(conn->c_version));
rds_conn_destroy(conn);
return;
} else {
printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n",
&conn->c_faddr,
RDS_PROTOCOL_MAJOR(conn->c_version),
RDS_PROTOCOL_MINOR(conn->c_version),
ic->i_flowctl ? ", flow control" : "");
}
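The connect-complete path now refuses peers older than RDS_PROTOCOL(3,1). The version macros themselves are outside this diff; a plausible encoding, consistent with the RDS_PROTOCOL_MAJOR/MINOR accessors used in the printk above, is sketched below (assumed layout, not confirmed by this hunk):

/* Assumed encoding: major version in the high byte, minor in the low byte. */
#define RDS_PROTOCOL_MAJOR(v)	((v) >> 8)
#define RDS_PROTOCOL_MINOR(v)	((v) & 255)
#define RDS_PROTOCOL(maj, min)	(((maj) << 8) | (min))
#define RDS_PROTOCOL_3_0	RDS_PROTOCOL(3, 0)
#define RDS_PROTOCOL_3_1	RDS_PROTOCOL(3, 1)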
/* /*
* Init rings and fill recv. this needs to wait until protocol negotiation * Init rings and fill recv. this needs to wait until protocol negotiation
...@@ -125,7 +164,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even ...@@ -125,7 +164,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
rds_ib_recv_init_ring(ic); rds_ib_recv_init_ring(ic);
/* Post receive buffers - as a side effect, this will update /* Post receive buffers - as a side effect, this will update
* the posted credit count. */ * the posted credit count. */
rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1); rds_ib_recv_refill(conn, 1);
/* Tune RNR behavior */ /* Tune RNR behavior */
rds_ib_tune_rnr(ic, &qp_attr); rds_ib_tune_rnr(ic, &qp_attr);
...@@ -135,12 +174,11 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even ...@@ -135,12 +174,11 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
if (err) if (err)
printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err); printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err);
/* update ib_device with this local ipaddr & conn */ /* update ib_device with this local ipaddr */
rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); err = rds_ib_update_ipaddr(ic->rds_ibdev, conn->c_laddr);
err = rds_ib_update_ipaddr(rds_ibdev, conn->c_laddr);
if (err) if (err)
printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", err); printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n",
rds_ib_add_conn(rds_ibdev, conn); err);
/* If the peer gave us the last packet it saw, process this as if /* If the peer gave us the last packet it saw, process this as if
* we had received a regular ACK. */ * we had received a regular ACK. */
...@@ -153,18 +191,23 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even ...@@ -153,18 +191,23 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
static void rds_ib_cm_fill_conn_param(struct rds_connection *conn, static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
struct rdma_conn_param *conn_param, struct rdma_conn_param *conn_param,
struct rds_ib_connect_private *dp, struct rds_ib_connect_private *dp,
u32 protocol_version) u32 protocol_version,
u32 max_responder_resources,
u32 max_initiator_depth)
{ {
struct rds_ib_connection *ic = conn->c_transport_data;
struct rds_ib_device *rds_ibdev = ic->rds_ibdev;
memset(conn_param, 0, sizeof(struct rdma_conn_param)); memset(conn_param, 0, sizeof(struct rdma_conn_param));
/* XXX tune these? */
conn_param->responder_resources = 1; conn_param->responder_resources =
conn_param->initiator_depth = 1; min_t(u32, rds_ibdev->max_responder_resources, max_responder_resources);
conn_param->initiator_depth =
min_t(u32, rds_ibdev->max_initiator_depth, max_initiator_depth);
conn_param->retry_count = min_t(unsigned int, rds_ib_retry_count, 7); conn_param->retry_count = min_t(unsigned int, rds_ib_retry_count, 7);
conn_param->rnr_retry_count = 7; conn_param->rnr_retry_count = 7;
if (dp) { if (dp) {
struct rds_ib_connection *ic = conn->c_transport_data;
memset(dp, 0, sizeof(*dp)); memset(dp, 0, sizeof(*dp));
dp->dp_saddr = conn->c_laddr; dp->dp_saddr = conn->c_laddr;
dp->dp_daddr = conn->c_faddr; dp->dp_daddr = conn->c_faddr;
...@@ -189,7 +232,8 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn, ...@@ -189,7 +232,8 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
static void rds_ib_cq_event_handler(struct ib_event *event, void *data) static void rds_ib_cq_event_handler(struct ib_event *event, void *data)
{ {
rdsdebug("event %u data %p\n", event->event, data); rdsdebug("event %u (%s) data %p\n",
event->event, rds_ib_event_str(event->event), data);
} }
static void rds_ib_qp_event_handler(struct ib_event *event, void *data) static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
...@@ -197,16 +241,18 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data) ...@@ -197,16 +241,18 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
struct rds_connection *conn = data; struct rds_connection *conn = data;
struct rds_ib_connection *ic = conn->c_transport_data; struct rds_ib_connection *ic = conn->c_transport_data;
rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event); rdsdebug("conn %p ic %p event %u (%s)\n", conn, ic, event->event,
rds_ib_event_str(event->event));
switch (event->event) { switch (event->event) {
case IB_EVENT_COMM_EST: case IB_EVENT_COMM_EST:
rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST); rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
break; break;
default: default:
rdsdebug("Fatal QP Event %u " rdsdebug("Fatal QP Event %u (%s) "
"- connection %pI4->%pI4, reconnecting\n", "- connection %pI4->%pI4, reconnecting\n",
event->event, &conn->c_laddr, &conn->c_faddr); event->event, rds_ib_event_str(event->event),
&conn->c_laddr, &conn->c_faddr);
rds_conn_drop(conn); rds_conn_drop(conn);
break; break;
} }
...@@ -224,18 +270,16 @@ static int rds_ib_setup_qp(struct rds_connection *conn) ...@@ -224,18 +270,16 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
struct rds_ib_device *rds_ibdev; struct rds_ib_device *rds_ibdev;
int ret; int ret;
/* rds_ib_add_one creates a rds_ib_device object per IB device, /*
* and allocates a protection domain, memory range and FMR pool * It's normal to see a null device if an incoming connection races
* for each. If that fails for any reason, it will not register * with device removal, so we don't print a warning.
* the rds_ibdev at all.
*/ */
rds_ibdev = ib_get_client_data(dev, &rds_ib_client); rds_ibdev = rds_ib_get_client_data(dev);
if (rds_ibdev == NULL) { if (!rds_ibdev)
if (printk_ratelimit())
printk(KERN_NOTICE "RDS/IB: No client_data for device %s\n",
dev->name);
return -EOPNOTSUPP; return -EOPNOTSUPP;
}
/* add the conn now so that connection establishment has the dev */
rds_ib_add_conn(rds_ibdev, conn);
if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1) if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1)
rds_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1); rds_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1);
...@@ -306,7 +350,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) ...@@ -306,7 +350,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
ic->i_send_ring.w_nr * ic->i_send_ring.w_nr *
sizeof(struct rds_header), sizeof(struct rds_header),
&ic->i_send_hdrs_dma, GFP_KERNEL); &ic->i_send_hdrs_dma, GFP_KERNEL);
if (ic->i_send_hdrs == NULL) { if (!ic->i_send_hdrs) {
ret = -ENOMEM; ret = -ENOMEM;
rdsdebug("ib_dma_alloc_coherent send failed\n"); rdsdebug("ib_dma_alloc_coherent send failed\n");
goto out; goto out;
...@@ -316,7 +360,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) ...@@ -316,7 +360,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
ic->i_recv_ring.w_nr * ic->i_recv_ring.w_nr *
sizeof(struct rds_header), sizeof(struct rds_header),
&ic->i_recv_hdrs_dma, GFP_KERNEL); &ic->i_recv_hdrs_dma, GFP_KERNEL);
if (ic->i_recv_hdrs == NULL) { if (!ic->i_recv_hdrs) {
ret = -ENOMEM; ret = -ENOMEM;
rdsdebug("ib_dma_alloc_coherent recv failed\n"); rdsdebug("ib_dma_alloc_coherent recv failed\n");
goto out; goto out;
...@@ -324,22 +368,24 @@ static int rds_ib_setup_qp(struct rds_connection *conn) ...@@ -324,22 +368,24 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header), ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
&ic->i_ack_dma, GFP_KERNEL); &ic->i_ack_dma, GFP_KERNEL);
if (ic->i_ack == NULL) { if (!ic->i_ack) {
ret = -ENOMEM; ret = -ENOMEM;
rdsdebug("ib_dma_alloc_coherent ack failed\n"); rdsdebug("ib_dma_alloc_coherent ack failed\n");
goto out; goto out;
} }
ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work)); ic->i_sends = vmalloc_node(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work),
if (ic->i_sends == NULL) { ibdev_to_node(dev));
if (!ic->i_sends) {
ret = -ENOMEM; ret = -ENOMEM;
rdsdebug("send allocation failed\n"); rdsdebug("send allocation failed\n");
goto out; goto out;
} }
memset(ic->i_sends, 0, ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work)); memset(ic->i_sends, 0, ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work));
ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work)); ic->i_recvs = vmalloc_node(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work),
if (ic->i_recvs == NULL) { ibdev_to_node(dev));
if (!ic->i_recvs) {
ret = -ENOMEM; ret = -ENOMEM;
rdsdebug("recv allocation failed\n"); rdsdebug("recv allocation failed\n");
goto out; goto out;
...@@ -352,6 +398,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) ...@@ -352,6 +398,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
ic->i_send_cq, ic->i_recv_cq); ic->i_send_cq, ic->i_recv_cq);
out: out:
rds_ib_dev_put(rds_ibdev);
return ret; return ret;
} }
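rds_ib_setup_qp() now allocates its work-request arrays with vmalloc_node()/kmalloc_node() keyed off ibdev_to_node() and rdsibdev_to_node(), neither of which appears in this view. A hedged sketch of what those helpers are assumed to do, namely map the HCA's DMA device to a NUMA node so ring state lands near the adapter:

/* Assumed helpers (not shown in this diff): derive a NUMA node from the
 * IB device's parent DMA device, defaulting to NUMA_NO_NODE. */
static inline int ibdev_to_node(struct ib_device *ibdev)
{
	struct device *parent = ibdev->dma_device;

	return parent ? dev_to_node(parent) : NUMA_NO_NODE;
}

#define rdsibdev_to_node(rds_ibdev) ibdev_to_node((rds_ibdev)->dev)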
...@@ -409,7 +456,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, ...@@ -409,7 +456,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
struct rds_ib_connection *ic = NULL; struct rds_ib_connection *ic = NULL;
struct rdma_conn_param conn_param; struct rdma_conn_param conn_param;
u32 version; u32 version;
int err, destroy = 1; int err = 1, destroy = 1;
/* Check whether the remote protocol version matches ours. */ /* Check whether the remote protocol version matches ours. */
version = rds_ib_protocol_compatible(event); version = rds_ib_protocol_compatible(event);
...@@ -448,7 +495,6 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, ...@@ -448,7 +495,6 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
/* Wait and see - our connect may still be succeeding */ /* Wait and see - our connect may still be succeeding */
rds_ib_stats_inc(s_ib_connect_raced); rds_ib_stats_inc(s_ib_connect_raced);
} }
mutex_unlock(&conn->c_cm_lock);
goto out; goto out;
} }
...@@ -479,20 +525,20 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, ...@@ -479,20 +525,20 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
goto out; goto out;
} }
rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version); rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version,
event->param.conn.responder_resources,
event->param.conn.initiator_depth);
/* rdma_accept() calls rdma_reject() internally if it fails */ /* rdma_accept() calls rdma_reject() internally if it fails */
err = rdma_accept(cm_id, &conn_param); err = rdma_accept(cm_id, &conn_param);
mutex_unlock(&conn->c_cm_lock); if (err)
if (err) {
rds_ib_conn_error(conn, "rdma_accept failed (%d)\n", err); rds_ib_conn_error(conn, "rdma_accept failed (%d)\n", err);
goto out;
}
return 0;
out: out:
rdma_reject(cm_id, NULL, 0); if (conn)
mutex_unlock(&conn->c_cm_lock);
if (err)
rdma_reject(cm_id, NULL, 0);
return destroy; return destroy;
} }
...@@ -516,8 +562,8 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id) ...@@ -516,8 +562,8 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
goto out; goto out;
} }
rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION); rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION,
UINT_MAX, UINT_MAX);
ret = rdma_connect(cm_id, &conn_param); ret = rdma_connect(cm_id, &conn_param);
if (ret) if (ret)
rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret); rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret);
...@@ -601,9 +647,19 @@ void rds_ib_conn_shutdown(struct rds_connection *conn) ...@@ -601,9 +647,19 @@ void rds_ib_conn_shutdown(struct rds_connection *conn)
ic->i_cm_id, err); ic->i_cm_id, err);
} }
/*
* We want to wait for tx and rx completion to finish
* before we tear down the connection, but we have to be
* careful not to get stuck waiting on a send ring that
* only has unsignaled sends in it. We've shut down new
* sends before getting here, so by waiting for signaled
* sends to complete we are assured that there will be no
* more tx processing.
*/
wait_event(rds_ib_ring_empty_wait, wait_event(rds_ib_ring_empty_wait,
rds_ib_ring_empty(&ic->i_send_ring) && rds_ib_ring_empty(&ic->i_recv_ring) &&
rds_ib_ring_empty(&ic->i_recv_ring)); (atomic_read(&ic->i_signaled_sends) == 0));
tasklet_kill(&ic->i_recv_tasklet);
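The comment and wait above rely on the new i_signaled_sends counter; its send-side half is not in this hunk. The pairing is presumably along these lines (hypothetical helper names, shown only to make the hand-off explicit):

/* Sketch only: posting a signaled send takes a reference ... */
static void rds_ib_signaled_send_posted(struct rds_ib_connection *ic)
{
	atomic_inc(&ic->i_signaled_sends);
}

/* ... and the send completion handler drops it, waking the shutdown
 * waiter once the last signaled work request has completed. */
static void rds_ib_signaled_send_complete(struct rds_ib_connection *ic)
{
	if (atomic_dec_and_test(&ic->i_signaled_sends))
		wake_up(&rds_ib_ring_empty_wait);
}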
if (ic->i_send_hdrs) if (ic->i_send_hdrs)
ib_dma_free_coherent(dev, ib_dma_free_coherent(dev,
...@@ -654,9 +710,12 @@ void rds_ib_conn_shutdown(struct rds_connection *conn) ...@@ -654,9 +710,12 @@ void rds_ib_conn_shutdown(struct rds_connection *conn)
BUG_ON(ic->rds_ibdev); BUG_ON(ic->rds_ibdev);
/* Clear pending transmit */ /* Clear pending transmit */
if (ic->i_rm) { if (ic->i_data_op) {
rds_message_put(ic->i_rm); struct rds_message *rm;
ic->i_rm = NULL;
rm = container_of(ic->i_data_op, struct rds_message, data);
rds_message_put(rm);
ic->i_data_op = NULL;
} }
/* Clear the ACK state */ /* Clear the ACK state */
...@@ -690,12 +749,19 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp) ...@@ -690,12 +749,19 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
{ {
struct rds_ib_connection *ic; struct rds_ib_connection *ic;
unsigned long flags; unsigned long flags;
int ret;
/* XXX too lazy? */ /* XXX too lazy? */
ic = kzalloc(sizeof(struct rds_ib_connection), GFP_KERNEL); ic = kzalloc(sizeof(struct rds_ib_connection), GFP_KERNEL);
if (ic == NULL) if (!ic)
return -ENOMEM; return -ENOMEM;
ret = rds_ib_recv_alloc_caches(ic);
if (ret) {
kfree(ic);
return ret;
}
INIT_LIST_HEAD(&ic->ib_node); INIT_LIST_HEAD(&ic->ib_node);
tasklet_init(&ic->i_recv_tasklet, rds_ib_recv_tasklet_fn, tasklet_init(&ic->i_recv_tasklet, rds_ib_recv_tasklet_fn,
(unsigned long) ic); (unsigned long) ic);
...@@ -703,6 +769,7 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp) ...@@ -703,6 +769,7 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
#ifndef KERNEL_HAS_ATOMIC64 #ifndef KERNEL_HAS_ATOMIC64
spin_lock_init(&ic->i_ack_lock); spin_lock_init(&ic->i_ack_lock);
#endif #endif
atomic_set(&ic->i_signaled_sends, 0);
/* /*
* rds_ib_conn_shutdown() waits for these to be emptied so they * rds_ib_conn_shutdown() waits for these to be emptied so they
...@@ -744,6 +811,8 @@ void rds_ib_conn_free(void *arg) ...@@ -744,6 +811,8 @@ void rds_ib_conn_free(void *arg)
list_del(&ic->ib_node); list_del(&ic->ib_node);
spin_unlock_irq(lock_ptr); spin_unlock_irq(lock_ptr);
rds_ib_recv_free_caches(ic);
kfree(ic); kfree(ic);
} }
......
...@@ -32,11 +32,16 @@ ...@@ -32,11 +32,16 @@
*/ */
#include <linux/kernel.h> #include <linux/kernel.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/rculist.h>
#include "rds.h" #include "rds.h"
#include "rdma.h"
#include "ib.h" #include "ib.h"
#include "xlist.h"
struct workqueue_struct *rds_ib_fmr_wq;
static DEFINE_PER_CPU(unsigned long, clean_list_grace);
#define CLEAN_LIST_BUSY_BIT 0
/* /*
* This is stored as mr->r_trans_private. * This is stored as mr->r_trans_private.
...@@ -45,7 +50,11 @@ struct rds_ib_mr { ...@@ -45,7 +50,11 @@ struct rds_ib_mr {
struct rds_ib_device *device; struct rds_ib_device *device;
struct rds_ib_mr_pool *pool; struct rds_ib_mr_pool *pool;
struct ib_fmr *fmr; struct ib_fmr *fmr;
struct list_head list;
struct xlist_head xlist;
/* unmap_list is for freeing */
struct list_head unmap_list;
unsigned int remap_count; unsigned int remap_count;
struct scatterlist *sg; struct scatterlist *sg;
...@@ -59,14 +68,16 @@ struct rds_ib_mr { ...@@ -59,14 +68,16 @@ struct rds_ib_mr {
*/ */
struct rds_ib_mr_pool { struct rds_ib_mr_pool {
struct mutex flush_lock; /* serialize fmr invalidate */ struct mutex flush_lock; /* serialize fmr invalidate */
struct work_struct flush_worker; /* flush worker */ struct delayed_work flush_worker; /* flush worker */
spinlock_t list_lock; /* protect variables below */
atomic_t item_count; /* total # of MRs */ atomic_t item_count; /* total # of MRs */
atomic_t dirty_count; /* # dirty of MRs */ atomic_t dirty_count; /* # dirty of MRs */
struct list_head drop_list; /* MRs that have reached their max_maps limit */
struct list_head free_list; /* unused MRs */ struct xlist_head drop_list; /* MRs that have reached their max_maps limit */
struct list_head clean_list; /* unused & unmapped MRs */ struct xlist_head clean_list; /* global unused & unmapped MRs */
struct xlist_head clean_list; /* global unused & unamapped MRs */
wait_queue_head_t flush_wait;
atomic_t free_pinned; /* memory pinned by free MRs */ atomic_t free_pinned; /* memory pinned by free MRs */
unsigned long max_items; unsigned long max_items;
unsigned long max_items_soft; unsigned long max_items_soft;
...@@ -74,7 +85,7 @@ struct rds_ib_mr_pool { ...@@ -74,7 +85,7 @@ struct rds_ib_mr_pool {
struct ib_fmr_attr fmr_attr; struct ib_fmr_attr fmr_attr;
}; };
static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all); static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all, struct rds_ib_mr **);
static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr); static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr);
static void rds_ib_mr_pool_flush_worker(struct work_struct *work); static void rds_ib_mr_pool_flush_worker(struct work_struct *work);
...@@ -83,16 +94,17 @@ static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr) ...@@ -83,16 +94,17 @@ static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr)
struct rds_ib_device *rds_ibdev; struct rds_ib_device *rds_ibdev;
struct rds_ib_ipaddr *i_ipaddr; struct rds_ib_ipaddr *i_ipaddr;
list_for_each_entry(rds_ibdev, &rds_ib_devices, list) { rcu_read_lock();
spin_lock_irq(&rds_ibdev->spinlock); list_for_each_entry_rcu(rds_ibdev, &rds_ib_devices, list) {
list_for_each_entry(i_ipaddr, &rds_ibdev->ipaddr_list, list) { list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
if (i_ipaddr->ipaddr == ipaddr) { if (i_ipaddr->ipaddr == ipaddr) {
spin_unlock_irq(&rds_ibdev->spinlock); atomic_inc(&rds_ibdev->refcount);
rcu_read_unlock();
return rds_ibdev; return rds_ibdev;
} }
} }
spin_unlock_irq(&rds_ibdev->spinlock);
} }
rcu_read_unlock();
return NULL; return NULL;
} }
...@@ -108,7 +120,7 @@ static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) ...@@ -108,7 +120,7 @@ static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
i_ipaddr->ipaddr = ipaddr; i_ipaddr->ipaddr = ipaddr;
spin_lock_irq(&rds_ibdev->spinlock); spin_lock_irq(&rds_ibdev->spinlock);
list_add_tail(&i_ipaddr->list, &rds_ibdev->ipaddr_list); list_add_tail_rcu(&i_ipaddr->list, &rds_ibdev->ipaddr_list);
spin_unlock_irq(&rds_ibdev->spinlock); spin_unlock_irq(&rds_ibdev->spinlock);
return 0; return 0;
...@@ -116,17 +128,24 @@ static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) ...@@ -116,17 +128,24 @@ static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
{ {
struct rds_ib_ipaddr *i_ipaddr, *next; struct rds_ib_ipaddr *i_ipaddr;
struct rds_ib_ipaddr *to_free = NULL;
spin_lock_irq(&rds_ibdev->spinlock); spin_lock_irq(&rds_ibdev->spinlock);
list_for_each_entry_safe(i_ipaddr, next, &rds_ibdev->ipaddr_list, list) { list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
if (i_ipaddr->ipaddr == ipaddr) { if (i_ipaddr->ipaddr == ipaddr) {
list_del(&i_ipaddr->list); list_del_rcu(&i_ipaddr->list);
kfree(i_ipaddr); to_free = i_ipaddr;
break; break;
} }
} }
spin_unlock_irq(&rds_ibdev->spinlock); spin_unlock_irq(&rds_ibdev->spinlock);
if (to_free) {
synchronize_rcu();
kfree(to_free);
}
} }
int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
...@@ -134,8 +153,10 @@ int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) ...@@ -134,8 +153,10 @@ int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
struct rds_ib_device *rds_ibdev_old; struct rds_ib_device *rds_ibdev_old;
rds_ibdev_old = rds_ib_get_device(ipaddr); rds_ibdev_old = rds_ib_get_device(ipaddr);
if (rds_ibdev_old) if (rds_ibdev_old) {
rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr); rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr);
rds_ib_dev_put(rds_ibdev_old);
}
return rds_ib_add_ipaddr(rds_ibdev, ipaddr); return rds_ib_add_ipaddr(rds_ibdev, ipaddr);
} }
...@@ -156,6 +177,7 @@ void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *con ...@@ -156,6 +177,7 @@ void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *con
spin_unlock_irq(&ib_nodev_conns_lock); spin_unlock_irq(&ib_nodev_conns_lock);
ic->rds_ibdev = rds_ibdev; ic->rds_ibdev = rds_ibdev;
atomic_inc(&rds_ibdev->refcount);
} }
void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn) void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn)
...@@ -175,18 +197,18 @@ void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection * ...@@ -175,18 +197,18 @@ void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *
spin_unlock(&ib_nodev_conns_lock); spin_unlock(&ib_nodev_conns_lock);
ic->rds_ibdev = NULL; ic->rds_ibdev = NULL;
rds_ib_dev_put(rds_ibdev);
} }
void __rds_ib_destroy_conns(struct list_head *list, spinlock_t *list_lock) void rds_ib_destroy_nodev_conns(void)
{ {
struct rds_ib_connection *ic, *_ic; struct rds_ib_connection *ic, *_ic;
LIST_HEAD(tmp_list); LIST_HEAD(tmp_list);
/* avoid calling conn_destroy with irqs off */ /* avoid calling conn_destroy with irqs off */
spin_lock_irq(list_lock); spin_lock_irq(&ib_nodev_conns_lock);
list_splice(list, &tmp_list); list_splice(&ib_nodev_conns, &tmp_list);
INIT_LIST_HEAD(list); spin_unlock_irq(&ib_nodev_conns_lock);
spin_unlock_irq(list_lock);
list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node) list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node)
rds_conn_destroy(ic->conn); rds_conn_destroy(ic->conn);
...@@ -200,12 +222,12 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev) ...@@ -200,12 +222,12 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev)
if (!pool) if (!pool)
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&pool->free_list); INIT_XLIST_HEAD(&pool->free_list);
INIT_LIST_HEAD(&pool->drop_list); INIT_XLIST_HEAD(&pool->drop_list);
INIT_LIST_HEAD(&pool->clean_list); INIT_XLIST_HEAD(&pool->clean_list);
mutex_init(&pool->flush_lock); mutex_init(&pool->flush_lock);
spin_lock_init(&pool->list_lock); init_waitqueue_head(&pool->flush_wait);
INIT_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker); INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
pool->fmr_attr.max_pages = fmr_message_size; pool->fmr_attr.max_pages = fmr_message_size;
pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps; pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps;
...@@ -233,34 +255,60 @@ void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_co ...@@ -233,34 +255,60 @@ void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_co
void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool) void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool)
{ {
flush_workqueue(rds_wq); cancel_delayed_work_sync(&pool->flush_worker);
rds_ib_flush_mr_pool(pool, 1); rds_ib_flush_mr_pool(pool, 1, NULL);
WARN_ON(atomic_read(&pool->item_count)); WARN_ON(atomic_read(&pool->item_count));
WARN_ON(atomic_read(&pool->free_pinned)); WARN_ON(atomic_read(&pool->free_pinned));
kfree(pool); kfree(pool);
} }
static void refill_local(struct rds_ib_mr_pool *pool, struct xlist_head *xl,
struct rds_ib_mr **ibmr_ret)
{
struct xlist_head *ibmr_xl;
ibmr_xl = xlist_del_head_fast(xl);
*ibmr_ret = list_entry(ibmr_xl, struct rds_ib_mr, xlist);
}
static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool) static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool)
{ {
struct rds_ib_mr *ibmr = NULL; struct rds_ib_mr *ibmr = NULL;
unsigned long flags; struct xlist_head *ret;
unsigned long *flag;
spin_lock_irqsave(&pool->list_lock, flags); preempt_disable();
if (!list_empty(&pool->clean_list)) { flag = &__get_cpu_var(clean_list_grace);
ibmr = list_entry(pool->clean_list.next, struct rds_ib_mr, list); set_bit(CLEAN_LIST_BUSY_BIT, flag);
list_del_init(&ibmr->list); ret = xlist_del_head(&pool->clean_list);
} if (ret)
spin_unlock_irqrestore(&pool->list_lock, flags); ibmr = list_entry(ret, struct rds_ib_mr, xlist);
clear_bit(CLEAN_LIST_BUSY_BIT, flag);
preempt_enable();
return ibmr; return ibmr;
} }
static inline void wait_clean_list_grace(void)
{
int cpu;
unsigned long *flag;
for_each_online_cpu(cpu) {
flag = &per_cpu(clean_list_grace, cpu);
while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
cpu_relax();
}
}
static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev) static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
{ {
struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
struct rds_ib_mr *ibmr = NULL; struct rds_ib_mr *ibmr = NULL;
int err = 0, iter = 0; int err = 0, iter = 0;
if (atomic_read(&pool->dirty_count) >= pool->max_items / 10)
queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10);
while (1) { while (1) {
ibmr = rds_ib_reuse_fmr(pool); ibmr = rds_ib_reuse_fmr(pool);
if (ibmr) if (ibmr)
...@@ -287,19 +335,24 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev) ...@@ -287,19 +335,24 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
/* We do have some empty MRs. Flush them out. */ /* We do have some empty MRs. Flush them out. */
rds_ib_stats_inc(s_ib_rdma_mr_pool_wait); rds_ib_stats_inc(s_ib_rdma_mr_pool_wait);
rds_ib_flush_mr_pool(pool, 0); rds_ib_flush_mr_pool(pool, 0, &ibmr);
if (ibmr)
return ibmr;
} }
ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL); ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL, rdsibdev_to_node(rds_ibdev));
if (!ibmr) { if (!ibmr) {
err = -ENOMEM; err = -ENOMEM;
goto out_no_cigar; goto out_no_cigar;
} }
memset(ibmr, 0, sizeof(*ibmr));
ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd, ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd,
(IB_ACCESS_LOCAL_WRITE | (IB_ACCESS_LOCAL_WRITE |
IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_READ |
IB_ACCESS_REMOTE_WRITE), IB_ACCESS_REMOTE_WRITE|
IB_ACCESS_REMOTE_ATOMIC),
&pool->fmr_attr); &pool->fmr_attr);
if (IS_ERR(ibmr->fmr)) { if (IS_ERR(ibmr->fmr)) {
err = PTR_ERR(ibmr->fmr); err = PTR_ERR(ibmr->fmr);
...@@ -367,7 +420,8 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibm ...@@ -367,7 +420,8 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibm
if (page_cnt > fmr_message_size) if (page_cnt > fmr_message_size)
return -EINVAL; return -EINVAL;
dma_pages = kmalloc(sizeof(u64) * page_cnt, GFP_ATOMIC); dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC,
rdsibdev_to_node(rds_ibdev));
if (!dma_pages) if (!dma_pages)
return -ENOMEM; return -ENOMEM;
...@@ -441,7 +495,7 @@ static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr) ...@@ -441,7 +495,7 @@ static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
/* FIXME we need a way to tell a r/w MR /* FIXME we need a way to tell a r/w MR
* from a r/o MR */ * from a r/o MR */
BUG_ON(in_interrupt()); BUG_ON(irqs_disabled());
set_page_dirty(page); set_page_dirty(page);
put_page(page); put_page(page);
} }
...@@ -476,34 +530,110 @@ static inline unsigned int rds_ib_flush_goal(struct rds_ib_mr_pool *pool, int fr ...@@ -476,34 +530,110 @@ static inline unsigned int rds_ib_flush_goal(struct rds_ib_mr_pool *pool, int fr
return 0; return 0;
} }
/*
* given an xlist of mrs, put them all into the list_head for more processing
*/
static void xlist_append_to_list(struct xlist_head *xlist, struct list_head *list)
{
struct rds_ib_mr *ibmr;
struct xlist_head splice;
struct xlist_head *cur;
struct xlist_head *next;
splice.next = NULL;
xlist_splice(xlist, &splice);
cur = splice.next;
while (cur) {
next = cur->next;
ibmr = list_entry(cur, struct rds_ib_mr, xlist);
list_add_tail(&ibmr->unmap_list, list);
cur = next;
}
}
/*
* this takes a list head of mrs and turns it into an xlist of clusters.
* each cluster has an xlist of MR_CLUSTER_SIZE mrs that are ready for
* reuse.
*/
static void list_append_to_xlist(struct rds_ib_mr_pool *pool,
struct list_head *list, struct xlist_head *xlist,
struct xlist_head **tail_ret)
{
struct rds_ib_mr *ibmr;
struct xlist_head *cur_mr = xlist;
struct xlist_head *tail_mr = NULL;
list_for_each_entry(ibmr, list, unmap_list) {
tail_mr = &ibmr->xlist;
tail_mr->next = NULL;
cur_mr->next = tail_mr;
cur_mr = tail_mr;
}
*tail_ret = tail_mr;
}
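xlist_append_to_list() and list_append_to_xlist() above, together with xlist_add()/xlist_del_head() used elsewhere in this file, come from xlist.h, which is not part of this view. A rough sketch of the assumed lock-free, singly linked push (the names match the calls in this diff; the bodies are an approximation):

struct xlist_head {
	struct xlist_head *next;
};

static inline void INIT_XLIST_HEAD(struct xlist_head *list)
{
	list->next = NULL;
}

static inline int xlist_empty(struct xlist_head *head)
{
	return head->next == NULL;
}

/* Atomically push a pre-linked chain [new_head .. new_tail] onto head. */
static inline void xlist_add(struct xlist_head *new_head,
			     struct xlist_head *new_tail,
			     struct xlist_head *head)
{
	struct xlist_head *cur, *check;

	while (1) {
		cur = head->next;
		new_tail->next = cur;
		check = cmpxchg(&head->next, cur, new_head);
		if (check == cur)
			break;
	}
}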
/* /*
* Flush our pool of MRs. * Flush our pool of MRs.
* At a minimum, all currently unused MRs are unmapped. * At a minimum, all currently unused MRs are unmapped.
* If the number of MRs allocated exceeds the limit, we also try * If the number of MRs allocated exceeds the limit, we also try
* to free as many MRs as needed to get back to this limit. * to free as many MRs as needed to get back to this limit.
*/ */
static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all) static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
int free_all, struct rds_ib_mr **ibmr_ret)
{ {
struct rds_ib_mr *ibmr, *next; struct rds_ib_mr *ibmr, *next;
struct xlist_head clean_xlist;
struct xlist_head *clean_tail;
LIST_HEAD(unmap_list); LIST_HEAD(unmap_list);
LIST_HEAD(fmr_list); LIST_HEAD(fmr_list);
unsigned long unpinned = 0; unsigned long unpinned = 0;
unsigned long flags;
unsigned int nfreed = 0, ncleaned = 0, free_goal; unsigned int nfreed = 0, ncleaned = 0, free_goal;
int ret = 0; int ret = 0;
rds_ib_stats_inc(s_ib_rdma_mr_pool_flush); rds_ib_stats_inc(s_ib_rdma_mr_pool_flush);
mutex_lock(&pool->flush_lock); if (ibmr_ret) {
DEFINE_WAIT(wait);
while(!mutex_trylock(&pool->flush_lock)) {
ibmr = rds_ib_reuse_fmr(pool);
if (ibmr) {
*ibmr_ret = ibmr;
finish_wait(&pool->flush_wait, &wait);
goto out_nolock;
}
prepare_to_wait(&pool->flush_wait, &wait,
TASK_UNINTERRUPTIBLE);
if (xlist_empty(&pool->clean_list))
schedule();
ibmr = rds_ib_reuse_fmr(pool);
if (ibmr) {
*ibmr_ret = ibmr;
finish_wait(&pool->flush_wait, &wait);
goto out_nolock;
}
}
finish_wait(&pool->flush_wait, &wait);
} else
mutex_lock(&pool->flush_lock);
if (ibmr_ret) {
ibmr = rds_ib_reuse_fmr(pool);
if (ibmr) {
*ibmr_ret = ibmr;
goto out;
}
}
spin_lock_irqsave(&pool->list_lock, flags);
/* Get the list of all MRs to be dropped. Ordering matters - /* Get the list of all MRs to be dropped. Ordering matters -
* we want to put drop_list ahead of free_list. */ * we want to put drop_list ahead of free_list.
list_splice_init(&pool->free_list, &unmap_list); */
list_splice_init(&pool->drop_list, &unmap_list); xlist_append_to_list(&pool->drop_list, &unmap_list);
xlist_append_to_list(&pool->free_list, &unmap_list);
if (free_all) if (free_all)
list_splice_init(&pool->clean_list, &unmap_list); xlist_append_to_list(&pool->clean_list, &unmap_list);
spin_unlock_irqrestore(&pool->list_lock, flags);
free_goal = rds_ib_flush_goal(pool, free_all); free_goal = rds_ib_flush_goal(pool, free_all);
...@@ -511,19 +641,20 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all) ...@@ -511,19 +641,20 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all)
goto out; goto out;
/* String all ib_mr's onto one list and hand them to ib_unmap_fmr */ /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */
list_for_each_entry(ibmr, &unmap_list, list) list_for_each_entry(ibmr, &unmap_list, unmap_list)
list_add(&ibmr->fmr->list, &fmr_list); list_add(&ibmr->fmr->list, &fmr_list);
ret = ib_unmap_fmr(&fmr_list); ret = ib_unmap_fmr(&fmr_list);
if (ret) if (ret)
printk(KERN_WARNING "RDS/IB: ib_unmap_fmr failed (err=%d)\n", ret); printk(KERN_WARNING "RDS/IB: ib_unmap_fmr failed (err=%d)\n", ret);
/* Now we can destroy the DMA mapping and unpin any pages */ /* Now we can destroy the DMA mapping and unpin any pages */
list_for_each_entry_safe(ibmr, next, &unmap_list, list) { list_for_each_entry_safe(ibmr, next, &unmap_list, unmap_list) {
unpinned += ibmr->sg_len; unpinned += ibmr->sg_len;
__rds_ib_teardown_mr(ibmr); __rds_ib_teardown_mr(ibmr);
if (nfreed < free_goal || ibmr->remap_count >= pool->fmr_attr.max_maps) { if (nfreed < free_goal || ibmr->remap_count >= pool->fmr_attr.max_maps) {
rds_ib_stats_inc(s_ib_rdma_mr_free); rds_ib_stats_inc(s_ib_rdma_mr_free);
list_del(&ibmr->list); list_del(&ibmr->unmap_list);
ib_dealloc_fmr(ibmr->fmr); ib_dealloc_fmr(ibmr->fmr);
kfree(ibmr); kfree(ibmr);
nfreed++; nfreed++;
...@@ -531,9 +662,27 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all) ...@@ -531,9 +662,27 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all)
ncleaned++; ncleaned++;
} }
spin_lock_irqsave(&pool->list_lock, flags); if (!list_empty(&unmap_list)) {
list_splice(&unmap_list, &pool->clean_list); /* we have to make sure that none of the things we're about
spin_unlock_irqrestore(&pool->list_lock, flags); * to put on the clean list would race with other cpus trying
* to pull items off. The xlist would explode if we managed to
* remove something from the clean list and then add it back again
* while another CPU was spinning on that same item in xlist_del_head.
*
* This is pretty unlikely, but just in case, wait for an xlist grace period
* here before adding anything back into the clean list.
*/
wait_clean_list_grace();
list_append_to_xlist(pool, &unmap_list, &clean_xlist, &clean_tail);
if (ibmr_ret)
refill_local(pool, &clean_xlist, ibmr_ret);
/* refill_local may have emptied our list */
if (!xlist_empty(&clean_xlist))
xlist_add(clean_xlist.next, clean_tail, &pool->clean_list);
}
atomic_sub(unpinned, &pool->free_pinned); atomic_sub(unpinned, &pool->free_pinned);
atomic_sub(ncleaned, &pool->dirty_count); atomic_sub(ncleaned, &pool->dirty_count);
...@@ -541,14 +690,35 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all) ...@@ -541,14 +690,35 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all)
out: out:
mutex_unlock(&pool->flush_lock); mutex_unlock(&pool->flush_lock);
if (waitqueue_active(&pool->flush_wait))
wake_up(&pool->flush_wait);
out_nolock:
return ret; return ret;
} }
int rds_ib_fmr_init(void)
{
rds_ib_fmr_wq = create_workqueue("rds_fmr_flushd");
if (!rds_ib_fmr_wq)
return -ENOMEM;
return 0;
}
/*
* By the time this is called all the IB devices should have been torn down and
* had their pools freed. As each pool is freed its work struct is waited on,
* so the pool flushing work queue should be idle by the time we get here.
*/
void rds_ib_fmr_exit(void)
{
destroy_workqueue(rds_ib_fmr_wq);
}
static void rds_ib_mr_pool_flush_worker(struct work_struct *work) static void rds_ib_mr_pool_flush_worker(struct work_struct *work)
{ {
struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker); struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker.work);
rds_ib_flush_mr_pool(pool, 0); rds_ib_flush_mr_pool(pool, 0, NULL);
} }
void rds_ib_free_mr(void *trans_private, int invalidate) void rds_ib_free_mr(void *trans_private, int invalidate)
...@@ -556,47 +726,49 @@ void rds_ib_free_mr(void *trans_private, int invalidate) ...@@ -556,47 +726,49 @@ void rds_ib_free_mr(void *trans_private, int invalidate)
struct rds_ib_mr *ibmr = trans_private; struct rds_ib_mr *ibmr = trans_private;
struct rds_ib_device *rds_ibdev = ibmr->device; struct rds_ib_device *rds_ibdev = ibmr->device;
struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
unsigned long flags;
rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len); rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len);
/* Return it to the pool's free list */ /* Return it to the pool's free list */
spin_lock_irqsave(&pool->list_lock, flags);
if (ibmr->remap_count >= pool->fmr_attr.max_maps) if (ibmr->remap_count >= pool->fmr_attr.max_maps)
list_add(&ibmr->list, &pool->drop_list); xlist_add(&ibmr->xlist, &ibmr->xlist, &pool->drop_list);
else else
list_add(&ibmr->list, &pool->free_list); xlist_add(&ibmr->xlist, &ibmr->xlist, &pool->free_list);
atomic_add(ibmr->sg_len, &pool->free_pinned); atomic_add(ibmr->sg_len, &pool->free_pinned);
atomic_inc(&pool->dirty_count); atomic_inc(&pool->dirty_count);
spin_unlock_irqrestore(&pool->list_lock, flags);
/* If we've pinned too many pages, request a flush */ /* If we've pinned too many pages, request a flush */
if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned || if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned ||
atomic_read(&pool->dirty_count) >= pool->max_items / 10) atomic_read(&pool->dirty_count) >= pool->max_items / 10)
queue_work(rds_wq, &pool->flush_worker); queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10);
if (invalidate) { if (invalidate) {
if (likely(!in_interrupt())) { if (likely(!in_interrupt())) {
rds_ib_flush_mr_pool(pool, 0); rds_ib_flush_mr_pool(pool, 0, NULL);
} else { } else {
/* We get here if the user created a MR marked /* We get here if the user created a MR marked
* as use_once and invalidate at the same time. */ * as use_once and invalidate at the same time. */
queue_work(rds_wq, &pool->flush_worker); queue_delayed_work(rds_ib_fmr_wq,
&pool->flush_worker, 10);
} }
} }
rds_ib_dev_put(rds_ibdev);
} }
void rds_ib_flush_mrs(void) void rds_ib_flush_mrs(void)
{ {
struct rds_ib_device *rds_ibdev; struct rds_ib_device *rds_ibdev;
down_read(&rds_ib_devices_lock);
list_for_each_entry(rds_ibdev, &rds_ib_devices, list) { list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
if (pool) if (pool)
rds_ib_flush_mr_pool(pool, 0); rds_ib_flush_mr_pool(pool, 0, NULL);
} }
up_read(&rds_ib_devices_lock);
} }
void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
...@@ -628,6 +800,7 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, ...@@ -628,6 +800,7 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret); printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret);
ibmr->device = rds_ibdev; ibmr->device = rds_ibdev;
rds_ibdev = NULL;
out: out:
if (ret) { if (ret) {
...@@ -635,5 +808,8 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, ...@@ -635,5 +808,8 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
rds_ib_free_mr(ibmr, 0); rds_ib_free_mr(ibmr, 0);
ibmr = ERR_PTR(ret); ibmr = ERR_PTR(ret);
} }
if (rds_ibdev)
rds_ib_dev_put(rds_ibdev);
return ibmr; return ibmr;
} }
...@@ -43,42 +43,6 @@ static struct kmem_cache *rds_ib_incoming_slab; ...@@ -43,42 +43,6 @@ static struct kmem_cache *rds_ib_incoming_slab;
static struct kmem_cache *rds_ib_frag_slab; static struct kmem_cache *rds_ib_frag_slab;
static atomic_t rds_ib_allocation = ATOMIC_INIT(0); static atomic_t rds_ib_allocation = ATOMIC_INIT(0);
static void rds_ib_frag_drop_page(struct rds_page_frag *frag)
{
rdsdebug("frag %p page %p\n", frag, frag->f_page);
__free_page(frag->f_page);
frag->f_page = NULL;
}
static void rds_ib_frag_free(struct rds_page_frag *frag)
{
rdsdebug("frag %p page %p\n", frag, frag->f_page);
BUG_ON(frag->f_page != NULL);
kmem_cache_free(rds_ib_frag_slab, frag);
}
/*
* We map a page at a time. Its fragments are posted in order. This
* is called in fragment order as the fragments get send completion events.
* Only the last frag in the page performs the unmapping.
*
* It's OK for ring cleanup to call this in whatever order it likes because
* DMA is not in flight and so we can unmap while other ring entries still
* hold page references in their frags.
*/
static void rds_ib_recv_unmap_page(struct rds_ib_connection *ic,
struct rds_ib_recv_work *recv)
{
struct rds_page_frag *frag = recv->r_frag;
rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page);
if (frag->f_mapped)
ib_dma_unmap_page(ic->i_cm_id->device,
frag->f_mapped,
RDS_FRAG_SIZE, DMA_FROM_DEVICE);
frag->f_mapped = 0;
}
void rds_ib_recv_init_ring(struct rds_ib_connection *ic) void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
{ {
struct rds_ib_recv_work *recv; struct rds_ib_recv_work *recv;
...@@ -95,16 +59,161 @@ void rds_ib_recv_init_ring(struct rds_ib_connection *ic) ...@@ -95,16 +59,161 @@ void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
recv->r_wr.sg_list = recv->r_sge; recv->r_wr.sg_list = recv->r_sge;
recv->r_wr.num_sge = RDS_IB_RECV_SGE; recv->r_wr.num_sge = RDS_IB_RECV_SGE;
sge = rds_ib_data_sge(ic, recv->r_sge); sge = &recv->r_sge[0];
sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
sge->length = sizeof(struct rds_header);
sge->lkey = ic->i_mr->lkey;
sge = &recv->r_sge[1];
sge->addr = 0; sge->addr = 0;
sge->length = RDS_FRAG_SIZE; sge->length = RDS_FRAG_SIZE;
sge->lkey = ic->i_mr->lkey; sge->lkey = ic->i_mr->lkey;
}
}
sge = rds_ib_header_sge(ic, recv->r_sge); /*
sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header)); * The entire 'from' list, including the from element itself, is put on
sge->length = sizeof(struct rds_header); * to the tail of the 'to' list.
sge->lkey = ic->i_mr->lkey; */
static void list_splice_entire_tail(struct list_head *from,
struct list_head *to)
{
struct list_head *from_last = from->prev;
list_splice_tail(from_last, to);
list_add_tail(from_last, to);
}
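list_splice_entire_tail() has unusual semantics: 'from' is not a separate list head but an element of the chain being moved, and it is moved along with the rest. A hedged usage sketch in the spirit of the cache code below (the chain handle here is hypothetical):

/* Illustrative only: move a whole anchorless chain, whose handle is one of
 * its own elements, onto a normal list_head. */
static void drain_chain_example(struct list_head *chain_element,
				struct list_head *caller_list)
{
	if (chain_element)
		list_splice_entire_tail(chain_element, caller_list);
	/* caller_list now holds every element of the chain, including the
	 * one chain_element pointed at. */
}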
static void rds_ib_cache_xfer_to_ready(struct rds_ib_refill_cache *cache)
{
struct list_head *tmp;
tmp = xchg(&cache->xfer, NULL);
if (tmp) {
if (cache->ready)
list_splice_entire_tail(tmp, cache->ready);
else
cache->ready = tmp;
}
}
static int rds_ib_recv_alloc_cache(struct rds_ib_refill_cache *cache)
{
struct rds_ib_cache_head *head;
int cpu;
cache->percpu = alloc_percpu(struct rds_ib_cache_head);
if (!cache->percpu)
return -ENOMEM;
for_each_possible_cpu(cpu) {
head = per_cpu_ptr(cache->percpu, cpu);
head->first = NULL;
head->count = 0;
}
cache->xfer = NULL;
cache->ready = NULL;
return 0;
}
int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic)
{
int ret;
ret = rds_ib_recv_alloc_cache(&ic->i_cache_incs);
if (!ret) {
ret = rds_ib_recv_alloc_cache(&ic->i_cache_frags);
if (ret)
free_percpu(ic->i_cache_incs.percpu);
} }
return ret;
}
static void rds_ib_cache_splice_all_lists(struct rds_ib_refill_cache *cache,
struct list_head *caller_list)
{
struct rds_ib_cache_head *head;
int cpu;
for_each_possible_cpu(cpu) {
head = per_cpu_ptr(cache->percpu, cpu);
if (head->first) {
list_splice_entire_tail(head->first, caller_list);
head->first = NULL;
}
}
if (cache->ready) {
list_splice_entire_tail(cache->ready, caller_list);
cache->ready = NULL;
}
}
void rds_ib_recv_free_caches(struct rds_ib_connection *ic)
{
struct rds_ib_incoming *inc;
struct rds_ib_incoming *inc_tmp;
struct rds_page_frag *frag;
struct rds_page_frag *frag_tmp;
LIST_HEAD(list);
rds_ib_cache_xfer_to_ready(&ic->i_cache_incs);
rds_ib_cache_splice_all_lists(&ic->i_cache_incs, &list);
free_percpu(ic->i_cache_incs.percpu);
list_for_each_entry_safe(inc, inc_tmp, &list, ii_cache_entry) {
list_del(&inc->ii_cache_entry);
WARN_ON(!list_empty(&inc->ii_frags));
kmem_cache_free(rds_ib_incoming_slab, inc);
}
rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);
rds_ib_cache_splice_all_lists(&ic->i_cache_frags, &list);
free_percpu(ic->i_cache_frags.percpu);
list_for_each_entry_safe(frag, frag_tmp, &list, f_cache_entry) {
list_del(&frag->f_cache_entry);
WARN_ON(!list_empty(&frag->f_item));
kmem_cache_free(rds_ib_frag_slab, frag);
}
}
/* fwd decl */
static void rds_ib_recv_cache_put(struct list_head *new_item,
struct rds_ib_refill_cache *cache);
static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache);
/* Recycle frag and attached recv buffer f_sg */
static void rds_ib_frag_free(struct rds_ib_connection *ic,
struct rds_page_frag *frag)
{
rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg));
rds_ib_recv_cache_put(&frag->f_cache_entry, &ic->i_cache_frags);
}
/* Recycle inc after freeing attached frags */
void rds_ib_inc_free(struct rds_incoming *inc)
{
struct rds_ib_incoming *ibinc;
struct rds_page_frag *frag;
struct rds_page_frag *pos;
struct rds_ib_connection *ic = inc->i_conn->c_transport_data;
ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
/* Free attached frags */
list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) {
list_del_init(&frag->f_item);
rds_ib_frag_free(ic, frag);
}
BUG_ON(!list_empty(&ibinc->ii_frags));
rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc);
rds_ib_recv_cache_put(&ibinc->ii_cache_entry, &ic->i_cache_incs);
} }
static void rds_ib_recv_clear_one(struct rds_ib_connection *ic, static void rds_ib_recv_clear_one(struct rds_ib_connection *ic,
...@@ -115,10 +224,8 @@ static void rds_ib_recv_clear_one(struct rds_ib_connection *ic, ...@@ -115,10 +224,8 @@ static void rds_ib_recv_clear_one(struct rds_ib_connection *ic,
recv->r_ibinc = NULL; recv->r_ibinc = NULL;
} }
if (recv->r_frag) { if (recv->r_frag) {
rds_ib_recv_unmap_page(ic, recv); ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE);
if (recv->r_frag->f_page) rds_ib_frag_free(ic, recv->r_frag);
rds_ib_frag_drop_page(recv->r_frag);
rds_ib_frag_free(recv->r_frag);
recv->r_frag = NULL; recv->r_frag = NULL;
} }
} }
...@@ -129,84 +236,111 @@ void rds_ib_recv_clear_ring(struct rds_ib_connection *ic) ...@@ -129,84 +236,111 @@ void rds_ib_recv_clear_ring(struct rds_ib_connection *ic)
for (i = 0; i < ic->i_recv_ring.w_nr; i++) for (i = 0; i < ic->i_recv_ring.w_nr; i++)
rds_ib_recv_clear_one(ic, &ic->i_recvs[i]); rds_ib_recv_clear_one(ic, &ic->i_recvs[i]);
if (ic->i_frag.f_page)
rds_ib_frag_drop_page(&ic->i_frag);
} }
static int rds_ib_recv_refill_one(struct rds_connection *conn, static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *ic,
struct rds_ib_recv_work *recv, gfp_t slab_mask)
gfp_t kptr_gfp, gfp_t page_gfp)
{ {
struct rds_ib_connection *ic = conn->c_transport_data; struct rds_ib_incoming *ibinc;
dma_addr_t dma_addr; struct list_head *cache_item;
struct ib_sge *sge; int avail_allocs;
int ret = -ENOMEM;
if (recv->r_ibinc == NULL) { cache_item = rds_ib_recv_cache_get(&ic->i_cache_incs);
if (!atomic_add_unless(&rds_ib_allocation, 1, rds_ib_sysctl_max_recv_allocation)) { if (cache_item) {
ibinc = container_of(cache_item, struct rds_ib_incoming, ii_cache_entry);
} else {
avail_allocs = atomic_add_unless(&rds_ib_allocation,
1, rds_ib_sysctl_max_recv_allocation);
if (!avail_allocs) {
rds_ib_stats_inc(s_ib_rx_alloc_limit); rds_ib_stats_inc(s_ib_rx_alloc_limit);
goto out; return NULL;
} }
recv->r_ibinc = kmem_cache_alloc(rds_ib_incoming_slab, ibinc = kmem_cache_alloc(rds_ib_incoming_slab, slab_mask);
kptr_gfp); if (!ibinc) {
if (recv->r_ibinc == NULL) {
atomic_dec(&rds_ib_allocation); atomic_dec(&rds_ib_allocation);
goto out; return NULL;
} }
INIT_LIST_HEAD(&recv->r_ibinc->ii_frags);
rds_inc_init(&recv->r_ibinc->ii_inc, conn, conn->c_faddr);
} }
INIT_LIST_HEAD(&ibinc->ii_frags);
rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr);
if (recv->r_frag == NULL) { return ibinc;
recv->r_frag = kmem_cache_alloc(rds_ib_frag_slab, kptr_gfp); }
if (recv->r_frag == NULL)
goto out; static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic,
INIT_LIST_HEAD(&recv->r_frag->f_item); gfp_t slab_mask, gfp_t page_mask)
recv->r_frag->f_page = NULL; {
struct rds_page_frag *frag;
struct list_head *cache_item;
int ret;
cache_item = rds_ib_recv_cache_get(&ic->i_cache_frags);
if (cache_item) {
frag = container_of(cache_item, struct rds_page_frag, f_cache_entry);
} else {
frag = kmem_cache_alloc(rds_ib_frag_slab, slab_mask);
if (!frag)
return NULL;
sg_init_table(&frag->f_sg, 1);
ret = rds_page_remainder_alloc(&frag->f_sg,
RDS_FRAG_SIZE, page_mask);
if (ret) {
kmem_cache_free(rds_ib_frag_slab, frag);
return NULL;
}
} }
if (ic->i_frag.f_page == NULL) { INIT_LIST_HEAD(&frag->f_item);
ic->i_frag.f_page = alloc_page(page_gfp);
if (ic->i_frag.f_page == NULL) return frag;
goto out; }
ic->i_frag.f_offset = 0;
static int rds_ib_recv_refill_one(struct rds_connection *conn,
struct rds_ib_recv_work *recv, int prefill)
{
struct rds_ib_connection *ic = conn->c_transport_data;
struct ib_sge *sge;
int ret = -ENOMEM;
gfp_t slab_mask = GFP_NOWAIT;
gfp_t page_mask = GFP_NOWAIT;
if (prefill) {
slab_mask = GFP_KERNEL;
page_mask = GFP_HIGHUSER;
} }
dma_addr = ib_dma_map_page(ic->i_cm_id->device, if (!ic->i_cache_incs.ready)
ic->i_frag.f_page, rds_ib_cache_xfer_to_ready(&ic->i_cache_incs);
ic->i_frag.f_offset, if (!ic->i_cache_frags.ready)
RDS_FRAG_SIZE, rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);
DMA_FROM_DEVICE);
if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr))
goto out;
/* /*
* Once we get the RDS_PAGE_LAST_OFF frag then rds_ib_frag_unmap() * ibinc was taken from recv if recv contained the start of a message.
* must be called on this recv. This happens as completions hit * recvs that were continuations will still have this allocated.
* in order or on connection shutdown.
*/ */
recv->r_frag->f_page = ic->i_frag.f_page; if (!recv->r_ibinc) {
recv->r_frag->f_offset = ic->i_frag.f_offset; recv->r_ibinc = rds_ib_refill_one_inc(ic, slab_mask);
recv->r_frag->f_mapped = dma_addr; if (!recv->r_ibinc)
goto out;
}
sge = rds_ib_data_sge(ic, recv->r_sge); WARN_ON(recv->r_frag); /* leak! */
sge->addr = dma_addr; recv->r_frag = rds_ib_refill_one_frag(ic, slab_mask, page_mask);
sge->length = RDS_FRAG_SIZE; if (!recv->r_frag)
goto out;
ret = ib_dma_map_sg(ic->i_cm_id->device, &recv->r_frag->f_sg,
1, DMA_FROM_DEVICE);
WARN_ON(ret != 1);
sge = rds_ib_header_sge(ic, recv->r_sge); sge = &recv->r_sge[0];
sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header); sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header);
sge->length = sizeof(struct rds_header); sge->length = sizeof(struct rds_header);
get_page(recv->r_frag->f_page); sge = &recv->r_sge[1];
sge->addr = sg_dma_address(&recv->r_frag->f_sg);
if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) { sge->length = sg_dma_len(&recv->r_frag->f_sg);
ic->i_frag.f_offset += RDS_FRAG_SIZE;
} else {
put_page(ic->i_frag.f_page);
ic->i_frag.f_page = NULL;
ic->i_frag.f_offset = 0;
}
ret = 0; ret = 0;
out: out:
...@@ -216,13 +350,11 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn, ...@@ -216,13 +350,11 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn,
/* /*
* This tries to allocate and post unused work requests after making sure that * This tries to allocate and post unused work requests after making sure that
* they have all the allocations they need to queue received fragments into * they have all the allocations they need to queue received fragments into
* sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc * sockets.
* pairs don't go unmatched.
* *
* -1 is returned if posting fails due to temporary resource exhaustion. * -1 is returned if posting fails due to temporary resource exhaustion.
*/ */
int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, void rds_ib_recv_refill(struct rds_connection *conn, int prefill)
gfp_t page_gfp, int prefill)
{ {
struct rds_ib_connection *ic = conn->c_transport_data; struct rds_ib_connection *ic = conn->c_transport_data;
struct rds_ib_recv_work *recv; struct rds_ib_recv_work *recv;
...@@ -236,28 +368,25 @@ int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, ...@@ -236,28 +368,25 @@ int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
if (pos >= ic->i_recv_ring.w_nr) { if (pos >= ic->i_recv_ring.w_nr) {
printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n", printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n",
pos); pos);
ret = -EINVAL;
break; break;
} }
recv = &ic->i_recvs[pos]; recv = &ic->i_recvs[pos];
ret = rds_ib_recv_refill_one(conn, recv, kptr_gfp, page_gfp); ret = rds_ib_recv_refill_one(conn, recv, prefill);
if (ret) { if (ret) {
ret = -1;
break; break;
} }
/* XXX when can this fail? */ /* XXX when can this fail? */
ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr); ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr);
rdsdebug("recv %p ibinc %p page %p addr %lu ret %d\n", recv, rdsdebug("recv %p ibinc %p page %p addr %lu ret %d\n", recv,
recv->r_ibinc, recv->r_frag->f_page, recv->r_ibinc, sg_page(&recv->r_frag->f_sg),
(long) recv->r_frag->f_mapped, ret); (long) sg_dma_address(&recv->r_frag->f_sg), ret);
if (ret) { if (ret) {
rds_ib_conn_error(conn, "recv post on " rds_ib_conn_error(conn, "recv post on "
"%pI4 returned %d, disconnecting and " "%pI4 returned %d, disconnecting and "
"reconnecting\n", &conn->c_faddr, "reconnecting\n", &conn->c_faddr,
ret); ret);
ret = -1;
break; break;
} }
...@@ -270,37 +399,73 @@ int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, ...@@ -270,37 +399,73 @@ int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
if (ret) if (ret)
rds_ib_ring_unalloc(&ic->i_recv_ring, 1); rds_ib_ring_unalloc(&ic->i_recv_ring, 1);
return ret;
} }
void rds_ib_inc_purge(struct rds_incoming *inc) /*
* We want to recycle several types of recv allocations, like incs and frags.
* To use this, the *_free() function passes in the ptr to a list_head within
* the recyclee, as well as the cache to put it on.
*
* First, we put the memory on a percpu list. When this reaches a certain size,
* we move it to an intermediate non-percpu list in a lockless manner, with some
* xchg/cmpxchg wizardry.
*
* N.B. Instead of a list_head as the anchor, we use a single pointer, which can
* be NULL and xchg'd. The list is actually empty when the pointer is NULL, and
* list_empty() will return true even when one element is actually present.
*/
static void rds_ib_recv_cache_put(struct list_head *new_item,
struct rds_ib_refill_cache *cache)
{ {
struct rds_ib_incoming *ibinc; unsigned long flags;
struct rds_page_frag *frag; struct rds_ib_cache_head *chp;
struct rds_page_frag *pos; struct list_head *old;
ibinc = container_of(inc, struct rds_ib_incoming, ii_inc); local_irq_save(flags);
rdsdebug("purging ibinc %p inc %p\n", ibinc, inc);
list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) { chp = per_cpu_ptr(cache->percpu, smp_processor_id());
list_del_init(&frag->f_item); if (!chp->first)
rds_ib_frag_drop_page(frag); INIT_LIST_HEAD(new_item);
rds_ib_frag_free(frag); else /* put on front */
} list_add_tail(new_item, chp->first);
chp->first = new_item;
chp->count++;
if (chp->count < RDS_IB_RECYCLE_BATCH_COUNT)
goto end;
/*
* Return our per-cpu first list to the cache's xfer by atomically
* grabbing the current xfer list, appending it to our per-cpu list,
* and then atomically returning that entire list back to the
* cache's xfer list as long as it's still empty.
*/
do {
old = xchg(&cache->xfer, NULL);
if (old)
list_splice_entire_tail(old, chp->first);
old = cmpxchg(&cache->xfer, NULL, chp->first);
} while (old);
chp->first = NULL;
chp->count = 0;
end:
local_irq_restore(flags);
} }
void rds_ib_inc_free(struct rds_incoming *inc) static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache)
{ {
struct rds_ib_incoming *ibinc; struct list_head *head = cache->ready;
ibinc = container_of(inc, struct rds_ib_incoming, ii_inc); if (head) {
if (!list_empty(head)) {
cache->ready = head->next;
list_del_init(head);
} else
cache->ready = NULL;
}
rds_ib_inc_purge(inc); return head;
rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc);
BUG_ON(!list_empty(&ibinc->ii_frags));
kmem_cache_free(rds_ib_incoming_slab, ibinc);
atomic_dec(&rds_ib_allocation);
BUG_ON(atomic_read(&rds_ib_allocation) < 0);
} }
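Note the asymmetry between the two ends of the cache: rds_ib_recv_cache_put() is careful to be safe from any CPU with interrupts disabled, while rds_ib_recv_cache_get() touches cache->ready without any locking, so it presumably relies on a single consumer (the refill path) at a time. A small sketch of the consuming side under that assumption, mirroring rds_ib_refill_one_frag() above:

/* Sketch of the consumer pattern; safe only from the single refill path. */
static struct rds_page_frag *frag_cache_get_example(struct rds_ib_connection *ic)
{
	struct list_head *item;

	/* Pull any batches the producers have published ... */
	if (!ic->i_cache_frags.ready)
		rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);

	/* ... then pop one entry off the ready chain, if any. */
	item = rds_ib_recv_cache_get(&ic->i_cache_frags);
	if (!item)
		return NULL;
	return container_of(item, struct rds_page_frag, f_cache_entry);
}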
int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov, int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
...@@ -336,13 +501,13 @@ int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov, ...@@ -336,13 +501,13 @@ int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
to_copy = min_t(unsigned long, to_copy, len - copied); to_copy = min_t(unsigned long, to_copy, len - copied);
rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag " rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag "
"[%p, %lu] + %lu\n", "[%p, %u] + %lu\n",
to_copy, iov->iov_base, iov->iov_len, iov_off, to_copy, iov->iov_base, iov->iov_len, iov_off,
frag->f_page, frag->f_offset, frag_off); sg_page(&frag->f_sg), frag->f_sg.offset, frag_off);
/* XXX needs + offset for multiple recvs per page */ /* XXX needs + offset for multiple recvs per page */
ret = rds_page_copy_to_user(frag->f_page, ret = rds_page_copy_to_user(sg_page(&frag->f_sg),
frag->f_offset + frag_off, frag->f_sg.offset + frag_off,
iov->iov_base + iov_off, iov->iov_base + iov_off,
to_copy); to_copy);
if (ret) { if (ret) {
...@@ -557,47 +722,6 @@ u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic) ...@@ -557,47 +722,6 @@ u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic)
return rds_ib_get_ack(ic); return rds_ib_get_ack(ic);
} }
static struct rds_header *rds_ib_get_header(struct rds_connection *conn,
struct rds_ib_recv_work *recv,
u32 data_len)
{
struct rds_ib_connection *ic = conn->c_transport_data;
void *hdr_buff = &ic->i_recv_hdrs[recv - ic->i_recvs];
void *addr;
u32 misplaced_hdr_bytes;
/*
* Support header at the front (RDS 3.1+) as well as header-at-end.
*
* Cases:
* 1) header all in header buff (great!)
* 2) header all in data page (copy all to header buff)
* 3) header split across hdr buf + data page
* (move bit in hdr buff to end before copying other bit from data page)
*/
if (conn->c_version > RDS_PROTOCOL_3_0 || data_len == RDS_FRAG_SIZE)
return hdr_buff;
if (data_len <= (RDS_FRAG_SIZE - sizeof(struct rds_header))) {
addr = kmap_atomic(recv->r_frag->f_page, KM_SOFTIRQ0);
memcpy(hdr_buff,
addr + recv->r_frag->f_offset + data_len,
sizeof(struct rds_header));
kunmap_atomic(addr, KM_SOFTIRQ0);
return hdr_buff;
}
misplaced_hdr_bytes = (sizeof(struct rds_header) - (RDS_FRAG_SIZE - data_len));
memmove(hdr_buff + misplaced_hdr_bytes, hdr_buff, misplaced_hdr_bytes);
addr = kmap_atomic(recv->r_frag->f_page, KM_SOFTIRQ0);
memcpy(hdr_buff, addr + recv->r_frag->f_offset + data_len,
sizeof(struct rds_header) - misplaced_hdr_bytes);
kunmap_atomic(addr, KM_SOFTIRQ0);
return hdr_buff;
}
/* /*
* It's kind of lame that we're copying from the posted receive pages into * It's kind of lame that we're copying from the posted receive pages into
* long-lived bitmaps. We could have posted the bitmaps and rdma written into * long-lived bitmaps. We could have posted the bitmaps and rdma written into
...@@ -639,7 +763,7 @@ static void rds_ib_cong_recv(struct rds_connection *conn, ...@@ -639,7 +763,7 @@ static void rds_ib_cong_recv(struct rds_connection *conn,
to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off); to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
BUG_ON(to_copy & 7); /* Must be 64bit aligned. */ BUG_ON(to_copy & 7); /* Must be 64bit aligned. */
addr = kmap_atomic(frag->f_page, KM_SOFTIRQ0); addr = kmap_atomic(sg_page(&frag->f_sg), KM_SOFTIRQ0);
src = addr + frag_off; src = addr + frag_off;
dst = (void *)map->m_page_addrs[map_page] + map_off; dst = (void *)map->m_page_addrs[map_page] + map_off;
...@@ -710,7 +834,7 @@ static void rds_ib_process_recv(struct rds_connection *conn, ...@@ -710,7 +834,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
} }
data_len -= sizeof(struct rds_header); data_len -= sizeof(struct rds_header);
ihdr = rds_ib_get_header(conn, recv, data_len); ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];
/* Validate the checksum. */ /* Validate the checksum. */
if (!rds_message_verify_checksum(ihdr)) { if (!rds_message_verify_checksum(ihdr)) {
...@@ -742,12 +866,12 @@ static void rds_ib_process_recv(struct rds_connection *conn, ...@@ -742,12 +866,12 @@ static void rds_ib_process_recv(struct rds_connection *conn,
* the inc is freed. We don't go that route, so we have to drop the * the inc is freed. We don't go that route, so we have to drop the
* page ref ourselves. We can't just leave the page on the recv * page ref ourselves. We can't just leave the page on the recv
* because that confuses the dma mapping of pages and each recv's use * because that confuses the dma mapping of pages and each recv's use
* of a partial page. We can leave the frag, though, it will be * of a partial page.
* reused.
* *
* FIXME: Fold this into the code path below. * FIXME: Fold this into the code path below.
*/ */
rds_ib_frag_drop_page(recv->r_frag); rds_ib_frag_free(ic, recv->r_frag);
recv->r_frag = NULL;
return; return;
} }
...@@ -757,7 +881,7 @@ static void rds_ib_process_recv(struct rds_connection *conn, ...@@ -757,7 +881,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
* into the inc and save the inc so we can hang upcoming fragments * into the inc and save the inc so we can hang upcoming fragments
* off its list. * off its list.
*/ */
if (ibinc == NULL) { if (!ibinc) {
ibinc = recv->r_ibinc; ibinc = recv->r_ibinc;
recv->r_ibinc = NULL; recv->r_ibinc = NULL;
ic->i_ibinc = ibinc; ic->i_ibinc = ibinc;
...@@ -842,32 +966,38 @@ static inline void rds_poll_cq(struct rds_ib_connection *ic, ...@@ -842,32 +966,38 @@ static inline void rds_poll_cq(struct rds_ib_connection *ic,
struct rds_ib_recv_work *recv; struct rds_ib_recv_work *recv;
while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) { while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) {
rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
(unsigned long long)wc.wr_id, wc.status, wc.byte_len, (unsigned long long)wc.wr_id, wc.status,
rds_ib_wc_status_str(wc.status), wc.byte_len,
be32_to_cpu(wc.ex.imm_data)); be32_to_cpu(wc.ex.imm_data));
rds_ib_stats_inc(s_ib_rx_cq_event); rds_ib_stats_inc(s_ib_rx_cq_event);
recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)]; recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)];
rds_ib_recv_unmap_page(ic, recv); ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE);
/* /*
* Also process recvs in connecting state because it is possible * Also process recvs in connecting state because it is possible
* to get a recv completion _before_ the rdmacm ESTABLISHED * to get a recv completion _before_ the rdmacm ESTABLISHED
* event is processed. * event is processed.
*/ */
if (rds_conn_up(conn) || rds_conn_connecting(conn)) { if (wc.status == IB_WC_SUCCESS) {
rds_ib_process_recv(conn, recv, wc.byte_len, state);
} else {
/* We expect errors as the qp is drained during shutdown */ /* We expect errors as the qp is drained during shutdown */
if (wc.status == IB_WC_SUCCESS) { if (rds_conn_up(conn) || rds_conn_connecting(conn))
rds_ib_process_recv(conn, recv, wc.byte_len, state); rds_ib_conn_error(conn, "recv completion on %pI4 had "
} else { "status %u (%s), disconnecting and "
rds_ib_conn_error(conn, "recv completion on " "reconnecting\n", &conn->c_faddr,
"%pI4 had status %u, disconnecting and " wc.status,
"reconnecting\n", &conn->c_faddr, rds_ib_wc_status_str(wc.status));
wc.status);
}
} }
/*
* It's very important that we only free this ring entry if we've truly
* freed the resources allocated to the entry. The refilling path can
* leak if we don't.
*/
rds_ib_ring_free(&ic->i_recv_ring, 1); rds_ib_ring_free(&ic->i_recv_ring, 1);
} }
} }
...@@ -897,11 +1027,8 @@ void rds_ib_recv_tasklet_fn(unsigned long data) ...@@ -897,11 +1027,8 @@ void rds_ib_recv_tasklet_fn(unsigned long data)
if (rds_ib_ring_empty(&ic->i_recv_ring)) if (rds_ib_ring_empty(&ic->i_recv_ring))
rds_ib_stats_inc(s_ib_rx_ring_empty); rds_ib_stats_inc(s_ib_rx_ring_empty);
/*
* If the ring is running low, then schedule the thread to refill.
*/
if (rds_ib_ring_low(&ic->i_recv_ring)) if (rds_ib_ring_low(&ic->i_recv_ring))
queue_delayed_work(rds_wq, &conn->c_recv_w, 0); rds_ib_recv_refill(conn, 0);
} }
int rds_ib_recv(struct rds_connection *conn) int rds_ib_recv(struct rds_connection *conn)
...@@ -910,25 +1037,13 @@ int rds_ib_recv(struct rds_connection *conn) ...@@ -910,25 +1037,13 @@ int rds_ib_recv(struct rds_connection *conn)
int ret = 0; int ret = 0;
rdsdebug("conn %p\n", conn); rdsdebug("conn %p\n", conn);
/*
* If we get a temporary posting failure in this context then
* we're really low and we want the caller to back off for a bit.
*/
mutex_lock(&ic->i_recv_mutex);
if (rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0))
ret = -ENOMEM;
else
rds_ib_stats_inc(s_ib_rx_refill_from_thread);
mutex_unlock(&ic->i_recv_mutex);
if (rds_conn_up(conn)) if (rds_conn_up(conn))
rds_ib_attempt_ack(ic); rds_ib_attempt_ack(ic);
return ret; return ret;
} }
int __init rds_ib_recv_init(void) int rds_ib_recv_init(void)
{ {
struct sysinfo si; struct sysinfo si;
int ret = -ENOMEM; int ret = -ENOMEM;
...@@ -939,14 +1054,14 @@ int __init rds_ib_recv_init(void) ...@@ -939,14 +1054,14 @@ int __init rds_ib_recv_init(void)
rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming", rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming",
sizeof(struct rds_ib_incoming), sizeof(struct rds_ib_incoming),
0, 0, NULL); 0, SLAB_HWCACHE_ALIGN, NULL);
if (rds_ib_incoming_slab == NULL) if (!rds_ib_incoming_slab)
goto out; goto out;
rds_ib_frag_slab = kmem_cache_create("rds_ib_frag", rds_ib_frag_slab = kmem_cache_create("rds_ib_frag",
sizeof(struct rds_page_frag), sizeof(struct rds_page_frag),
0, 0, NULL); 0, SLAB_HWCACHE_ALIGN, NULL);
if (rds_ib_frag_slab == NULL) if (!rds_ib_frag_slab)
kmem_cache_destroy(rds_ib_incoming_slab); kmem_cache_destroy(rds_ib_incoming_slab);
else else
ret = 0; ret = 0;
......
...@@ -36,11 +36,49 @@ ...@@ -36,11 +36,49 @@
#include <linux/dmapool.h> #include <linux/dmapool.h>
#include "rds.h" #include "rds.h"
#include "rdma.h"
#include "ib.h" #include "ib.h"
static void rds_ib_send_rdma_complete(struct rds_message *rm, static char *rds_ib_wc_status_strings[] = {
int wc_status) #define RDS_IB_WC_STATUS_STR(foo) \
[IB_WC_##foo] = __stringify(IB_WC_##foo)
RDS_IB_WC_STATUS_STR(SUCCESS),
RDS_IB_WC_STATUS_STR(LOC_LEN_ERR),
RDS_IB_WC_STATUS_STR(LOC_QP_OP_ERR),
RDS_IB_WC_STATUS_STR(LOC_EEC_OP_ERR),
RDS_IB_WC_STATUS_STR(LOC_PROT_ERR),
RDS_IB_WC_STATUS_STR(WR_FLUSH_ERR),
RDS_IB_WC_STATUS_STR(MW_BIND_ERR),
RDS_IB_WC_STATUS_STR(BAD_RESP_ERR),
RDS_IB_WC_STATUS_STR(LOC_ACCESS_ERR),
RDS_IB_WC_STATUS_STR(REM_INV_REQ_ERR),
RDS_IB_WC_STATUS_STR(REM_ACCESS_ERR),
RDS_IB_WC_STATUS_STR(REM_OP_ERR),
RDS_IB_WC_STATUS_STR(RETRY_EXC_ERR),
RDS_IB_WC_STATUS_STR(RNR_RETRY_EXC_ERR),
RDS_IB_WC_STATUS_STR(LOC_RDD_VIOL_ERR),
RDS_IB_WC_STATUS_STR(REM_INV_RD_REQ_ERR),
RDS_IB_WC_STATUS_STR(REM_ABORT_ERR),
RDS_IB_WC_STATUS_STR(INV_EECN_ERR),
RDS_IB_WC_STATUS_STR(INV_EEC_STATE_ERR),
RDS_IB_WC_STATUS_STR(FATAL_ERR),
RDS_IB_WC_STATUS_STR(RESP_TIMEOUT_ERR),
RDS_IB_WC_STATUS_STR(GENERAL_ERR),
#undef RDS_IB_WC_STATUS_STR
};
char *rds_ib_wc_status_str(enum ib_wc_status status)
{
return rds_str_array(rds_ib_wc_status_strings,
ARRAY_SIZE(rds_ib_wc_status_strings), status);
}
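
The status table above uses a compact C idiom: designated initializers indexed by the enum value, a stringify macro, and a bounds-checked lookup (rds_str_array) so that holes and out-of-range values come back as something printable rather than a NULL dereference. A minimal standalone sketch of that idiom follows; demo_status, demo_status_strings and demo_status_str are made-up names standing in for the IB status enum and rds_str_array.

/*
 * Minimal standalone version of the name-table idiom used above:
 * designated initializers indexed by the enum value, a stringify macro,
 * and a bounds-checked lookup.  The demo_* names are invented.
 */
#include <stdio.h>

#define STR(x) #x

enum demo_status { DEMO_OK, DEMO_LEN_ERR, DEMO_FLUSH_ERR };

static const char *demo_status_strings[] = {
#define DEMO_STATUS_STR(foo) [DEMO_##foo] = STR(DEMO_##foo)
	DEMO_STATUS_STR(OK),
	DEMO_STATUS_STR(LEN_ERR),
	DEMO_STATUS_STR(FLUSH_ERR),
#undef DEMO_STATUS_STR
};

/* plays the role of rds_str_array(): never returns NULL */
static const char *demo_status_str(unsigned int status)
{
	if (status >= sizeof(demo_status_strings) / sizeof(demo_status_strings[0]) ||
	    !demo_status_strings[status])
		return "unknown";
	return demo_status_strings[status];
}

int main(void)
{
	printf("%s\n", demo_status_str(DEMO_FLUSH_ERR));	/* DEMO_FLUSH_ERR */
	printf("%s\n", demo_status_str(42));			/* unknown */
	return 0;
}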
/*
* Convert IB-specific error message to RDS error message and call core
* completion handler.
*/
static void rds_ib_send_complete(struct rds_message *rm,
int wc_status,
void (*complete)(struct rds_message *rm, int status))
{ {
int notify_status; int notify_status;
...@@ -60,69 +98,125 @@ static void rds_ib_send_rdma_complete(struct rds_message *rm, ...@@ -60,69 +98,125 @@ static void rds_ib_send_rdma_complete(struct rds_message *rm,
notify_status = RDS_RDMA_OTHER_ERROR; notify_status = RDS_RDMA_OTHER_ERROR;
break; break;
} }
rds_rdma_send_complete(rm, notify_status); complete(rm, notify_status);
}
static void rds_ib_send_unmap_data(struct rds_ib_connection *ic,
struct rm_data_op *op,
int wc_status)
{
if (op->op_nents)
ib_dma_unmap_sg(ic->i_cm_id->device,
op->op_sg, op->op_nents,
DMA_TO_DEVICE);
} }
static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic, static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic,
struct rds_rdma_op *op) struct rm_rdma_op *op,
int wc_status)
{ {
if (op->r_mapped) { if (op->op_mapped) {
ib_dma_unmap_sg(ic->i_cm_id->device, ib_dma_unmap_sg(ic->i_cm_id->device,
op->r_sg, op->r_nents, op->op_sg, op->op_nents,
op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
op->r_mapped = 0; op->op_mapped = 0;
} }
/* If the user asked for a completion notification on this
* message, we can implement three different semantics:
* 1. Notify when we received the ACK on the RDS message
* that was queued with the RDMA. This provides reliable
* notification of RDMA status at the expense of a one-way
* packet delay.
* 2. Notify when the IB stack gives us the completion event for
* the RDMA operation.
* 3. Notify when the IB stack gives us the completion event for
* the accompanying RDS messages.
* Here, we implement approach #3. To implement approach #2,
* we would need to take an event for the rdma WR. To implement #1,
* don't call rds_rdma_send_complete at all, and fall back to the notify
* handling in the ACK processing code.
*
* Note: There's no need to explicitly sync any RDMA buffers using
* ib_dma_sync_sg_for_cpu - the completion for the RDMA
* operation itself unmapped the RDMA buffers, which takes care
* of synching.
*/
rds_ib_send_complete(container_of(op, struct rds_message, rdma),
wc_status, rds_rdma_send_complete);
if (op->op_write)
rds_stats_add(s_send_rdma_bytes, op->op_bytes);
else
rds_stats_add(s_recv_rdma_bytes, op->op_bytes);
} }
static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic, static void rds_ib_send_unmap_atomic(struct rds_ib_connection *ic,
struct rds_ib_send_work *send, struct rm_atomic_op *op,
int wc_status) int wc_status)
{ {
struct rds_message *rm = send->s_rm; /* unmap atomic recvbuf */
if (op->op_mapped) {
rdsdebug("ic %p send %p rm %p\n", ic, send, rm); ib_dma_unmap_sg(ic->i_cm_id->device, op->op_sg, 1,
DMA_FROM_DEVICE);
ib_dma_unmap_sg(ic->i_cm_id->device, op->op_mapped = 0;
rm->m_sg, rm->m_nents, }
DMA_TO_DEVICE);
if (rm->m_rdma_op != NULL) {
rds_ib_send_unmap_rdma(ic, rm->m_rdma_op);
/* If the user asked for a completion notification on this
* message, we can implement three different semantics:
* 1. Notify when we received the ACK on the RDS message
* that was queued with the RDMA. This provides reliable
* notification of RDMA status at the expense of a one-way
* packet delay.
* 2. Notify when the IB stack gives us the completion event for
* the RDMA operation.
* 3. Notify when the IB stack gives us the completion event for
* the accompanying RDS messages.
* Here, we implement approach #3. To implement approach #2,
* call rds_rdma_send_complete from the cq_handler. To implement #1,
* don't call rds_rdma_send_complete at all, and fall back to the notify
* handling in the ACK processing code.
*
* Note: There's no need to explicitly sync any RDMA buffers using
* ib_dma_sync_sg_for_cpu - the completion for the RDMA
* operation itself unmapped the RDMA buffers, which takes care
* of synching.
*/
rds_ib_send_rdma_complete(rm, wc_status);
if (rm->m_rdma_op->r_write) rds_ib_send_complete(container_of(op, struct rds_message, atomic),
rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes); wc_status, rds_atomic_send_complete);
else
rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes); if (op->op_type == RDS_ATOMIC_TYPE_CSWP)
rds_ib_stats_inc(s_ib_atomic_cswp);
else
rds_ib_stats_inc(s_ib_atomic_fadd);
}
/*
* Unmap the resources associated with a struct send_work.
*
 * Returns the rm for no better reason than that the caller (the event
 * handler) needs it and, currently, it cannot be obtained any other
 * way than by switching on wr.opcode.
*/
static struct rds_message *rds_ib_send_unmap_op(struct rds_ib_connection *ic,
struct rds_ib_send_work *send,
int wc_status)
{
struct rds_message *rm = NULL;
/* In the error case, wc.opcode sometimes contains garbage */
switch (send->s_wr.opcode) {
case IB_WR_SEND:
if (send->s_op) {
rm = container_of(send->s_op, struct rds_message, data);
rds_ib_send_unmap_data(ic, send->s_op, wc_status);
}
break;
case IB_WR_RDMA_WRITE:
case IB_WR_RDMA_READ:
if (send->s_op) {
rm = container_of(send->s_op, struct rds_message, rdma);
rds_ib_send_unmap_rdma(ic, send->s_op, wc_status);
}
break;
case IB_WR_ATOMIC_FETCH_AND_ADD:
case IB_WR_ATOMIC_CMP_AND_SWP:
if (send->s_op) {
rm = container_of(send->s_op, struct rds_message, atomic);
rds_ib_send_unmap_atomic(ic, send->s_op, wc_status);
}
break;
default:
if (printk_ratelimit())
printk(KERN_NOTICE
"RDS/IB: %s: unexpected opcode 0x%x in WR!\n",
__func__, send->s_wr.opcode);
break;
} }
/* If anyone waited for this message to get flushed out, wake send->s_wr.opcode = 0xdead;
* them up now */
rds_message_unmapped(rm);
rds_message_put(rm); return rm;
send->s_rm = NULL;
} }
void rds_ib_send_init_ring(struct rds_ib_connection *ic) void rds_ib_send_init_ring(struct rds_ib_connection *ic)
...@@ -133,23 +227,18 @@ void rds_ib_send_init_ring(struct rds_ib_connection *ic) ...@@ -133,23 +227,18 @@ void rds_ib_send_init_ring(struct rds_ib_connection *ic)
for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
struct ib_sge *sge; struct ib_sge *sge;
send->s_rm = NULL;
send->s_op = NULL; send->s_op = NULL;
send->s_wr.wr_id = i; send->s_wr.wr_id = i;
send->s_wr.sg_list = send->s_sge; send->s_wr.sg_list = send->s_sge;
send->s_wr.num_sge = 1;
send->s_wr.opcode = IB_WR_SEND;
send->s_wr.send_flags = 0;
send->s_wr.ex.imm_data = 0; send->s_wr.ex.imm_data = 0;
sge = rds_ib_data_sge(ic, send->s_sge); sge = &send->s_sge[0];
sge->lkey = ic->i_mr->lkey;
sge = rds_ib_header_sge(ic, send->s_sge);
sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header)); sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header));
sge->length = sizeof(struct rds_header); sge->length = sizeof(struct rds_header);
sge->lkey = ic->i_mr->lkey; sge->lkey = ic->i_mr->lkey;
send->s_sge[1].lkey = ic->i_mr->lkey;
} }
} }
...@@ -159,15 +248,23 @@ void rds_ib_send_clear_ring(struct rds_ib_connection *ic) ...@@ -159,15 +248,23 @@ void rds_ib_send_clear_ring(struct rds_ib_connection *ic)
u32 i; u32 i;
for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
if (send->s_wr.opcode == 0xdead) if (send->s_op && send->s_wr.opcode != 0xdead)
continue; rds_ib_send_unmap_op(ic, send, IB_WC_WR_FLUSH_ERR);
if (send->s_rm)
rds_ib_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR);
if (send->s_op)
rds_ib_send_unmap_rdma(ic, send->s_op);
} }
} }
/*
* The only fast path caller always has a non-zero nr, so we don't
* bother testing nr before performing the atomic sub.
*/
static void rds_ib_sub_signaled(struct rds_ib_connection *ic, int nr)
{
if ((atomic_sub_return(nr, &ic->i_signaled_sends) == 0) &&
waitqueue_active(&rds_ib_ring_empty_wait))
wake_up(&rds_ib_ring_empty_wait);
BUG_ON(atomic_read(&ic->i_signaled_sends) < 0);
}
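
rds_ib_sub_signaled pairs with an atomic_add on the post path: each work request posted with IB_SEND_SIGNALED bumps i_signaled_sends, each completion subtracts, and a shutdown waiter sleeping on rds_ib_ring_empty_wait is woken when the count reaches zero. The userspace analog below shows the same accounting with C11 atomics and a pthread condition variable; all names are invented for illustration and are not the kernel interface.

/*
 * Userspace sketch of the signaled-send accounting: the post path adds,
 * the completion path subtracts, and a waiter sleeps until the count is
 * zero.  All names here are invented.
 */
#include <assert.h>
#include <pthread.h>
#include <stdatomic.h>

static atomic_int signaled_sends;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t ring_empty = PTHREAD_COND_INITIALIZER;

static void add_signaled(int nr)		/* post path */
{
	atomic_fetch_add(&signaled_sends, nr);
}

static void sub_signaled(int nr)		/* completion path */
{
	if (atomic_fetch_sub(&signaled_sends, nr) - nr == 0) {
		pthread_mutex_lock(&lock);
		pthread_cond_broadcast(&ring_empty);	/* wake any waiter */
		pthread_mutex_unlock(&lock);
	}
	assert(atomic_load(&signaled_sends) >= 0);	/* mirrors the BUG_ON */
}

static void wait_for_idle(void)			/* shutdown path */
{
	pthread_mutex_lock(&lock);
	while (atomic_load(&signaled_sends) != 0)
		pthread_cond_wait(&ring_empty, &lock);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	add_signaled(2);	/* two signaled WRs posted */
	sub_signaled(1);	/* first completion */
	sub_signaled(1);	/* second completion, count hits zero */
	wait_for_idle();	/* returns immediately, nothing left in flight */
	return 0;
}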
/* /*
* The _oldest/_free ring operations here race cleanly with the alloc/unalloc * The _oldest/_free ring operations here race cleanly with the alloc/unalloc
* operations performed in the send path. As the sender allocs and potentially * operations performed in the send path. As the sender allocs and potentially
...@@ -178,12 +275,14 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) ...@@ -178,12 +275,14 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
{ {
struct rds_connection *conn = context; struct rds_connection *conn = context;
struct rds_ib_connection *ic = conn->c_transport_data; struct rds_ib_connection *ic = conn->c_transport_data;
struct rds_message *rm = NULL;
struct ib_wc wc; struct ib_wc wc;
struct rds_ib_send_work *send; struct rds_ib_send_work *send;
u32 completed; u32 completed;
u32 oldest; u32 oldest;
u32 i = 0; u32 i = 0;
int ret; int ret;
int nr_sig = 0;
rdsdebug("cq %p conn %p\n", cq, conn); rdsdebug("cq %p conn %p\n", cq, conn);
rds_ib_stats_inc(s_ib_tx_cq_call); rds_ib_stats_inc(s_ib_tx_cq_call);
...@@ -192,8 +291,9 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) ...@@ -192,8 +291,9 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
rdsdebug("ib_req_notify_cq send failed: %d\n", ret); rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
while (ib_poll_cq(cq, 1, &wc) > 0) { while (ib_poll_cq(cq, 1, &wc) > 0) {
rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
(unsigned long long)wc.wr_id, wc.status, wc.byte_len, (unsigned long long)wc.wr_id, wc.status,
rds_ib_wc_status_str(wc.status), wc.byte_len,
be32_to_cpu(wc.ex.imm_data)); be32_to_cpu(wc.ex.imm_data));
rds_ib_stats_inc(s_ib_tx_cq_event); rds_ib_stats_inc(s_ib_tx_cq_event);
...@@ -210,51 +310,30 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) ...@@ -210,51 +310,30 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
for (i = 0; i < completed; i++) { for (i = 0; i < completed; i++) {
send = &ic->i_sends[oldest]; send = &ic->i_sends[oldest];
if (send->s_wr.send_flags & IB_SEND_SIGNALED)
nr_sig++;
/* In the error case, wc.opcode sometimes contains garbage */ rm = rds_ib_send_unmap_op(ic, send, wc.status);
switch (send->s_wr.opcode) {
case IB_WR_SEND:
if (send->s_rm)
rds_ib_send_unmap_rm(ic, send, wc.status);
break;
case IB_WR_RDMA_WRITE:
case IB_WR_RDMA_READ:
/* Nothing to be done - the SG list will be unmapped
* when the SEND completes. */
break;
default:
if (printk_ratelimit())
printk(KERN_NOTICE
"RDS/IB: %s: unexpected opcode 0x%x in WR!\n",
__func__, send->s_wr.opcode);
break;
}
send->s_wr.opcode = 0xdead;
send->s_wr.num_sge = 1;
if (send->s_queued + HZ/2 < jiffies) if (send->s_queued + HZ/2 < jiffies)
rds_ib_stats_inc(s_ib_tx_stalled); rds_ib_stats_inc(s_ib_tx_stalled);
/* If a RDMA operation produced an error, signal this right if (send->s_op) {
* away. If we don't, the subsequent SEND that goes with this if (send->s_op == rm->m_final_op) {
* RDMA will be canceled with ERR_WFLUSH, and the application /* If anyone waited for this message to get flushed out, wake
* never learn that the RDMA failed. */ * them up now */
if (unlikely(wc.status == IB_WC_REM_ACCESS_ERR && send->s_op)) { rds_message_unmapped(rm);
struct rds_message *rm;
rm = rds_send_get_message(conn, send->s_op);
if (rm) {
if (rm->m_rdma_op)
rds_ib_send_unmap_rdma(ic, rm->m_rdma_op);
rds_ib_send_rdma_complete(rm, wc.status);
rds_message_put(rm);
} }
rds_message_put(rm);
send->s_op = NULL;
} }
oldest = (oldest + 1) % ic->i_send_ring.w_nr; oldest = (oldest + 1) % ic->i_send_ring.w_nr;
} }
rds_ib_ring_free(&ic->i_send_ring, completed); rds_ib_ring_free(&ic->i_send_ring, completed);
rds_ib_sub_signaled(ic, nr_sig);
nr_sig = 0;
if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) || if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
test_bit(0, &conn->c_map_queued)) test_bit(0, &conn->c_map_queued))
...@@ -262,10 +341,10 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) ...@@ -262,10 +341,10 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
/* We expect errors as the qp is drained during shutdown */ /* We expect errors as the qp is drained during shutdown */
if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) { if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) {
rds_ib_conn_error(conn, rds_ib_conn_error(conn, "send completion on %pI4 had status "
"send completion on %pI4 " "%u (%s), disconnecting and reconnecting\n",
"had status %u, disconnecting and reconnecting\n", &conn->c_faddr, wc.status,
&conn->c_faddr, wc.status); rds_ib_wc_status_str(wc.status));
} }
} }
} }
...@@ -294,7 +373,7 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) ...@@ -294,7 +373,7 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
* credits (see rds_ib_send_add_credits below). * credits (see rds_ib_send_add_credits below).
* *
* The RDS send code is essentially single-threaded; rds_send_xmit * The RDS send code is essentially single-threaded; rds_send_xmit
* grabs c_send_lock to ensure exclusive access to the send ring. * sets RDS_IN_XMIT to ensure exclusive access to the send ring.
* However, the ACK sending code is independent and can race with * However, the ACK sending code is independent and can race with
* message SENDs. * message SENDs.
* *
...@@ -413,40 +492,21 @@ void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted) ...@@ -413,40 +492,21 @@ void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted)
set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
} }
static inline void static inline int rds_ib_set_wr_signal_state(struct rds_ib_connection *ic,
rds_ib_xmit_populate_wr(struct rds_ib_connection *ic, struct rds_ib_send_work *send,
struct rds_ib_send_work *send, unsigned int pos, bool notify)
unsigned long buffer, unsigned int length,
int send_flags)
{ {
struct ib_sge *sge; /*
* We want to delay signaling completions just enough to get
WARN_ON(pos != send - ic->i_sends); * the batching benefits but not so much that we create dead time
* on the wire.
send->s_wr.send_flags = send_flags; */
send->s_wr.opcode = IB_WR_SEND; if (ic->i_unsignaled_wrs-- == 0 || notify) {
send->s_wr.num_sge = 2; ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
send->s_wr.next = NULL; send->s_wr.send_flags |= IB_SEND_SIGNALED;
send->s_queued = jiffies; return 1;
send->s_op = NULL;
if (length != 0) {
sge = rds_ib_data_sge(ic, send->s_sge);
sge->addr = buffer;
sge->length = length;
sge->lkey = ic->i_mr->lkey;
sge = rds_ib_header_sge(ic, send->s_sge);
} else {
/* We're sending a packet with no payload. There is only
* one SGE */
send->s_wr.num_sge = 1;
sge = &send->s_sge[0];
} }
return 0;
sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header));
sge->length = sizeof(struct rds_header);
sge->lkey = ic->i_mr->lkey;
} }
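
rds_ib_set_wr_signal_state implements the batching its comment describes: a completion is requested only once every rds_ib_sysctl_max_unsig_wrs work requests, or when the caller explicitly asks to be notified. The toy version below reproduces just the budget logic (MAX_UNSIG and should_signal are invented names); driven with 100 unsignaled posts it requests a completion roughly once per 17 WRs, which is the batching the comment refers to.

/*
 * Toy version of the signaling budget: request a completion only when the
 * unsignaled budget runs out or the caller asks for notification.
 * MAX_UNSIG and should_signal are invented names.
 */
#include <stdbool.h>
#include <stdio.h>

#define MAX_UNSIG 16		/* stands in for rds_ib_sysctl_max_unsig_wrs */

static unsigned int unsignaled_budget = MAX_UNSIG;

static bool should_signal(bool notify)
{
	if (unsignaled_budget-- == 0 || notify) {
		unsignaled_budget = MAX_UNSIG;
		return true;	/* would set IB_SEND_SIGNALED on this WR */
	}
	return false;
}

int main(void)
{
	int signaled = 0;

	for (int i = 0; i < 100; i++)
		if (should_signal(false))
			signaled++;
	printf("%d of 100 WRs signaled\n", signaled);	/* 5: one per 17 posts */
	return 0;
}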
/* /*
...@@ -475,13 +535,14 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, ...@@ -475,13 +535,14 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
u32 pos; u32 pos;
u32 i; u32 i;
u32 work_alloc; u32 work_alloc;
u32 credit_alloc; u32 credit_alloc = 0;
u32 posted; u32 posted;
u32 adv_credits = 0; u32 adv_credits = 0;
int send_flags = 0; int send_flags = 0;
int sent; int bytes_sent = 0;
int ret; int ret;
int flow_controlled = 0; int flow_controlled = 0;
int nr_sig = 0;
BUG_ON(off % RDS_FRAG_SIZE); BUG_ON(off % RDS_FRAG_SIZE);
BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header)); BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header));
...@@ -507,14 +568,13 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, ...@@ -507,14 +568,13 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
goto out; goto out;
} }
credit_alloc = work_alloc;
if (ic->i_flowctl) { if (ic->i_flowctl) {
credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT); credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT);
adv_credits += posted; adv_credits += posted;
if (credit_alloc < work_alloc) { if (credit_alloc < work_alloc) {
rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc); rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc);
work_alloc = credit_alloc; work_alloc = credit_alloc;
flow_controlled++; flow_controlled = 1;
} }
if (work_alloc == 0) { if (work_alloc == 0) {
set_bit(RDS_LL_SEND_FULL, &conn->c_flags); set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
...@@ -525,31 +585,25 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, ...@@ -525,31 +585,25 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
} }
/* map the message the first time we see it */ /* map the message the first time we see it */
if (ic->i_rm == NULL) { if (!ic->i_data_op) {
/* if (rm->data.op_nents) {
printk(KERN_NOTICE "rds_ib_xmit prep msg dport=%u flags=0x%x len=%d\n", rm->data.op_count = ib_dma_map_sg(dev,
be16_to_cpu(rm->m_inc.i_hdr.h_dport), rm->data.op_sg,
rm->m_inc.i_hdr.h_flags, rm->data.op_nents,
be32_to_cpu(rm->m_inc.i_hdr.h_len)); DMA_TO_DEVICE);
*/ rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count);
if (rm->m_nents) { if (rm->data.op_count == 0) {
rm->m_count = ib_dma_map_sg(dev,
rm->m_sg, rm->m_nents, DMA_TO_DEVICE);
rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count);
if (rm->m_count == 0) {
rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
ret = -ENOMEM; /* XXX ? */ ret = -ENOMEM; /* XXX ? */
goto out; goto out;
} }
} else { } else {
rm->m_count = 0; rm->data.op_count = 0;
} }
ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes;
rds_message_addref(rm); rds_message_addref(rm);
ic->i_rm = rm; ic->i_data_op = &rm->data;
/* Finalize the header */ /* Finalize the header */
if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags)) if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags))
...@@ -559,10 +613,10 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, ...@@ -559,10 +613,10 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
/* If it has a RDMA op, tell the peer we did it. This is /* If it has a RDMA op, tell the peer we did it. This is
* used by the peer to release use-once RDMA MRs. */ * used by the peer to release use-once RDMA MRs. */
if (rm->m_rdma_op) { if (rm->rdma.op_active) {
struct rds_ext_header_rdma ext_hdr; struct rds_ext_header_rdma ext_hdr;
ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key); ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey);
rds_message_add_extension(&rm->m_inc.i_hdr, rds_message_add_extension(&rm->m_inc.i_hdr,
RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr)); RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
} }
...@@ -582,99 +636,77 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, ...@@ -582,99 +636,77 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
/* /*
* Update adv_credits since we reset the ACK_REQUIRED bit. * Update adv_credits since we reset the ACK_REQUIRED bit.
*/ */
rds_ib_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits); if (ic->i_flowctl) {
adv_credits += posted; rds_ib_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits);
BUG_ON(adv_credits > 255); adv_credits += posted;
BUG_ON(adv_credits > 255);
}
} }
send = &ic->i_sends[pos];
first = send;
prev = NULL;
scat = &rm->m_sg[sg];
sent = 0;
i = 0;
/* Sometimes you want to put a fence between an RDMA /* Sometimes you want to put a fence between an RDMA
* READ and the following SEND. * READ and the following SEND.
* We could either do this all the time * We could either do this all the time
* or when requested by the user. Right now, we let * or when requested by the user. Right now, we let
* the application choose. * the application choose.
*/ */
if (rm->m_rdma_op && rm->m_rdma_op->r_fence) if (rm->rdma.op_active && rm->rdma.op_fence)
send_flags = IB_SEND_FENCE; send_flags = IB_SEND_FENCE;
/* /* Each frag gets a header. Msgs may be 0 bytes */
* We could be copying the header into the unused tail of the page. send = &ic->i_sends[pos];
* That would need to be changed in the future when those pages might first = send;
* be mapped userspace pages or page cache pages. So instead we always prev = NULL;
* use a second sge and our long-lived ring of mapped headers. We send scat = &ic->i_data_op->op_sg[sg];
* the header after the data so that the data payload can be aligned on i = 0;
* the receiver. do {
*/ unsigned int len = 0;
/* handle a 0-len message */ /* Set up the header */
if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) { send->s_wr.send_flags = send_flags;
rds_ib_xmit_populate_wr(ic, send, pos, 0, 0, send_flags); send->s_wr.opcode = IB_WR_SEND;
goto add_header; send->s_wr.num_sge = 1;
} send->s_wr.next = NULL;
send->s_queued = jiffies;
send->s_op = NULL;
/* if there's data reference it with a chain of work reqs */ send->s_sge[0].addr = ic->i_send_hdrs_dma
for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) { + (pos * sizeof(struct rds_header));
unsigned int len; send->s_sge[0].length = sizeof(struct rds_header);
send = &ic->i_sends[pos]; memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header));
len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off); /* Set up the data, if present */
rds_ib_xmit_populate_wr(ic, send, pos, if (i < work_alloc
ib_sg_dma_address(dev, scat) + off, len, && scat != &rm->data.op_sg[rm->data.op_count]) {
send_flags); len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off);
send->s_wr.num_sge = 2;
/* send->s_sge[1].addr = ib_sg_dma_address(dev, scat) + off;
* We want to delay signaling completions just enough to get send->s_sge[1].length = len;
* the batching benefits but not so much that we create dead time
* on the wire.
*/
if (ic->i_unsignaled_wrs-- == 0) {
ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
}
ic->i_unsignaled_bytes -= len; bytes_sent += len;
if (ic->i_unsignaled_bytes <= 0) { off += len;
ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes; if (off == ib_sg_dma_len(dev, scat)) {
send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; scat++;
off = 0;
}
} }
rds_ib_set_wr_signal_state(ic, send, 0);
/* /*
* Always signal the last one if we're stopping due to flow control. * Always signal the last one if we're stopping due to flow control.
*/ */
if (flow_controlled && i == (work_alloc-1)) if (ic->i_flowctl && flow_controlled && i == (work_alloc-1))
send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
if (send->s_wr.send_flags & IB_SEND_SIGNALED)
nr_sig++;
rdsdebug("send %p wr %p num_sge %u next %p\n", send, rdsdebug("send %p wr %p num_sge %u next %p\n", send,
&send->s_wr, send->s_wr.num_sge, send->s_wr.next); &send->s_wr, send->s_wr.num_sge, send->s_wr.next);
sent += len; if (ic->i_flowctl && adv_credits) {
off += len;
if (off == ib_sg_dma_len(dev, scat)) {
scat++;
off = 0;
}
add_header:
/* Tack on the header after the data. The header SGE should already
* have been set up to point to the right header buffer. */
memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header));
if (0) {
struct rds_header *hdr = &ic->i_send_hdrs[pos];
printk(KERN_NOTICE "send WR dport=%u flags=0x%x len=%d\n",
be16_to_cpu(hdr->h_dport),
hdr->h_flags,
be32_to_cpu(hdr->h_len));
}
if (adv_credits) {
struct rds_header *hdr = &ic->i_send_hdrs[pos]; struct rds_header *hdr = &ic->i_send_hdrs[pos];
/* add credit and redo the header checksum */ /* add credit and redo the header checksum */
...@@ -689,20 +721,25 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, ...@@ -689,20 +721,25 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
prev = send; prev = send;
pos = (pos + 1) % ic->i_send_ring.w_nr; pos = (pos + 1) % ic->i_send_ring.w_nr;
} send = &ic->i_sends[pos];
i++;
} while (i < work_alloc
&& scat != &rm->data.op_sg[rm->data.op_count]);
/* Account the RDS header in the number of bytes we sent, but just once. /* Account the RDS header in the number of bytes we sent, but just once.
* The caller has no concept of fragmentation. */ * The caller has no concept of fragmentation. */
if (hdr_off == 0) if (hdr_off == 0)
sent += sizeof(struct rds_header); bytes_sent += sizeof(struct rds_header);
/* if we finished the message then send completion owns it */ /* if we finished the message then send completion owns it */
if (scat == &rm->m_sg[rm->m_count]) { if (scat == &rm->data.op_sg[rm->data.op_count]) {
prev->s_rm = ic->i_rm; prev->s_op = ic->i_data_op;
prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; prev->s_wr.send_flags |= IB_SEND_SOLICITED;
ic->i_rm = NULL; ic->i_data_op = NULL;
} }
/* Put back wrs & credits we didn't use */
if (i < work_alloc) { if (i < work_alloc) {
rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i); rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
work_alloc = i; work_alloc = i;
...@@ -710,6 +747,9 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, ...@@ -710,6 +747,9 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
if (ic->i_flowctl && i < credit_alloc) if (ic->i_flowctl && i < credit_alloc)
rds_ib_send_add_credits(conn, credit_alloc - i); rds_ib_send_add_credits(conn, credit_alloc - i);
if (nr_sig)
atomic_add(nr_sig, &ic->i_signaled_sends);
/* XXX need to worry about failed_wr and partial sends. */ /* XXX need to worry about failed_wr and partial sends. */
failed_wr = &first->s_wr; failed_wr = &first->s_wr;
ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
...@@ -720,32 +760,127 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, ...@@ -720,32 +760,127 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 " printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 "
"returned %d\n", &conn->c_faddr, ret); "returned %d\n", &conn->c_faddr, ret);
rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
if (prev->s_rm) { rds_ib_sub_signaled(ic, nr_sig);
ic->i_rm = prev->s_rm; if (prev->s_op) {
prev->s_rm = NULL; ic->i_data_op = prev->s_op;
prev->s_op = NULL;
} }
rds_ib_conn_error(ic->conn, "ib_post_send failed\n"); rds_ib_conn_error(ic->conn, "ib_post_send failed\n");
goto out; goto out;
} }
ret = sent; ret = bytes_sent;
out: out:
BUG_ON(adv_credits); BUG_ON(adv_credits);
return ret; return ret;
} }
int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) /*
 * Issue an atomic operation.
 * A simplified version of the rdma case: we always map a single SG
 * entry of just 8 bytes, which receives the return value of the atomic
 * operation.
*/
int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
{
struct rds_ib_connection *ic = conn->c_transport_data;
struct rds_ib_send_work *send = NULL;
struct ib_send_wr *failed_wr;
struct rds_ib_device *rds_ibdev;
u32 pos;
u32 work_alloc;
int ret;
int nr_sig = 0;
rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, 1, &pos);
if (work_alloc != 1) {
rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
rds_ib_stats_inc(s_ib_tx_ring_full);
ret = -ENOMEM;
goto out;
}
/* address of send request in ring */
send = &ic->i_sends[pos];
send->s_queued = jiffies;
if (op->op_type == RDS_ATOMIC_TYPE_CSWP) {
send->s_wr.opcode = IB_WR_MASKED_ATOMIC_CMP_AND_SWP;
send->s_wr.wr.atomic.compare_add = op->op_m_cswp.compare;
send->s_wr.wr.atomic.swap = op->op_m_cswp.swap;
send->s_wr.wr.atomic.compare_add_mask = op->op_m_cswp.compare_mask;
send->s_wr.wr.atomic.swap_mask = op->op_m_cswp.swap_mask;
} else { /* FADD */
send->s_wr.opcode = IB_WR_MASKED_ATOMIC_FETCH_AND_ADD;
send->s_wr.wr.atomic.compare_add = op->op_m_fadd.add;
send->s_wr.wr.atomic.swap = 0;
send->s_wr.wr.atomic.compare_add_mask = op->op_m_fadd.nocarry_mask;
send->s_wr.wr.atomic.swap_mask = 0;
}
nr_sig = rds_ib_set_wr_signal_state(ic, send, op->op_notify);
send->s_wr.num_sge = 1;
send->s_wr.next = NULL;
send->s_wr.wr.atomic.remote_addr = op->op_remote_addr;
send->s_wr.wr.atomic.rkey = op->op_rkey;
send->s_op = op;
rds_message_addref(container_of(send->s_op, struct rds_message, atomic));
/* map 8 byte retval buffer to the device */
ret = ib_dma_map_sg(ic->i_cm_id->device, op->op_sg, 1, DMA_FROM_DEVICE);
rdsdebug("ic %p mapping atomic op %p. mapped %d pg\n", ic, op, ret);
if (ret != 1) {
rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
ret = -ENOMEM; /* XXX ? */
goto out;
}
/* Convert our struct scatterlist to struct ib_sge */
send->s_sge[0].addr = ib_sg_dma_address(ic->i_cm_id->device, op->op_sg);
send->s_sge[0].length = ib_sg_dma_len(ic->i_cm_id->device, op->op_sg);
send->s_sge[0].lkey = ic->i_mr->lkey;
rdsdebug("rva %Lx rpa %Lx len %u\n", op->op_remote_addr,
send->s_sge[0].addr, send->s_sge[0].length);
if (nr_sig)
atomic_add(nr_sig, &ic->i_signaled_sends);
failed_wr = &send->s_wr;
ret = ib_post_send(ic->i_cm_id->qp, &send->s_wr, &failed_wr);
rdsdebug("ic %p send %p (wr %p) ret %d wr %p\n", ic,
send, &send->s_wr, ret, failed_wr);
BUG_ON(failed_wr != &send->s_wr);
if (ret) {
printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI4 "
"returned %d\n", &conn->c_faddr, ret);
rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
rds_ib_sub_signaled(ic, nr_sig);
goto out;
}
if (unlikely(failed_wr != &send->s_wr)) {
printk(KERN_WARNING "RDS/IB: atomic ib_post_send() rc=%d, but failed_wqe updated!\n", ret);
BUG_ON(failed_wr != &send->s_wr);
}
out:
return ret;
}
int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
{ {
struct rds_ib_connection *ic = conn->c_transport_data; struct rds_ib_connection *ic = conn->c_transport_data;
struct rds_ib_send_work *send = NULL; struct rds_ib_send_work *send = NULL;
struct rds_ib_send_work *first; struct rds_ib_send_work *first;
struct rds_ib_send_work *prev; struct rds_ib_send_work *prev;
struct ib_send_wr *failed_wr; struct ib_send_wr *failed_wr;
struct rds_ib_device *rds_ibdev;
struct scatterlist *scat; struct scatterlist *scat;
unsigned long len; unsigned long len;
u64 remote_addr = op->r_remote_addr; u64 remote_addr = op->op_remote_addr;
u32 max_sge = ic->rds_ibdev->max_sge;
u32 pos; u32 pos;
u32 work_alloc; u32 work_alloc;
u32 i; u32 i;
...@@ -753,29 +888,28 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) ...@@ -753,29 +888,28 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
int sent; int sent;
int ret; int ret;
int num_sge; int num_sge;
int nr_sig = 0;
rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
/* map the op the first time we see it */
/* map the message the first time we see it */ if (!op->op_mapped) {
if (!op->r_mapped) { op->op_count = ib_dma_map_sg(ic->i_cm_id->device,
op->r_count = ib_dma_map_sg(ic->i_cm_id->device, op->op_sg, op->op_nents, (op->op_write) ?
op->r_sg, op->r_nents, (op->r_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
DMA_TO_DEVICE : DMA_FROM_DEVICE); rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count);
rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count); if (op->op_count == 0) {
if (op->r_count == 0) {
rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
ret = -ENOMEM; /* XXX ? */ ret = -ENOMEM; /* XXX ? */
goto out; goto out;
} }
op->r_mapped = 1; op->op_mapped = 1;
} }
/* /*
* Instead of knowing how to return a partial rdma read/write we insist that there * Instead of knowing how to return a partial rdma read/write we insist that there
* be enough work requests to send the entire message. * be enough work requests to send the entire message.
*/ */
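
The sizing rule in the comment above is a ceiling division: a mapped scatterlist of op_count entries needs ceil(op_count / max_sge) work requests, because each WR carries at most max_sge SGEs and a partial RDMA is not allowed. A small worked example follows, with ceil_div standing in for the ceil() helper and made-up numbers.

/*
 * Worked example of the work-request sizing: ceil(op_count / max_sge) WRs,
 * each carrying at most max_sge SGEs.  ceil_div mirrors the ceil() helper;
 * the numbers are made up.
 */
#include <stdio.h>

static unsigned int ceil_div(unsigned int n, unsigned int d)
{
	return (n + d - 1) / d;
}

int main(void)
{
	unsigned int op_count = 37;	/* mapped scatterlist entries */
	unsigned int max_sge = 8;	/* device limit: SGEs per WR */

	printf("need %u work requests\n", ceil_div(op_count, max_sge)); /* 5 */

	/* distribute the entries the same way the send loop does */
	for (unsigned int left = op_count; left; ) {
		unsigned int this_wr = left > max_sge ? max_sge : left;
		printf("WR carries %u SGEs\n", this_wr);	/* 8,8,8,8,5 */
		left -= this_wr;
	}
	return 0;
}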
i = ceil(op->r_count, rds_ibdev->max_sge); i = ceil(op->op_count, max_sge);
work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos); work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos);
if (work_alloc != i) { if (work_alloc != i) {
...@@ -788,30 +922,24 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) ...@@ -788,30 +922,24 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
send = &ic->i_sends[pos]; send = &ic->i_sends[pos];
first = send; first = send;
prev = NULL; prev = NULL;
scat = &op->r_sg[0]; scat = &op->op_sg[0];
sent = 0; sent = 0;
num_sge = op->r_count; num_sge = op->op_count;
for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) { for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) {
send->s_wr.send_flags = 0; send->s_wr.send_flags = 0;
send->s_queued = jiffies; send->s_queued = jiffies;
/* send->s_op = NULL;
* We want to delay signaling completions just enough to get
* the batching benefits but not so much that we create dead time on the wire. nr_sig += rds_ib_set_wr_signal_state(ic, send, op->op_notify);
*/
if (ic->i_unsignaled_wrs-- == 0) {
ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
send->s_wr.send_flags = IB_SEND_SIGNALED;
}
send->s_wr.opcode = op->r_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ; send->s_wr.opcode = op->op_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
send->s_wr.wr.rdma.remote_addr = remote_addr; send->s_wr.wr.rdma.remote_addr = remote_addr;
send->s_wr.wr.rdma.rkey = op->r_key; send->s_wr.wr.rdma.rkey = op->op_rkey;
send->s_op = op;
if (num_sge > rds_ibdev->max_sge) { if (num_sge > max_sge) {
send->s_wr.num_sge = rds_ibdev->max_sge; send->s_wr.num_sge = max_sge;
num_sge -= rds_ibdev->max_sge; num_sge -= max_sge;
} else { } else {
send->s_wr.num_sge = num_sge; send->s_wr.num_sge = num_sge;
} }
...@@ -821,7 +949,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) ...@@ -821,7 +949,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
if (prev) if (prev)
prev->s_wr.next = &send->s_wr; prev->s_wr.next = &send->s_wr;
for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) { for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) {
len = ib_sg_dma_len(ic->i_cm_id->device, scat); len = ib_sg_dma_len(ic->i_cm_id->device, scat);
send->s_sge[j].addr = send->s_sge[j].addr =
ib_sg_dma_address(ic->i_cm_id->device, scat); ib_sg_dma_address(ic->i_cm_id->device, scat);
...@@ -843,15 +971,20 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) ...@@ -843,15 +971,20 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
send = ic->i_sends; send = ic->i_sends;
} }
/* if we finished the message then send completion owns it */ /* give a reference to the last op */
if (scat == &op->r_sg[op->r_count]) if (scat == &op->op_sg[op->op_count]) {
prev->s_wr.send_flags = IB_SEND_SIGNALED; prev->s_op = op;
rds_message_addref(container_of(op, struct rds_message, rdma));
}
if (i < work_alloc) { if (i < work_alloc) {
rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i); rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
work_alloc = i; work_alloc = i;
} }
if (nr_sig)
atomic_add(nr_sig, &ic->i_signaled_sends);
failed_wr = &first->s_wr; failed_wr = &first->s_wr;
ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic, rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
...@@ -861,6 +994,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) ...@@ -861,6 +994,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 " printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 "
"returned %d\n", &conn->c_faddr, ret); "returned %d\n", &conn->c_faddr, ret);
rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
rds_ib_sub_signaled(ic, nr_sig);
goto out; goto out;
} }
......
...@@ -67,6 +67,8 @@ static const char *const rds_ib_stat_names[] = { ...@@ -67,6 +67,8 @@ static const char *const rds_ib_stat_names[] = {
"ib_rdma_mr_pool_flush", "ib_rdma_mr_pool_flush",
"ib_rdma_mr_pool_wait", "ib_rdma_mr_pool_wait",
"ib_rdma_mr_pool_depleted", "ib_rdma_mr_pool_depleted",
"ib_atomic_cswp",
"ib_atomic_fadd",
}; };
unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter, unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
......
...@@ -49,10 +49,6 @@ unsigned long rds_ib_sysctl_max_unsig_wrs = 16; ...@@ -49,10 +49,6 @@ unsigned long rds_ib_sysctl_max_unsig_wrs = 16;
static unsigned long rds_ib_sysctl_max_unsig_wr_min = 1; static unsigned long rds_ib_sysctl_max_unsig_wr_min = 1;
static unsigned long rds_ib_sysctl_max_unsig_wr_max = 64; static unsigned long rds_ib_sysctl_max_unsig_wr_max = 64;
unsigned long rds_ib_sysctl_max_unsig_bytes = (16 << 20);
static unsigned long rds_ib_sysctl_max_unsig_bytes_min = 1;
static unsigned long rds_ib_sysctl_max_unsig_bytes_max = ~0UL;
/* /*
* This sysctl does nothing. * This sysctl does nothing.
* *
...@@ -93,15 +89,6 @@ ctl_table rds_ib_sysctl_table[] = { ...@@ -93,15 +89,6 @@ ctl_table rds_ib_sysctl_table[] = {
.extra1 = &rds_ib_sysctl_max_unsig_wr_min, .extra1 = &rds_ib_sysctl_max_unsig_wr_min,
.extra2 = &rds_ib_sysctl_max_unsig_wr_max, .extra2 = &rds_ib_sysctl_max_unsig_wr_max,
}, },
{
.procname = "max_unsignaled_bytes",
.data = &rds_ib_sysctl_max_unsig_bytes,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
.extra1 = &rds_ib_sysctl_max_unsig_bytes_min,
.extra2 = &rds_ib_sysctl_max_unsig_bytes_max,
},
{ {
.procname = "max_recv_allocation", .procname = "max_recv_allocation",
.data = &rds_ib_sysctl_max_recv_allocation, .data = &rds_ib_sysctl_max_recv_allocation,
...@@ -132,10 +119,10 @@ void rds_ib_sysctl_exit(void) ...@@ -132,10 +119,10 @@ void rds_ib_sysctl_exit(void)
unregister_sysctl_table(rds_ib_sysctl_hdr); unregister_sysctl_table(rds_ib_sysctl_hdr);
} }
int __init rds_ib_sysctl_init(void) int rds_ib_sysctl_init(void)
{ {
rds_ib_sysctl_hdr = register_sysctl_paths(rds_ib_sysctl_path, rds_ib_sysctl_table); rds_ib_sysctl_hdr = register_sysctl_paths(rds_ib_sysctl_path, rds_ib_sysctl_table);
if (rds_ib_sysctl_hdr == NULL) if (!rds_ib_sysctl_hdr)
return -ENOMEM; return -ENOMEM;
return 0; return 0;
} }
...@@ -76,7 +76,7 @@ void rds_info_register_func(int optname, rds_info_func func) ...@@ -76,7 +76,7 @@ void rds_info_register_func(int optname, rds_info_func func)
BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST); BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST);
spin_lock(&rds_info_lock); spin_lock(&rds_info_lock);
BUG_ON(rds_info_funcs[offset] != NULL); BUG_ON(rds_info_funcs[offset]);
rds_info_funcs[offset] = func; rds_info_funcs[offset] = func;
spin_unlock(&rds_info_lock); spin_unlock(&rds_info_lock);
} }
...@@ -102,7 +102,7 @@ EXPORT_SYMBOL_GPL(rds_info_deregister_func); ...@@ -102,7 +102,7 @@ EXPORT_SYMBOL_GPL(rds_info_deregister_func);
*/ */
void rds_info_iter_unmap(struct rds_info_iterator *iter) void rds_info_iter_unmap(struct rds_info_iterator *iter)
{ {
if (iter->addr != NULL) { if (iter->addr) {
kunmap_atomic(iter->addr, KM_USER0); kunmap_atomic(iter->addr, KM_USER0);
iter->addr = NULL; iter->addr = NULL;
} }
...@@ -117,7 +117,7 @@ void rds_info_copy(struct rds_info_iterator *iter, void *data, ...@@ -117,7 +117,7 @@ void rds_info_copy(struct rds_info_iterator *iter, void *data,
unsigned long this; unsigned long this;
while (bytes) { while (bytes) {
if (iter->addr == NULL) if (!iter->addr)
iter->addr = kmap_atomic(*iter->pages, KM_USER0); iter->addr = kmap_atomic(*iter->pages, KM_USER0);
this = min(bytes, PAGE_SIZE - iter->offset); this = min(bytes, PAGE_SIZE - iter->offset);
...@@ -188,7 +188,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval, ...@@ -188,7 +188,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
>> PAGE_SHIFT; >> PAGE_SHIFT;
pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL); pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL);
if (pages == NULL) { if (!pages) {
ret = -ENOMEM; ret = -ENOMEM;
goto out; goto out;
} }
...@@ -206,7 +206,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval, ...@@ -206,7 +206,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
call_func: call_func:
func = rds_info_funcs[optname - RDS_INFO_FIRST]; func = rds_info_funcs[optname - RDS_INFO_FIRST];
if (func == NULL) { if (!func) {
ret = -ENOPROTOOPT; ret = -ENOPROTOOPT;
goto out; goto out;
} }
...@@ -234,7 +234,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval, ...@@ -234,7 +234,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
ret = -EFAULT; ret = -EFAULT;
out: out:
for (i = 0; pages != NULL && i < nr_pages; i++) for (i = 0; pages && i < nr_pages; i++)
put_page(pages[i]); put_page(pages[i]);
kfree(pages); kfree(pages);
......
...@@ -264,7 +264,6 @@ struct rds_transport rds_iw_transport = { ...@@ -264,7 +264,6 @@ struct rds_transport rds_iw_transport = {
.laddr_check = rds_iw_laddr_check, .laddr_check = rds_iw_laddr_check,
.xmit_complete = rds_iw_xmit_complete, .xmit_complete = rds_iw_xmit_complete,
.xmit = rds_iw_xmit, .xmit = rds_iw_xmit,
.xmit_cong_map = NULL,
.xmit_rdma = rds_iw_xmit_rdma, .xmit_rdma = rds_iw_xmit_rdma,
.recv = rds_iw_recv, .recv = rds_iw_recv,
.conn_alloc = rds_iw_conn_alloc, .conn_alloc = rds_iw_conn_alloc,
...@@ -272,7 +271,6 @@ struct rds_transport rds_iw_transport = { ...@@ -272,7 +271,6 @@ struct rds_transport rds_iw_transport = {
.conn_connect = rds_iw_conn_connect, .conn_connect = rds_iw_conn_connect,
.conn_shutdown = rds_iw_conn_shutdown, .conn_shutdown = rds_iw_conn_shutdown,
.inc_copy_to_user = rds_iw_inc_copy_to_user, .inc_copy_to_user = rds_iw_inc_copy_to_user,
.inc_purge = rds_iw_inc_purge,
.inc_free = rds_iw_inc_free, .inc_free = rds_iw_inc_free,
.cm_initiate_connect = rds_iw_cm_initiate_connect, .cm_initiate_connect = rds_iw_cm_initiate_connect,
.cm_handle_connect = rds_iw_cm_handle_connect, .cm_handle_connect = rds_iw_cm_handle_connect,
...@@ -289,7 +287,7 @@ struct rds_transport rds_iw_transport = { ...@@ -289,7 +287,7 @@ struct rds_transport rds_iw_transport = {
.t_prefer_loopback = 1, .t_prefer_loopback = 1,
}; };
int __init rds_iw_init(void) int rds_iw_init(void)
{ {
int ret; int ret;
......
...@@ -70,7 +70,7 @@ struct rds_iw_send_work { ...@@ -70,7 +70,7 @@ struct rds_iw_send_work {
struct rds_message *s_rm; struct rds_message *s_rm;
/* We should really put these into a union: */ /* We should really put these into a union: */
struct rds_rdma_op *s_op; struct rm_rdma_op *s_op;
struct rds_iw_mapping *s_mapping; struct rds_iw_mapping *s_mapping;
struct ib_mr *s_mr; struct ib_mr *s_mr;
struct ib_fast_reg_page_list *s_page_list; struct ib_fast_reg_page_list *s_page_list;
...@@ -284,7 +284,7 @@ void rds_iw_conn_free(void *arg); ...@@ -284,7 +284,7 @@ void rds_iw_conn_free(void *arg);
int rds_iw_conn_connect(struct rds_connection *conn); int rds_iw_conn_connect(struct rds_connection *conn);
void rds_iw_conn_shutdown(struct rds_connection *conn); void rds_iw_conn_shutdown(struct rds_connection *conn);
void rds_iw_state_change(struct sock *sk); void rds_iw_state_change(struct sock *sk);
int __init rds_iw_listen_init(void); int rds_iw_listen_init(void);
void rds_iw_listen_stop(void); void rds_iw_listen_stop(void);
void __rds_iw_conn_error(struct rds_connection *conn, const char *, ...); void __rds_iw_conn_error(struct rds_connection *conn, const char *, ...);
int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id, int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id,
...@@ -321,12 +321,11 @@ void rds_iw_flush_mrs(void); ...@@ -321,12 +321,11 @@ void rds_iw_flush_mrs(void);
void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id); void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id);
/* ib_recv.c */ /* ib_recv.c */
int __init rds_iw_recv_init(void); int rds_iw_recv_init(void);
void rds_iw_recv_exit(void); void rds_iw_recv_exit(void);
int rds_iw_recv(struct rds_connection *conn); int rds_iw_recv(struct rds_connection *conn);
int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
gfp_t page_gfp, int prefill); gfp_t page_gfp, int prefill);
void rds_iw_inc_purge(struct rds_incoming *inc);
void rds_iw_inc_free(struct rds_incoming *inc); void rds_iw_inc_free(struct rds_incoming *inc);
int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
size_t size); size_t size);
...@@ -358,7 +357,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, ...@@ -358,7 +357,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context); void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context);
void rds_iw_send_init_ring(struct rds_iw_connection *ic); void rds_iw_send_init_ring(struct rds_iw_connection *ic);
void rds_iw_send_clear_ring(struct rds_iw_connection *ic); void rds_iw_send_clear_ring(struct rds_iw_connection *ic);
int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op); int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op);
void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits); void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits);
void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted); void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted);
int rds_iw_send_grab_credits(struct rds_iw_connection *ic, u32 wanted, int rds_iw_send_grab_credits(struct rds_iw_connection *ic, u32 wanted,
...@@ -371,7 +370,7 @@ unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter, ...@@ -371,7 +370,7 @@ unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter,
unsigned int avail); unsigned int avail);
/* ib_sysctl.c */ /* ib_sysctl.c */
int __init rds_iw_sysctl_init(void); int rds_iw_sysctl_init(void);
void rds_iw_sysctl_exit(void); void rds_iw_sysctl_exit(void);
extern unsigned long rds_iw_sysctl_max_send_wr; extern unsigned long rds_iw_sysctl_max_send_wr;
extern unsigned long rds_iw_sysctl_max_recv_wr; extern unsigned long rds_iw_sysctl_max_recv_wr;
......
...@@ -257,7 +257,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn) ...@@ -257,7 +257,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn)
* the rds_iwdev at all. * the rds_iwdev at all.
*/ */
rds_iwdev = ib_get_client_data(dev, &rds_iw_client); rds_iwdev = ib_get_client_data(dev, &rds_iw_client);
if (rds_iwdev == NULL) { if (!rds_iwdev) {
if (printk_ratelimit()) if (printk_ratelimit())
printk(KERN_NOTICE "RDS/IW: No client_data for device %s\n", printk(KERN_NOTICE "RDS/IW: No client_data for device %s\n",
dev->name); dev->name);
...@@ -292,7 +292,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn) ...@@ -292,7 +292,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn)
ic->i_send_ring.w_nr * ic->i_send_ring.w_nr *
sizeof(struct rds_header), sizeof(struct rds_header),
&ic->i_send_hdrs_dma, GFP_KERNEL); &ic->i_send_hdrs_dma, GFP_KERNEL);
if (ic->i_send_hdrs == NULL) { if (!ic->i_send_hdrs) {
ret = -ENOMEM; ret = -ENOMEM;
rdsdebug("ib_dma_alloc_coherent send failed\n"); rdsdebug("ib_dma_alloc_coherent send failed\n");
goto out; goto out;
...@@ -302,7 +302,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn) ...@@ -302,7 +302,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn)
ic->i_recv_ring.w_nr * ic->i_recv_ring.w_nr *
sizeof(struct rds_header), sizeof(struct rds_header),
&ic->i_recv_hdrs_dma, GFP_KERNEL); &ic->i_recv_hdrs_dma, GFP_KERNEL);
if (ic->i_recv_hdrs == NULL) { if (!ic->i_recv_hdrs) {
ret = -ENOMEM; ret = -ENOMEM;
rdsdebug("ib_dma_alloc_coherent recv failed\n"); rdsdebug("ib_dma_alloc_coherent recv failed\n");
goto out; goto out;
...@@ -310,14 +310,14 @@ static int rds_iw_setup_qp(struct rds_connection *conn) ...@@ -310,14 +310,14 @@ static int rds_iw_setup_qp(struct rds_connection *conn)
ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header), ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
&ic->i_ack_dma, GFP_KERNEL); &ic->i_ack_dma, GFP_KERNEL);
if (ic->i_ack == NULL) { if (!ic->i_ack) {
ret = -ENOMEM; ret = -ENOMEM;
rdsdebug("ib_dma_alloc_coherent ack failed\n"); rdsdebug("ib_dma_alloc_coherent ack failed\n");
goto out; goto out;
} }
ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_iw_send_work)); ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_iw_send_work));
if (ic->i_sends == NULL) { if (!ic->i_sends) {
ret = -ENOMEM; ret = -ENOMEM;
rdsdebug("send allocation failed\n"); rdsdebug("send allocation failed\n");
goto out; goto out;
...@@ -325,7 +325,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn) ...@@ -325,7 +325,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn)
rds_iw_send_init_ring(ic); rds_iw_send_init_ring(ic);
ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_iw_recv_work)); ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_iw_recv_work));
if (ic->i_recvs == NULL) { if (!ic->i_recvs) {
ret = -ENOMEM; ret = -ENOMEM;
rdsdebug("recv allocation failed\n"); rdsdebug("recv allocation failed\n");
goto out; goto out;
...@@ -696,7 +696,7 @@ int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp) ...@@ -696,7 +696,7 @@ int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp)
/* XXX too lazy? */ /* XXX too lazy? */
ic = kzalloc(sizeof(struct rds_iw_connection), GFP_KERNEL); ic = kzalloc(sizeof(struct rds_iw_connection), GFP_KERNEL);
if (ic == NULL) if (!ic)
return -ENOMEM; return -ENOMEM;
INIT_LIST_HEAD(&ic->iw_node); INIT_LIST_HEAD(&ic->iw_node);
......
...@@ -34,7 +34,6 @@ ...@@ -34,7 +34,6 @@
#include <linux/slab.h> #include <linux/slab.h>
#include "rds.h" #include "rds.h"
#include "rdma.h"
#include "iw.h" #include "iw.h"
......
...@@ -53,7 +53,7 @@ static void rds_iw_frag_drop_page(struct rds_page_frag *frag) ...@@ -53,7 +53,7 @@ static void rds_iw_frag_drop_page(struct rds_page_frag *frag)
static void rds_iw_frag_free(struct rds_page_frag *frag) static void rds_iw_frag_free(struct rds_page_frag *frag)
{ {
rdsdebug("frag %p page %p\n", frag, frag->f_page); rdsdebug("frag %p page %p\n", frag, frag->f_page);
BUG_ON(frag->f_page != NULL); BUG_ON(frag->f_page);
kmem_cache_free(rds_iw_frag_slab, frag); kmem_cache_free(rds_iw_frag_slab, frag);
} }
...@@ -143,14 +143,14 @@ static int rds_iw_recv_refill_one(struct rds_connection *conn, ...@@ -143,14 +143,14 @@ static int rds_iw_recv_refill_one(struct rds_connection *conn,
struct ib_sge *sge; struct ib_sge *sge;
int ret = -ENOMEM; int ret = -ENOMEM;
if (recv->r_iwinc == NULL) { if (!recv->r_iwinc) {
if (!atomic_add_unless(&rds_iw_allocation, 1, rds_iw_sysctl_max_recv_allocation)) { if (!atomic_add_unless(&rds_iw_allocation, 1, rds_iw_sysctl_max_recv_allocation)) {
rds_iw_stats_inc(s_iw_rx_alloc_limit); rds_iw_stats_inc(s_iw_rx_alloc_limit);
goto out; goto out;
} }
recv->r_iwinc = kmem_cache_alloc(rds_iw_incoming_slab, recv->r_iwinc = kmem_cache_alloc(rds_iw_incoming_slab,
kptr_gfp); kptr_gfp);
if (recv->r_iwinc == NULL) { if (!recv->r_iwinc) {
atomic_dec(&rds_iw_allocation); atomic_dec(&rds_iw_allocation);
goto out; goto out;
} }
...@@ -158,17 +158,17 @@ static int rds_iw_recv_refill_one(struct rds_connection *conn, ...@@ -158,17 +158,17 @@ static int rds_iw_recv_refill_one(struct rds_connection *conn,
rds_inc_init(&recv->r_iwinc->ii_inc, conn, conn->c_faddr); rds_inc_init(&recv->r_iwinc->ii_inc, conn, conn->c_faddr);
} }
if (recv->r_frag == NULL) { if (!recv->r_frag) {
recv->r_frag = kmem_cache_alloc(rds_iw_frag_slab, kptr_gfp); recv->r_frag = kmem_cache_alloc(rds_iw_frag_slab, kptr_gfp);
if (recv->r_frag == NULL) if (!recv->r_frag)
goto out; goto out;
INIT_LIST_HEAD(&recv->r_frag->f_item); INIT_LIST_HEAD(&recv->r_frag->f_item);
recv->r_frag->f_page = NULL; recv->r_frag->f_page = NULL;
} }
if (ic->i_frag.f_page == NULL) { if (!ic->i_frag.f_page) {
ic->i_frag.f_page = alloc_page(page_gfp); ic->i_frag.f_page = alloc_page(page_gfp);
if (ic->i_frag.f_page == NULL) if (!ic->i_frag.f_page)
goto out; goto out;
ic->i_frag.f_offset = 0; ic->i_frag.f_offset = 0;
} }
...@@ -273,7 +273,7 @@ int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, ...@@ -273,7 +273,7 @@ int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
return ret; return ret;
} }
void rds_iw_inc_purge(struct rds_incoming *inc) static void rds_iw_inc_purge(struct rds_incoming *inc)
{ {
struct rds_iw_incoming *iwinc; struct rds_iw_incoming *iwinc;
struct rds_page_frag *frag; struct rds_page_frag *frag;
...@@ -716,7 +716,7 @@ static void rds_iw_process_recv(struct rds_connection *conn, ...@@ -716,7 +716,7 @@ static void rds_iw_process_recv(struct rds_connection *conn,
* into the inc and save the inc so we can hang upcoming fragments * into the inc and save the inc so we can hang upcoming fragments
* off its list. * off its list.
*/ */
if (iwinc == NULL) { if (!iwinc) {
iwinc = recv->r_iwinc; iwinc = recv->r_iwinc;
recv->r_iwinc = NULL; recv->r_iwinc = NULL;
ic->i_iwinc = iwinc; ic->i_iwinc = iwinc;
...@@ -887,7 +887,7 @@ int rds_iw_recv(struct rds_connection *conn) ...@@ -887,7 +887,7 @@ int rds_iw_recv(struct rds_connection *conn)
return ret; return ret;
} }
int __init rds_iw_recv_init(void) int rds_iw_recv_init(void)
{ {
struct sysinfo si; struct sysinfo si;
int ret = -ENOMEM; int ret = -ENOMEM;
...@@ -899,13 +899,13 @@ int __init rds_iw_recv_init(void) ...@@ -899,13 +899,13 @@ int __init rds_iw_recv_init(void)
rds_iw_incoming_slab = kmem_cache_create("rds_iw_incoming", rds_iw_incoming_slab = kmem_cache_create("rds_iw_incoming",
sizeof(struct rds_iw_incoming), sizeof(struct rds_iw_incoming),
0, 0, NULL); 0, 0, NULL);
if (rds_iw_incoming_slab == NULL) if (!rds_iw_incoming_slab)
goto out; goto out;
rds_iw_frag_slab = kmem_cache_create("rds_iw_frag", rds_iw_frag_slab = kmem_cache_create("rds_iw_frag",
sizeof(struct rds_page_frag), sizeof(struct rds_page_frag),
0, 0, NULL); 0, 0, NULL);
if (rds_iw_frag_slab == NULL) if (!rds_iw_frag_slab)
kmem_cache_destroy(rds_iw_incoming_slab); kmem_cache_destroy(rds_iw_incoming_slab);
else else
ret = 0; ret = 0;
......
...@@ -36,7 +36,6 @@ ...@@ -36,7 +36,6 @@
#include <linux/dmapool.h> #include <linux/dmapool.h>
#include "rds.h" #include "rds.h"
#include "rdma.h"
#include "iw.h" #include "iw.h"
static void rds_iw_send_rdma_complete(struct rds_message *rm, static void rds_iw_send_rdma_complete(struct rds_message *rm,
...@@ -64,13 +63,13 @@ static void rds_iw_send_rdma_complete(struct rds_message *rm, ...@@ -64,13 +63,13 @@ static void rds_iw_send_rdma_complete(struct rds_message *rm,
} }
static void rds_iw_send_unmap_rdma(struct rds_iw_connection *ic, static void rds_iw_send_unmap_rdma(struct rds_iw_connection *ic,
struct rds_rdma_op *op) struct rm_rdma_op *op)
{ {
if (op->r_mapped) { if (op->op_mapped) {
ib_dma_unmap_sg(ic->i_cm_id->device, ib_dma_unmap_sg(ic->i_cm_id->device,
op->r_sg, op->r_nents, op->op_sg, op->op_nents,
op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
op->r_mapped = 0; op->op_mapped = 0;
} }
} }
...@@ -83,11 +82,11 @@ static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic, ...@@ -83,11 +82,11 @@ static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic,
rdsdebug("ic %p send %p rm %p\n", ic, send, rm); rdsdebug("ic %p send %p rm %p\n", ic, send, rm);
ib_dma_unmap_sg(ic->i_cm_id->device, ib_dma_unmap_sg(ic->i_cm_id->device,
rm->m_sg, rm->m_nents, rm->data.op_sg, rm->data.op_nents,
DMA_TO_DEVICE); DMA_TO_DEVICE);
if (rm->m_rdma_op != NULL) { if (rm->rdma.op_active) {
rds_iw_send_unmap_rdma(ic, rm->m_rdma_op); rds_iw_send_unmap_rdma(ic, &rm->rdma);
/* If the user asked for a completion notification on this /* If the user asked for a completion notification on this
* message, we can implement three different semantics: * message, we can implement three different semantics:
...@@ -111,10 +110,10 @@ static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic, ...@@ -111,10 +110,10 @@ static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic,
*/ */
rds_iw_send_rdma_complete(rm, wc_status); rds_iw_send_rdma_complete(rm, wc_status);
if (rm->m_rdma_op->r_write) if (rm->rdma.op_write)
rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes); rds_stats_add(s_send_rdma_bytes, rm->rdma.op_bytes);
else else
rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes); rds_stats_add(s_recv_rdma_bytes, rm->rdma.op_bytes);
} }
/* If anyone waited for this message to get flushed out, wake /* If anyone waited for this message to get flushed out, wake
...@@ -556,25 +555,27 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, ...@@ -556,25 +555,27 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
} }
/* map the message the first time we see it */ /* map the message the first time we see it */
if (ic->i_rm == NULL) { if (!ic->i_rm) {
/* /*
printk(KERN_NOTICE "rds_iw_xmit prep msg dport=%u flags=0x%x len=%d\n", printk(KERN_NOTICE "rds_iw_xmit prep msg dport=%u flags=0x%x len=%d\n",
be16_to_cpu(rm->m_inc.i_hdr.h_dport), be16_to_cpu(rm->m_inc.i_hdr.h_dport),
rm->m_inc.i_hdr.h_flags, rm->m_inc.i_hdr.h_flags,
be32_to_cpu(rm->m_inc.i_hdr.h_len)); be32_to_cpu(rm->m_inc.i_hdr.h_len));
*/ */
if (rm->m_nents) { if (rm->data.op_nents) {
rm->m_count = ib_dma_map_sg(dev, rm->data.op_count = ib_dma_map_sg(dev,
rm->m_sg, rm->m_nents, DMA_TO_DEVICE); rm->data.op_sg,
rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count); rm->data.op_nents,
if (rm->m_count == 0) { DMA_TO_DEVICE);
rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count);
if (rm->data.op_count == 0) {
rds_iw_stats_inc(s_iw_tx_sg_mapping_failure); rds_iw_stats_inc(s_iw_tx_sg_mapping_failure);
rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
ret = -ENOMEM; /* XXX ? */ ret = -ENOMEM; /* XXX ? */
goto out; goto out;
} }
} else { } else {
rm->m_count = 0; rm->data.op_count = 0;
} }
ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs; ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
...@@ -590,10 +591,10 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, ...@@ -590,10 +591,10 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
/* If it has a RDMA op, tell the peer we did it. This is /* If it has a RDMA op, tell the peer we did it. This is
* used by the peer to release use-once RDMA MRs. */ * used by the peer to release use-once RDMA MRs. */
if (rm->m_rdma_op) { if (rm->rdma.op_active) {
struct rds_ext_header_rdma ext_hdr; struct rds_ext_header_rdma ext_hdr;
ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key); ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey);
rds_message_add_extension(&rm->m_inc.i_hdr, rds_message_add_extension(&rm->m_inc.i_hdr,
RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr)); RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
} }
...@@ -621,7 +622,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, ...@@ -621,7 +622,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
send = &ic->i_sends[pos]; send = &ic->i_sends[pos];
first = send; first = send;
prev = NULL; prev = NULL;
scat = &rm->m_sg[sg]; scat = &rm->data.op_sg[sg];
sent = 0; sent = 0;
i = 0; i = 0;
...@@ -631,7 +632,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, ...@@ -631,7 +632,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
* or when requested by the user. Right now, we let * or when requested by the user. Right now, we let
* the application choose. * the application choose.
*/ */
if (rm->m_rdma_op && rm->m_rdma_op->r_fence) if (rm->rdma.op_active && rm->rdma.op_fence)
send_flags = IB_SEND_FENCE; send_flags = IB_SEND_FENCE;
/* /*
...@@ -650,7 +651,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, ...@@ -650,7 +651,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
} }
/* if there's data reference it with a chain of work reqs */ /* if there's data reference it with a chain of work reqs */
for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) { for (; i < work_alloc && scat != &rm->data.op_sg[rm->data.op_count]; i++) {
unsigned int len; unsigned int len;
send = &ic->i_sends[pos]; send = &ic->i_sends[pos];
...@@ -728,7 +729,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, ...@@ -728,7 +729,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
sent += sizeof(struct rds_header); sent += sizeof(struct rds_header);
/* if we finished the message then send completion owns it */ /* if we finished the message then send completion owns it */
if (scat == &rm->m_sg[rm->m_count]) { if (scat == &rm->data.op_sg[rm->data.op_count]) {
prev->s_rm = ic->i_rm; prev->s_rm = ic->i_rm;
prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
ic->i_rm = NULL; ic->i_rm = NULL;
...@@ -784,7 +785,7 @@ static void rds_iw_build_send_fastreg(struct rds_iw_device *rds_iwdev, struct rd ...@@ -784,7 +785,7 @@ static void rds_iw_build_send_fastreg(struct rds_iw_device *rds_iwdev, struct rd
ib_update_fast_reg_key(send->s_mr, send->s_remap_count++); ib_update_fast_reg_key(send->s_mr, send->s_remap_count++);
} }
int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
{ {
struct rds_iw_connection *ic = conn->c_transport_data; struct rds_iw_connection *ic = conn->c_transport_data;
struct rds_iw_send_work *send = NULL; struct rds_iw_send_work *send = NULL;
...@@ -794,7 +795,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) ...@@ -794,7 +795,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
struct rds_iw_device *rds_iwdev; struct rds_iw_device *rds_iwdev;
struct scatterlist *scat; struct scatterlist *scat;
unsigned long len; unsigned long len;
u64 remote_addr = op->r_remote_addr; u64 remote_addr = op->op_remote_addr;
u32 pos, fr_pos; u32 pos, fr_pos;
u32 work_alloc; u32 work_alloc;
u32 i; u32 i;
...@@ -806,21 +807,21 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) ...@@ -806,21 +807,21 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client); rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
/* map the message the first time we see it */ /* map the message the first time we see it */
if (!op->r_mapped) { if (!op->op_mapped) {
op->r_count = ib_dma_map_sg(ic->i_cm_id->device, op->op_count = ib_dma_map_sg(ic->i_cm_id->device,
op->r_sg, op->r_nents, (op->r_write) ? op->op_sg, op->op_nents, (op->op_write) ?
DMA_TO_DEVICE : DMA_FROM_DEVICE); DMA_TO_DEVICE : DMA_FROM_DEVICE);
rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count); rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count);
if (op->r_count == 0) { if (op->op_count == 0) {
rds_iw_stats_inc(s_iw_tx_sg_mapping_failure); rds_iw_stats_inc(s_iw_tx_sg_mapping_failure);
ret = -ENOMEM; /* XXX ? */ ret = -ENOMEM; /* XXX ? */
goto out; goto out;
} }
op->r_mapped = 1; op->op_mapped = 1;
} }
if (!op->r_write) { if (!op->op_write) {
/* Alloc space on the send queue for the fastreg */ /* Alloc space on the send queue for the fastreg */
work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, 1, &fr_pos); work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, 1, &fr_pos);
if (work_alloc != 1) { if (work_alloc != 1) {
...@@ -835,7 +836,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) ...@@ -835,7 +836,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
* Instead of knowing how to return a partial rdma read/write we insist that there * Instead of knowing how to return a partial rdma read/write we insist that there
* be enough work requests to send the entire message. * be enough work requests to send the entire message.
*/ */
i = ceil(op->r_count, rds_iwdev->max_sge); i = ceil(op->op_count, rds_iwdev->max_sge);
work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos); work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos);
if (work_alloc != i) { if (work_alloc != i) {
...@@ -846,17 +847,17 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) ...@@ -846,17 +847,17 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
} }
send = &ic->i_sends[pos]; send = &ic->i_sends[pos];
if (!op->r_write) { if (!op->op_write) {
first = prev = &ic->i_sends[fr_pos]; first = prev = &ic->i_sends[fr_pos];
} else { } else {
first = send; first = send;
prev = NULL; prev = NULL;
} }
scat = &op->r_sg[0]; scat = &op->op_sg[0];
sent = 0; sent = 0;
num_sge = op->r_count; num_sge = op->op_count;
for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) { for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) {
send->s_wr.send_flags = 0; send->s_wr.send_flags = 0;
send->s_queued = jiffies; send->s_queued = jiffies;
...@@ -873,13 +874,13 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) ...@@ -873,13 +874,13 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
* for local access after RDS is finished with it, using * for local access after RDS is finished with it, using
* IB_WR_RDMA_READ_WITH_INV will invalidate it after the read has completed. * IB_WR_RDMA_READ_WITH_INV will invalidate it after the read has completed.
*/ */
if (op->r_write) if (op->op_write)
send->s_wr.opcode = IB_WR_RDMA_WRITE; send->s_wr.opcode = IB_WR_RDMA_WRITE;
else else
send->s_wr.opcode = IB_WR_RDMA_READ_WITH_INV; send->s_wr.opcode = IB_WR_RDMA_READ_WITH_INV;
send->s_wr.wr.rdma.remote_addr = remote_addr; send->s_wr.wr.rdma.remote_addr = remote_addr;
send->s_wr.wr.rdma.rkey = op->r_key; send->s_wr.wr.rdma.rkey = op->op_rkey;
send->s_op = op; send->s_op = op;
if (num_sge > rds_iwdev->max_sge) { if (num_sge > rds_iwdev->max_sge) {
...@@ -893,7 +894,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) ...@@ -893,7 +894,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
if (prev) if (prev)
prev->s_wr.next = &send->s_wr; prev->s_wr.next = &send->s_wr;
for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) { for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) {
len = ib_sg_dma_len(ic->i_cm_id->device, scat); len = ib_sg_dma_len(ic->i_cm_id->device, scat);
if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV) if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV)
...@@ -927,7 +928,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) ...@@ -927,7 +928,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
} }
/* if we finished the message then send completion owns it */ /* if we finished the message then send completion owns it */
if (scat == &op->r_sg[op->r_count]) if (scat == &op->op_sg[op->op_count])
first->s_wr.send_flags = IB_SEND_SIGNALED; first->s_wr.send_flags = IB_SEND_SIGNALED;
if (i < work_alloc) { if (i < work_alloc) {
...@@ -941,9 +942,9 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) ...@@ -941,9 +942,9 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
* adapters do not allow using the lkey for this at all. To bypass this use a * adapters do not allow using the lkey for this at all. To bypass this use a
* fastreg_mr (or possibly a dma_mr) * fastreg_mr (or possibly a dma_mr)
*/ */
if (!op->r_write) { if (!op->op_write) {
rds_iw_build_send_fastreg(rds_iwdev, ic, &ic->i_sends[fr_pos], rds_iw_build_send_fastreg(rds_iwdev, ic, &ic->i_sends[fr_pos],
op->r_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr); op->op_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr);
work_alloc++; work_alloc++;
} }
......
...@@ -122,10 +122,10 @@ void rds_iw_sysctl_exit(void) ...@@ -122,10 +122,10 @@ void rds_iw_sysctl_exit(void)
unregister_sysctl_table(rds_iw_sysctl_hdr); unregister_sysctl_table(rds_iw_sysctl_hdr);
} }
int __init rds_iw_sysctl_init(void) int rds_iw_sysctl_init(void)
{ {
rds_iw_sysctl_hdr = register_sysctl_paths(rds_iw_sysctl_path, rds_iw_sysctl_table); rds_iw_sysctl_hdr = register_sysctl_paths(rds_iw_sysctl_path, rds_iw_sysctl_table);
if (rds_iw_sysctl_hdr == NULL) if (!rds_iw_sysctl_hdr)
return -ENOMEM; return -ENOMEM;
return 0; return 0;
} }
...@@ -61,10 +61,17 @@ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm, ...@@ -61,10 +61,17 @@ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm,
unsigned int hdr_off, unsigned int sg, unsigned int hdr_off, unsigned int sg,
unsigned int off) unsigned int off)
{ {
/* Do not send cong updates to loopback */
if (rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) {
rds_cong_map_updated(conn->c_fcong, ~(u64) 0);
return sizeof(struct rds_header) + RDS_CONG_MAP_BYTES;
}
BUG_ON(hdr_off || sg || off); BUG_ON(hdr_off || sg || off);
rds_inc_init(&rm->m_inc, conn, conn->c_laddr); rds_inc_init(&rm->m_inc, conn, conn->c_laddr);
rds_message_addref(rm); /* for the inc */ /* For the embedded inc. Matching put is in loop_inc_free() */
rds_message_addref(rm);
rds_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc, rds_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc,
GFP_KERNEL, KM_USER0); GFP_KERNEL, KM_USER0);
...@@ -77,16 +84,14 @@ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm, ...@@ -77,16 +84,14 @@ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm,
return sizeof(struct rds_header) + be32_to_cpu(rm->m_inc.i_hdr.h_len); return sizeof(struct rds_header) + be32_to_cpu(rm->m_inc.i_hdr.h_len);
} }
static int rds_loop_xmit_cong_map(struct rds_connection *conn, /*
struct rds_cong_map *map, * See rds_loop_xmit(). Since our inc is embedded in the rm, we
unsigned long offset) * make sure the rm lives at least until the inc is done.
*/
static void rds_loop_inc_free(struct rds_incoming *inc)
{ {
BUG_ON(offset); struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
BUG_ON(map != conn->c_lcong); rds_message_put(rm);
rds_cong_map_updated(conn->c_fcong, ~(u64) 0);
return sizeof(struct rds_header) + RDS_CONG_MAP_BYTES;
} }
/* we need to at least give the thread something to succeed */ /* we need to at least give the thread something to succeed */
...@@ -112,7 +117,7 @@ static int rds_loop_conn_alloc(struct rds_connection *conn, gfp_t gfp) ...@@ -112,7 +117,7 @@ static int rds_loop_conn_alloc(struct rds_connection *conn, gfp_t gfp)
unsigned long flags; unsigned long flags;
lc = kzalloc(sizeof(struct rds_loop_connection), GFP_KERNEL); lc = kzalloc(sizeof(struct rds_loop_connection), GFP_KERNEL);
if (lc == NULL) if (!lc)
return -ENOMEM; return -ENOMEM;
INIT_LIST_HEAD(&lc->loop_node); INIT_LIST_HEAD(&lc->loop_node);
...@@ -169,14 +174,12 @@ void rds_loop_exit(void) ...@@ -169,14 +174,12 @@ void rds_loop_exit(void)
*/ */
struct rds_transport rds_loop_transport = { struct rds_transport rds_loop_transport = {
.xmit = rds_loop_xmit, .xmit = rds_loop_xmit,
.xmit_cong_map = rds_loop_xmit_cong_map,
.recv = rds_loop_recv, .recv = rds_loop_recv,
.conn_alloc = rds_loop_conn_alloc, .conn_alloc = rds_loop_conn_alloc,
.conn_free = rds_loop_conn_free, .conn_free = rds_loop_conn_free,
.conn_connect = rds_loop_conn_connect, .conn_connect = rds_loop_conn_connect,
.conn_shutdown = rds_loop_conn_shutdown, .conn_shutdown = rds_loop_conn_shutdown,
.inc_copy_to_user = rds_message_inc_copy_to_user, .inc_copy_to_user = rds_message_inc_copy_to_user,
.inc_purge = rds_message_inc_purge, .inc_free = rds_loop_inc_free,
.inc_free = rds_message_inc_free,
.t_name = "loopback", .t_name = "loopback",
}; };
...@@ -34,9 +34,6 @@ ...@@ -34,9 +34,6 @@
#include <linux/slab.h> #include <linux/slab.h>
#include "rds.h" #include "rds.h"
#include "rdma.h"
static DECLARE_WAIT_QUEUE_HEAD(rds_message_flush_waitq);
static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = { static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = {
[RDS_EXTHDR_NONE] = 0, [RDS_EXTHDR_NONE] = 0,
...@@ -63,29 +60,31 @@ static void rds_message_purge(struct rds_message *rm) ...@@ -63,29 +60,31 @@ static void rds_message_purge(struct rds_message *rm)
if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags))) if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags)))
return; return;
for (i = 0; i < rm->m_nents; i++) { for (i = 0; i < rm->data.op_nents; i++) {
rdsdebug("putting data page %p\n", (void *)sg_page(&rm->m_sg[i])); rdsdebug("putting data page %p\n", (void *)sg_page(&rm->data.op_sg[i]));
/* XXX will have to put_page for page refs */ /* XXX will have to put_page for page refs */
__free_page(sg_page(&rm->m_sg[i])); __free_page(sg_page(&rm->data.op_sg[i]));
} }
rm->m_nents = 0; rm->data.op_nents = 0;
if (rm->m_rdma_op) if (rm->rdma.op_active)
rds_rdma_free_op(rm->m_rdma_op); rds_rdma_free_op(&rm->rdma);
if (rm->m_rdma_mr) if (rm->rdma.op_rdma_mr)
rds_mr_put(rm->m_rdma_mr); rds_mr_put(rm->rdma.op_rdma_mr);
}
void rds_message_inc_purge(struct rds_incoming *inc) if (rm->atomic.op_active)
{ rds_atomic_free_op(&rm->atomic);
struct rds_message *rm = container_of(inc, struct rds_message, m_inc); if (rm->atomic.op_rdma_mr)
rds_message_purge(rm); rds_mr_put(rm->atomic.op_rdma_mr);
} }
void rds_message_put(struct rds_message *rm) void rds_message_put(struct rds_message *rm)
{ {
rdsdebug("put rm %p ref %d\n", rm, atomic_read(&rm->m_refcount)); rdsdebug("put rm %p ref %d\n", rm, atomic_read(&rm->m_refcount));
if (atomic_read(&rm->m_refcount) == 0) {
printk(KERN_CRIT "danger refcount zero on %p\n", rm);
WARN_ON(1);
}
if (atomic_dec_and_test(&rm->m_refcount)) { if (atomic_dec_and_test(&rm->m_refcount)) {
BUG_ON(!list_empty(&rm->m_sock_item)); BUG_ON(!list_empty(&rm->m_sock_item));
BUG_ON(!list_empty(&rm->m_conn_item)); BUG_ON(!list_empty(&rm->m_conn_item));
...@@ -96,12 +95,6 @@ void rds_message_put(struct rds_message *rm) ...@@ -96,12 +95,6 @@ void rds_message_put(struct rds_message *rm)
} }
EXPORT_SYMBOL_GPL(rds_message_put); EXPORT_SYMBOL_GPL(rds_message_put);
void rds_message_inc_free(struct rds_incoming *inc)
{
struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
rds_message_put(rm);
}
void rds_message_populate_header(struct rds_header *hdr, __be16 sport, void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
__be16 dport, u64 seq) __be16 dport, u64 seq)
{ {
...@@ -214,41 +207,68 @@ int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 o ...@@ -214,41 +207,68 @@ int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 o
} }
EXPORT_SYMBOL_GPL(rds_message_add_rdma_dest_extension); EXPORT_SYMBOL_GPL(rds_message_add_rdma_dest_extension);
struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp) /*
* Each rds_message is allocated with extra space for the scatterlist entries
* rds ops will need. This is to minimize memory allocation count. Then, each rds op
* can grab SGs when initializing its part of the rds_message.
*/
struct rds_message *rds_message_alloc(unsigned int extra_len, gfp_t gfp)
{ {
struct rds_message *rm; struct rds_message *rm;
rm = kzalloc(sizeof(struct rds_message) + rm = kzalloc(sizeof(struct rds_message) + extra_len, gfp);
(nents * sizeof(struct scatterlist)), gfp);
if (!rm) if (!rm)
goto out; goto out;
if (nents) rm->m_used_sgs = 0;
sg_init_table(rm->m_sg, nents); rm->m_total_sgs = extra_len / sizeof(struct scatterlist);
atomic_set(&rm->m_refcount, 1); atomic_set(&rm->m_refcount, 1);
INIT_LIST_HEAD(&rm->m_sock_item); INIT_LIST_HEAD(&rm->m_sock_item);
INIT_LIST_HEAD(&rm->m_conn_item); INIT_LIST_HEAD(&rm->m_conn_item);
spin_lock_init(&rm->m_rs_lock); spin_lock_init(&rm->m_rs_lock);
init_waitqueue_head(&rm->m_flush_wait);
out: out:
return rm; return rm;
} }
/*
* RDS ops use this to grab SG entries from the rm's sg pool.
*/
struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents)
{
struct scatterlist *sg_first = (struct scatterlist *) &rm[1];
struct scatterlist *sg_ret;
WARN_ON(rm->m_used_sgs + nents > rm->m_total_sgs);
WARN_ON(!nents);
sg_ret = &sg_first[rm->m_used_sgs];
sg_init_table(sg_ret, nents);
rm->m_used_sgs += nents;
return sg_ret;
}
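
The two helpers above establish the pattern used throughout this series: the caller sizes a single allocation for every scatterlist entry the message will need, and each op then carves its entries out of that shared pool. A minimal sketch of a caller, assuming only the helpers and field names visible in this hunk (the example_* name and the way the SG counts are obtained are hypothetical):

/*
 * Hypothetical caller, not part of this commit: allocate one rm with
 * room for both the data SGs and the RDMA SGs, then hand each op its
 * slice of the pool via rds_message_alloc_sgs().
 */
static struct rds_message *example_message_alloc(int data_sgs, int rdma_sgs)
{
	unsigned int extra = (data_sgs + rdma_sgs) * sizeof(struct scatterlist);
	struct rds_message *rm;

	rm = rds_message_alloc(extra, GFP_KERNEL);
	if (!rm)
		return NULL;

	rm->data.op_sg = rds_message_alloc_sgs(rm, data_sgs);
	rm->rdma.op_sg = rds_message_alloc_sgs(rm, rdma_sgs);
	return rm;
}
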
struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len) struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len)
{ {
struct rds_message *rm; struct rds_message *rm;
unsigned int i; unsigned int i;
int num_sgs = ceil(total_len, PAGE_SIZE);
int extra_bytes = num_sgs * sizeof(struct scatterlist);
rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL); rm = rds_message_alloc(extra_bytes, GFP_NOWAIT);
if (rm == NULL) if (!rm)
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
set_bit(RDS_MSG_PAGEVEC, &rm->m_flags); set_bit(RDS_MSG_PAGEVEC, &rm->m_flags);
rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len); rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);
rm->m_nents = ceil(total_len, PAGE_SIZE); rm->data.op_nents = ceil(total_len, PAGE_SIZE);
rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs);
for (i = 0; i < rm->m_nents; ++i) { for (i = 0; i < rm->data.op_nents; ++i) {
sg_set_page(&rm->m_sg[i], sg_set_page(&rm->data.op_sg[i],
virt_to_page(page_addrs[i]), virt_to_page(page_addrs[i]),
PAGE_SIZE, 0); PAGE_SIZE, 0);
} }
...@@ -256,40 +276,33 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in ...@@ -256,40 +276,33 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in
return rm; return rm;
} }
struct rds_message *rds_message_copy_from_user(struct iovec *first_iov, int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov,
size_t total_len) size_t total_len)
{ {
unsigned long to_copy; unsigned long to_copy;
unsigned long iov_off; unsigned long iov_off;
unsigned long sg_off; unsigned long sg_off;
struct rds_message *rm;
struct iovec *iov; struct iovec *iov;
struct scatterlist *sg; struct scatterlist *sg;
int ret; int ret = 0;
rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL);
if (rm == NULL) {
ret = -ENOMEM;
goto out;
}
rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len); rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);
/* /*
* now allocate and copy in the data payload. * now allocate and copy in the data payload.
*/ */
sg = rm->m_sg; sg = rm->data.op_sg;
iov = first_iov; iov = first_iov;
iov_off = 0; iov_off = 0;
sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */ sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */
while (total_len) { while (total_len) {
if (sg_page(sg) == NULL) { if (!sg_page(sg)) {
ret = rds_page_remainder_alloc(sg, total_len, ret = rds_page_remainder_alloc(sg, total_len,
GFP_HIGHUSER); GFP_HIGHUSER);
if (ret) if (ret)
goto out; goto out;
rm->m_nents++; rm->data.op_nents++;
sg_off = 0; sg_off = 0;
} }
...@@ -320,14 +333,8 @@ struct rds_message *rds_message_copy_from_user(struct iovec *first_iov, ...@@ -320,14 +333,8 @@ struct rds_message *rds_message_copy_from_user(struct iovec *first_iov,
sg++; sg++;
} }
ret = 0;
out: out:
if (ret) { return ret;
if (rm)
rds_message_put(rm);
rm = ERR_PTR(ret);
}
return rm;
} }
int rds_message_inc_copy_to_user(struct rds_incoming *inc, int rds_message_inc_copy_to_user(struct rds_incoming *inc,
...@@ -348,7 +355,7 @@ int rds_message_inc_copy_to_user(struct rds_incoming *inc, ...@@ -348,7 +355,7 @@ int rds_message_inc_copy_to_user(struct rds_incoming *inc,
iov = first_iov; iov = first_iov;
iov_off = 0; iov_off = 0;
sg = rm->m_sg; sg = rm->data.op_sg;
vec_off = 0; vec_off = 0;
copied = 0; copied = 0;
...@@ -394,15 +401,14 @@ int rds_message_inc_copy_to_user(struct rds_incoming *inc, ...@@ -394,15 +401,14 @@ int rds_message_inc_copy_to_user(struct rds_incoming *inc,
*/ */
void rds_message_wait(struct rds_message *rm) void rds_message_wait(struct rds_message *rm)
{ {
wait_event(rds_message_flush_waitq, wait_event_interruptible(rm->m_flush_wait,
!test_bit(RDS_MSG_MAPPED, &rm->m_flags)); !test_bit(RDS_MSG_MAPPED, &rm->m_flags));
} }
void rds_message_unmapped(struct rds_message *rm) void rds_message_unmapped(struct rds_message *rm)
{ {
clear_bit(RDS_MSG_MAPPED, &rm->m_flags); clear_bit(RDS_MSG_MAPPED, &rm->m_flags);
if (waitqueue_active(&rds_message_flush_waitq)) wake_up_interruptible(&rm->m_flush_wait);
wake_up(&rds_message_flush_waitq);
} }
EXPORT_SYMBOL_GPL(rds_message_unmapped); EXPORT_SYMBOL_GPL(rds_message_unmapped);
...@@ -116,7 +116,7 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, ...@@ -116,7 +116,7 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
/* jump straight to allocation if we're trying for a huge page */ /* jump straight to allocation if we're trying for a huge page */
if (bytes >= PAGE_SIZE) { if (bytes >= PAGE_SIZE) {
page = alloc_page(gfp); page = alloc_page(gfp);
if (page == NULL) { if (!page) {
ret = -ENOMEM; ret = -ENOMEM;
} else { } else {
sg_set_page(scat, page, PAGE_SIZE, 0); sg_set_page(scat, page, PAGE_SIZE, 0);
...@@ -162,7 +162,7 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, ...@@ -162,7 +162,7 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
rem = &per_cpu(rds_page_remainders, get_cpu()); rem = &per_cpu(rds_page_remainders, get_cpu());
local_irq_save(flags); local_irq_save(flags);
if (page == NULL) { if (!page) {
ret = -ENOMEM; ret = -ENOMEM;
break; break;
} }
...@@ -186,6 +186,7 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, ...@@ -186,6 +186,7 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
ret ? 0 : scat->length); ret ? 0 : scat->length);
return ret; return ret;
} }
EXPORT_SYMBOL_GPL(rds_page_remainder_alloc);
static int rds_page_remainder_cpu_notify(struct notifier_block *self, static int rds_page_remainder_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu) unsigned long action, void *hcpu)
......
...@@ -35,7 +35,7 @@ ...@@ -35,7 +35,7 @@
#include <linux/rbtree.h> #include <linux/rbtree.h>
#include <linux/dma-mapping.h> /* for DMA_*_DEVICE */ #include <linux/dma-mapping.h> /* for DMA_*_DEVICE */
#include "rdma.h" #include "rds.h"
/* /*
* XXX * XXX
...@@ -130,14 +130,22 @@ void rds_rdma_drop_keys(struct rds_sock *rs) ...@@ -130,14 +130,22 @@ void rds_rdma_drop_keys(struct rds_sock *rs)
{ {
struct rds_mr *mr; struct rds_mr *mr;
struct rb_node *node; struct rb_node *node;
unsigned long flags;
/* Release any MRs associated with this socket */ /* Release any MRs associated with this socket */
spin_lock_irqsave(&rs->rs_rdma_lock, flags);
while ((node = rb_first(&rs->rs_rdma_keys))) { while ((node = rb_first(&rs->rs_rdma_keys))) {
mr = container_of(node, struct rds_mr, r_rb_node); mr = container_of(node, struct rds_mr, r_rb_node);
if (mr->r_trans == rs->rs_transport) if (mr->r_trans == rs->rs_transport)
mr->r_invalidate = 0; mr->r_invalidate = 0;
rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
RB_CLEAR_NODE(&mr->r_rb_node);
spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
rds_destroy_mr(mr);
rds_mr_put(mr); rds_mr_put(mr);
spin_lock_irqsave(&rs->rs_rdma_lock, flags);
} }
spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
if (rs->rs_transport && rs->rs_transport->flush_mrs) if (rs->rs_transport && rs->rs_transport->flush_mrs)
rs->rs_transport->flush_mrs(); rs->rs_transport->flush_mrs();
...@@ -181,7 +189,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, ...@@ -181,7 +189,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
goto out; goto out;
} }
if (rs->rs_transport->get_mr == NULL) { if (!rs->rs_transport->get_mr) {
ret = -EOPNOTSUPP; ret = -EOPNOTSUPP;
goto out; goto out;
} }
...@@ -197,13 +205,13 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, ...@@ -197,13 +205,13 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
/* XXX clamp nr_pages to limit the size of this alloc? */ /* XXX clamp nr_pages to limit the size of this alloc? */
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
if (pages == NULL) { if (!pages) {
ret = -ENOMEM; ret = -ENOMEM;
goto out; goto out;
} }
mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL); mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL);
if (mr == NULL) { if (!mr) {
ret = -ENOMEM; ret = -ENOMEM;
goto out; goto out;
} }
...@@ -230,13 +238,13 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, ...@@ -230,13 +238,13 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
* r/o or r/w. We need to assume r/w, or we'll do a lot of RDMA to * r/o or r/w. We need to assume r/w, or we'll do a lot of RDMA to
* the zero page. * the zero page.
*/ */
ret = rds_pin_pages(args->vec.addr & PAGE_MASK, nr_pages, pages, 1); ret = rds_pin_pages(args->vec.addr, nr_pages, pages, 1);
if (ret < 0) if (ret < 0)
goto out; goto out;
nents = ret; nents = ret;
sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL); sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL);
if (sg == NULL) { if (!sg) {
ret = -ENOMEM; ret = -ENOMEM;
goto out; goto out;
} }
...@@ -406,68 +414,127 @@ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force) ...@@ -406,68 +414,127 @@ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force)
spin_lock_irqsave(&rs->rs_rdma_lock, flags); spin_lock_irqsave(&rs->rs_rdma_lock, flags);
mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
if (mr && (mr->r_use_once || force)) { if (!mr) {
printk(KERN_ERR "rds: trying to unuse MR with unknown r_key %u!\n", r_key);
spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
return;
}
if (mr->r_use_once || force) {
rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
RB_CLEAR_NODE(&mr->r_rb_node); RB_CLEAR_NODE(&mr->r_rb_node);
zot_me = 1; zot_me = 1;
} else if (mr) }
atomic_inc(&mr->r_refcount);
spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
/* May have to issue a dma_sync on this memory region. /* May have to issue a dma_sync on this memory region.
* Note we could avoid this if the operation was a RDMA READ, * Note we could avoid this if the operation was a RDMA READ,
* but at this point we can't tell. */ * but at this point we can't tell. */
if (mr != NULL) { if (mr->r_trans->sync_mr)
if (mr->r_trans->sync_mr) mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE);
mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE);
/* If the MR was marked as invalidate, this will
/* If the MR was marked as invalidate, this will * trigger an async flush. */
* trigger an async flush. */ if (zot_me)
if (zot_me) rds_destroy_mr(mr);
rds_destroy_mr(mr); rds_mr_put(mr);
rds_mr_put(mr);
}
} }
void rds_rdma_free_op(struct rds_rdma_op *ro) void rds_rdma_free_op(struct rm_rdma_op *ro)
{ {
unsigned int i; unsigned int i;
for (i = 0; i < ro->r_nents; i++) { for (i = 0; i < ro->op_nents; i++) {
struct page *page = sg_page(&ro->r_sg[i]); struct page *page = sg_page(&ro->op_sg[i]);
/* Mark page dirty if it was possibly modified, which /* Mark page dirty if it was possibly modified, which
* is the case for a RDMA_READ which copies from remote * is the case for a RDMA_READ which copies from remote
* to local memory */ * to local memory */
if (!ro->r_write) { if (!ro->op_write) {
BUG_ON(in_interrupt()); BUG_ON(irqs_disabled());
set_page_dirty(page); set_page_dirty(page);
} }
put_page(page); put_page(page);
} }
kfree(ro->r_notifier); kfree(ro->op_notifier);
kfree(ro); ro->op_notifier = NULL;
ro->op_active = 0;
}
void rds_atomic_free_op(struct rm_atomic_op *ao)
{
struct page *page = sg_page(ao->op_sg);
/* Mark page dirty if it was possibly modified, which
* is the case for a RDMA_READ which copies from remote
* to local memory */
set_page_dirty(page);
put_page(page);
kfree(ao->op_notifier);
ao->op_notifier = NULL;
ao->op_active = 0;
}
/*
* Count the number of pages needed to describe an incoming iovec.
*/
static int rds_rdma_pages(struct rds_rdma_args *args)
{
struct rds_iovec vec;
struct rds_iovec __user *local_vec;
unsigned int tot_pages = 0;
unsigned int nr_pages;
unsigned int i;
local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
/* figure out the number of pages in the vector */
for (i = 0; i < args->nr_local; i++) {
if (copy_from_user(&vec, &local_vec[i],
sizeof(struct rds_iovec)))
return -EFAULT;
nr_pages = rds_pages_in_vec(&vec);
if (nr_pages == 0)
return -EINVAL;
tot_pages += nr_pages;
}
return tot_pages;
}
int rds_rdma_extra_size(struct rds_rdma_args *args)
{
return rds_rdma_pages(args) * sizeof(struct scatterlist);
} }
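
rds_rdma_extra_size() feeds the SG-pool sizing above: it turns the user's iovec list into a page count and hence into scatterlist bytes. rds_pages_in_vec() itself is not shown in this hunk; the sketch below states the page arithmetic assumed here, with illustrative numbers (the example_* name is hypothetical):

/*
 * Assumed behaviour of rds_pages_in_vec() (not visible in this hunk):
 * count the pages spanned by [addr, addr + bytes).  For example, a
 * 100-byte iovec starting 4000 bytes into a 4096-byte page spans two
 * pages, so rds_rdma_extra_size() would reserve
 * 2 * sizeof(struct scatterlist) for that single-iovec request.
 */
static unsigned int example_pages_in_vec(u64 addr, u64 bytes)
{
	return ((addr + bytes + PAGE_SIZE - 1) >> PAGE_SHIFT) -
	       (addr >> PAGE_SHIFT);
}
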
/* /*
* args is a pointer to an in-kernel copy in the sendmsg cmsg. * The application asks for a RDMA transfer.
* Extract all arguments and set up the rdma_op
*/ */
static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
struct rds_rdma_args *args) struct cmsghdr *cmsg)
{ {
struct rds_rdma_args *args;
struct rds_iovec vec; struct rds_iovec vec;
struct rds_rdma_op *op = NULL; struct rm_rdma_op *op = &rm->rdma;
unsigned int nr_pages; unsigned int nr_pages;
unsigned int max_pages;
unsigned int nr_bytes; unsigned int nr_bytes;
struct page **pages = NULL; struct page **pages = NULL;
struct rds_iovec __user *local_vec; struct rds_iovec __user *local_vec;
struct scatterlist *sg;
unsigned int nr; unsigned int nr;
unsigned int i, j; unsigned int i, j;
int ret; int ret = 0;
if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args))
|| rm->rdma.op_active)
return -EINVAL;
args = CMSG_DATA(cmsg);
if (rs->rs_bound_addr == 0) { if (rs->rs_bound_addr == 0) {
ret = -ENOTCONN; /* XXX not a great errno */ ret = -ENOTCONN; /* XXX not a great errno */
...@@ -479,61 +546,38 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, ...@@ -479,61 +546,38 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
goto out; goto out;
} }
nr_pages = 0; nr_pages = rds_rdma_pages(args);
max_pages = 0; if (nr_pages < 0)
local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
/* figure out the number of pages in the vector */
for (i = 0; i < args->nr_local; i++) {
if (copy_from_user(&vec, &local_vec[i],
sizeof(struct rds_iovec))) {
ret = -EFAULT;
goto out;
}
nr = rds_pages_in_vec(&vec);
if (nr == 0) {
ret = -EINVAL;
goto out;
}
max_pages = max(nr, max_pages);
nr_pages += nr;
}
pages = kcalloc(max_pages, sizeof(struct page *), GFP_KERNEL);
if (pages == NULL) {
ret = -ENOMEM;
goto out; goto out;
}
op = kzalloc(offsetof(struct rds_rdma_op, r_sg[nr_pages]), GFP_KERNEL); pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
if (op == NULL) { if (!pages) {
ret = -ENOMEM; ret = -ENOMEM;
goto out; goto out;
} }
op->r_write = !!(args->flags & RDS_RDMA_READWRITE); op->op_write = !!(args->flags & RDS_RDMA_READWRITE);
op->r_fence = !!(args->flags & RDS_RDMA_FENCE); op->op_fence = !!(args->flags & RDS_RDMA_FENCE);
op->r_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); op->op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
op->r_recverr = rs->rs_recverr; op->op_silent = !!(args->flags & RDS_RDMA_SILENT);
op->op_active = 1;
op->op_recverr = rs->rs_recverr;
WARN_ON(!nr_pages); WARN_ON(!nr_pages);
sg_init_table(op->r_sg, nr_pages); op->op_sg = rds_message_alloc_sgs(rm, nr_pages);
if (op->r_notify || op->r_recverr) { if (op->op_notify || op->op_recverr) {
/* We allocate an uninitialized notifier here, because /* We allocate an uninitialized notifier here, because
* we don't want to do that in the completion handler. We * we don't want to do that in the completion handler. We
* would have to use GFP_ATOMIC there, and don't want to deal * would have to use GFP_ATOMIC there, and don't want to deal
* with failed allocations. * with failed allocations.
*/ */
op->r_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL); op->op_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL);
if (!op->r_notifier) { if (!op->op_notifier) {
ret = -ENOMEM; ret = -ENOMEM;
goto out; goto out;
} }
op->r_notifier->n_user_token = args->user_token; op->op_notifier->n_user_token = args->user_token;
op->r_notifier->n_status = RDS_RDMA_SUCCESS; op->op_notifier->n_status = RDS_RDMA_SUCCESS;
} }
/* The cookie contains the R_Key of the remote memory region, and /* The cookie contains the R_Key of the remote memory region, and
...@@ -543,15 +587,17 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, ...@@ -543,15 +587,17 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
* destination address (which is really an offset into the MR) * destination address (which is really an offset into the MR)
* FIXME: We may want to move this into ib_rdma.c * FIXME: We may want to move this into ib_rdma.c
*/ */
op->r_key = rds_rdma_cookie_key(args->cookie); op->op_rkey = rds_rdma_cookie_key(args->cookie);
op->r_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie); op->op_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie);
nr_bytes = 0; nr_bytes = 0;
rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x\n", rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x\n",
(unsigned long long)args->nr_local, (unsigned long long)args->nr_local,
(unsigned long long)args->remote_vec.addr, (unsigned long long)args->remote_vec.addr,
op->r_key); op->op_rkey);
local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
for (i = 0; i < args->nr_local; i++) { for (i = 0; i < args->nr_local; i++) {
if (copy_from_user(&vec, &local_vec[i], if (copy_from_user(&vec, &local_vec[i],
...@@ -569,15 +615,10 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, ...@@ -569,15 +615,10 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
rs->rs_user_addr = vec.addr; rs->rs_user_addr = vec.addr;
rs->rs_user_bytes = vec.bytes; rs->rs_user_bytes = vec.bytes;
/* did the user change the vec under us? */
if (nr > max_pages || op->r_nents + nr > nr_pages) {
ret = -EINVAL;
goto out;
}
/* If it's a WRITE operation, we want to pin the pages for reading. /* If it's a WRITE operation, we want to pin the pages for reading.
* If it's a READ operation, we need to pin the pages for writing. * If it's a READ operation, we need to pin the pages for writing.
*/ */
ret = rds_pin_pages(vec.addr & PAGE_MASK, nr, pages, !op->r_write); ret = rds_pin_pages(vec.addr, nr, pages, !op->op_write);
if (ret < 0) if (ret < 0)
goto out; goto out;
...@@ -588,8 +629,9 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, ...@@ -588,8 +629,9 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
for (j = 0; j < nr; j++) { for (j = 0; j < nr; j++) {
unsigned int offset = vec.addr & ~PAGE_MASK; unsigned int offset = vec.addr & ~PAGE_MASK;
struct scatterlist *sg;
sg = &op->r_sg[op->r_nents + j]; sg = &op->op_sg[op->op_nents + j];
sg_set_page(sg, pages[j], sg_set_page(sg, pages[j],
min_t(unsigned int, vec.bytes, PAGE_SIZE - offset), min_t(unsigned int, vec.bytes, PAGE_SIZE - offset),
offset); offset);
...@@ -601,10 +643,9 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, ...@@ -601,10 +643,9 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
vec.bytes -= sg->length; vec.bytes -= sg->length;
} }
op->r_nents += nr; op->op_nents += nr;
} }
if (nr_bytes > args->remote_vec.bytes) { if (nr_bytes > args->remote_vec.bytes) {
rdsdebug("RDS nr_bytes %u remote_bytes %u do not match\n", rdsdebug("RDS nr_bytes %u remote_bytes %u do not match\n",
nr_bytes, nr_bytes,
...@@ -612,38 +653,17 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, ...@@ -612,38 +653,17 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
ret = -EINVAL; ret = -EINVAL;
goto out; goto out;
} }
op->r_bytes = nr_bytes; op->op_bytes = nr_bytes;
ret = 0; ret = 0;
out: out:
kfree(pages); kfree(pages);
if (ret) { if (ret)
if (op) rds_rdma_free_op(op);
rds_rdma_free_op(op);
op = ERR_PTR(ret);
}
return op;
}
/*
* The application asks for a RDMA transfer.
* Extract all arguments and set up the rdma_op
*/
int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
struct cmsghdr *cmsg)
{
struct rds_rdma_op *op;
if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) ||
rm->m_rdma_op != NULL)
return -EINVAL;
op = rds_rdma_prepare(rs, CMSG_DATA(cmsg));
if (IS_ERR(op))
return PTR_ERR(op);
rds_stats_inc(s_send_rdma); rds_stats_inc(s_send_rdma);
rm->m_rdma_op = op;
return 0; return ret;
} }
/* /*
...@@ -673,7 +693,7 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, ...@@ -673,7 +693,7 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
spin_lock_irqsave(&rs->rs_rdma_lock, flags); spin_lock_irqsave(&rs->rs_rdma_lock, flags);
mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
if (mr == NULL) if (!mr)
err = -EINVAL; /* invalid r_key */ err = -EINVAL; /* invalid r_key */
else else
atomic_inc(&mr->r_refcount); atomic_inc(&mr->r_refcount);
...@@ -681,7 +701,7 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, ...@@ -681,7 +701,7 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
if (mr) { if (mr) {
mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE); mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE);
rm->m_rdma_mr = mr; rm->rdma.op_rdma_mr = mr;
} }
return err; return err;
} }
...@@ -699,5 +719,98 @@ int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, ...@@ -699,5 +719,98 @@ int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
rm->m_rdma_cookie != 0) rm->m_rdma_cookie != 0)
return -EINVAL; return -EINVAL;
return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->m_rdma_mr); return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->rdma.op_rdma_mr);
}
/*
* Fill in rds_message for an atomic request.
*/
int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
struct cmsghdr *cmsg)
{
struct page *page = NULL;
struct rds_atomic_args *args;
int ret = 0;
if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_atomic_args))
|| rm->atomic.op_active)
return -EINVAL;
args = CMSG_DATA(cmsg);
/* Nonmasked & masked cmsg ops converted to masked hw ops */
switch (cmsg->cmsg_type) {
case RDS_CMSG_ATOMIC_FADD:
rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD;
rm->atomic.op_m_fadd.add = args->fadd.add;
rm->atomic.op_m_fadd.nocarry_mask = 0;
break;
case RDS_CMSG_MASKED_ATOMIC_FADD:
rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD;
rm->atomic.op_m_fadd.add = args->m_fadd.add;
rm->atomic.op_m_fadd.nocarry_mask = args->m_fadd.nocarry_mask;
break;
case RDS_CMSG_ATOMIC_CSWP:
rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP;
rm->atomic.op_m_cswp.compare = args->cswp.compare;
rm->atomic.op_m_cswp.swap = args->cswp.swap;
rm->atomic.op_m_cswp.compare_mask = ~0;
rm->atomic.op_m_cswp.swap_mask = ~0;
break;
case RDS_CMSG_MASKED_ATOMIC_CSWP:
rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP;
rm->atomic.op_m_cswp.compare = args->m_cswp.compare;
rm->atomic.op_m_cswp.swap = args->m_cswp.swap;
rm->atomic.op_m_cswp.compare_mask = args->m_cswp.compare_mask;
rm->atomic.op_m_cswp.swap_mask = args->m_cswp.swap_mask;
break;
default:
BUG(); /* should never happen */
}
rm->atomic.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
rm->atomic.op_silent = !!(args->flags & RDS_RDMA_SILENT);
rm->atomic.op_active = 1;
rm->atomic.op_recverr = rs->rs_recverr;
rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1);
/* verify 8 byte-aligned */
if (args->local_addr & 0x7) {
ret = -EFAULT;
goto err;
}
ret = rds_pin_pages(args->local_addr, 1, &page, 1);
if (ret != 1)
goto err;
ret = 0;
sg_set_page(rm->atomic.op_sg, page, 8, offset_in_page(args->local_addr));
if (rm->atomic.op_notify || rm->atomic.op_recverr) {
/* We allocate an uninitialized notifier here, because
* we don't want to do that in the completion handler. We
* would have to use GFP_ATOMIC there, and don't want to deal
* with failed allocations.
*/
rm->atomic.op_notifier = kmalloc(sizeof(*rm->atomic.op_notifier), GFP_KERNEL);
if (!rm->atomic.op_notifier) {
ret = -ENOMEM;
goto err;
}
rm->atomic.op_notifier->n_user_token = args->user_token;
rm->atomic.op_notifier->n_status = RDS_RDMA_SUCCESS;
}
rm->atomic.op_rkey = rds_rdma_cookie_key(args->cookie);
rm->atomic.op_remote_addr = args->remote_addr + rds_rdma_cookie_offset(args->cookie);
return ret;
err:
if (page)
put_page(page);
kfree(rm->atomic.op_notifier);
return ret;
} }
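
For context, the new control message is driven from user space through the regular sendmsg() path. A hedged sketch of a fetch-and-add request follows; everything in it is an assumption layered on top of this hunk: that struct rds_atomic_args (with the cookie, local_addr, remote_addr, fadd, flags and user_token fields referenced above) is exported through linux/rds.h, that SOL_RDS from <linux/socket.h> is the cmsg level the socket layer checks, and that a zero-length data payload is acceptable for an atomic-only message. The example_rds_fadd() name is hypothetical.

/*
 * User-space sketch only; not part of this commit.  Atomically adds 1
 * to the remote 8-byte word described by (cookie, remote_addr); the
 * previous value lands in *local_result once the completion arrives.
 */
#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/rds.h>

static int example_rds_fadd(int fd, struct sockaddr_in *peer,
			    rds_rdma_cookie_t cookie, uint64_t remote_addr,
			    uint64_t *local_result)
{
	struct rds_atomic_args args;
	char cbuf[CMSG_SPACE(sizeof(args))];
	struct msghdr msg;
	struct cmsghdr *cmsg;

	memset(&args, 0, sizeof(args));
	args.cookie      = cookie;
	args.local_addr  = (uint64_t)(unsigned long)local_result;
	args.remote_addr = remote_addr;
	args.fadd.add    = 1;
	args.flags       = RDS_RDMA_NOTIFY_ME;
	args.user_token  = 1;

	memset(&msg, 0, sizeof(msg));
	msg.msg_name       = peer;
	msg.msg_namelen    = sizeof(*peer);
	msg.msg_control    = cbuf;
	msg.msg_controllen = sizeof(cbuf);

	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_RDS;	/* assumed from <linux/socket.h> */
	cmsg->cmsg_type  = RDS_CMSG_ATOMIC_FADD;
	cmsg->cmsg_len   = CMSG_LEN(sizeof(args));
	memcpy(CMSG_DATA(cmsg), &args, sizeof(args));

	return sendmsg(fd, &msg, 0);
}
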
#ifndef _RDS_RDMA_H
#define _RDS_RDMA_H
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/scatterlist.h>
#include "rds.h"
struct rds_mr {
struct rb_node r_rb_node;
atomic_t r_refcount;
u32 r_key;
/* A copy of the creation flags */
unsigned int r_use_once:1;
unsigned int r_invalidate:1;
unsigned int r_write:1;
/* This is for RDS_MR_DEAD.
* It would be nice & consistent to make this part of the above
* bit field here, but we need to use test_and_set_bit.
*/
unsigned long r_state;
struct rds_sock *r_sock; /* back pointer to the socket that owns us */
struct rds_transport *r_trans;
void *r_trans_private;
};
/* Flags for mr->r_state */
#define RDS_MR_DEAD 0
struct rds_rdma_op {
u32 r_key;
u64 r_remote_addr;
unsigned int r_write:1;
unsigned int r_fence:1;
unsigned int r_notify:1;
unsigned int r_recverr:1;
unsigned int r_mapped:1;
struct rds_notifier *r_notifier;
unsigned int r_bytes;
unsigned int r_nents;
unsigned int r_count;
struct scatterlist r_sg[0];
};
static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset)
{
return r_key | (((u64) offset) << 32);
}
static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie)
{
return cookie;
}
static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
{
return cookie >> 32;
}
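
The three cookie helpers above encode an (r_key, offset) pair in a single 64-bit value: the R_Key occupies the low 32 bits and the byte offset into the MR the high 32 bits. A worked round trip with illustrative values:

/* Illustrative round trip of the packing above (values are examples): */
static void example_cookie_roundtrip(void)
{
	rds_rdma_cookie_t c = rds_rdma_make_cookie(0x1234, 0x10);

	/* c == 0x0000001000001234ULL */
	BUG_ON(rds_rdma_cookie_key(c) != 0x1234);	/* low 32 bits  */
	BUG_ON(rds_rdma_cookie_offset(c) != 0x10);	/* high 32 bits */
}
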
int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen);
int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen);
int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen);
void rds_rdma_drop_keys(struct rds_sock *rs);
int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
struct cmsghdr *cmsg);
int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
struct cmsghdr *cmsg);
int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
struct cmsghdr *cmsg);
int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
struct cmsghdr *cmsg);
void rds_rdma_free_op(struct rds_rdma_op *ro);
void rds_rdma_send_complete(struct rds_message *rm, int);
extern void __rds_put_mr_final(struct rds_mr *mr);
static inline void rds_mr_put(struct rds_mr *mr)
{
if (atomic_dec_and_test(&mr->r_refcount))
__rds_put_mr_final(mr);
}
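rds_mr_put() is the usual drop-a-reference helper: atomic_dec_and_test() returns true only for the caller that takes r_refcount to zero, and that caller hands the MR to __rds_put_mr_final() for teardown. A minimal userspace sketch of the same last-put-frees pattern, with illustrative names only:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct ref_obj {
	atomic_int refcount;
};

static void obj_final_free(struct ref_obj *obj)
{
	printf("last reference dropped, freeing\n");
	free(obj);
}

static void obj_put(struct ref_obj *obj)
{
	/* fetch_sub returns the old value: old == 1 means we hit zero */
	if (atomic_fetch_sub(&obj->refcount, 1) == 1)
		obj_final_free(obj);
}

int main(void)
{
	struct ref_obj *obj = malloc(sizeof(*obj));

	atomic_init(&obj->refcount, 2);
	obj_put(obj);	/* 2 -> 1: nothing happens */
	obj_put(obj);	/* 1 -> 0: freed */
	return 0;
}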
#endif
...@@ -36,6 +36,34 @@ ...@@ -36,6 +36,34 @@
static struct rdma_cm_id *rds_rdma_listen_id; static struct rdma_cm_id *rds_rdma_listen_id;
static char *rds_cm_event_strings[] = {
#define RDS_CM_EVENT_STRING(foo) \
[RDMA_CM_EVENT_##foo] = __stringify(RDMA_CM_EVENT_##foo)
RDS_CM_EVENT_STRING(ADDR_RESOLVED),
RDS_CM_EVENT_STRING(ADDR_ERROR),
RDS_CM_EVENT_STRING(ROUTE_RESOLVED),
RDS_CM_EVENT_STRING(ROUTE_ERROR),
RDS_CM_EVENT_STRING(CONNECT_REQUEST),
RDS_CM_EVENT_STRING(CONNECT_RESPONSE),
RDS_CM_EVENT_STRING(CONNECT_ERROR),
RDS_CM_EVENT_STRING(UNREACHABLE),
RDS_CM_EVENT_STRING(REJECTED),
RDS_CM_EVENT_STRING(ESTABLISHED),
RDS_CM_EVENT_STRING(DISCONNECTED),
RDS_CM_EVENT_STRING(DEVICE_REMOVAL),
RDS_CM_EVENT_STRING(MULTICAST_JOIN),
RDS_CM_EVENT_STRING(MULTICAST_ERROR),
RDS_CM_EVENT_STRING(ADDR_CHANGE),
RDS_CM_EVENT_STRING(TIMEWAIT_EXIT),
#undef RDS_CM_EVENT_STRING
};
static char *rds_cm_event_str(enum rdma_cm_event_type type)
{
return rds_str_array(rds_cm_event_strings,
ARRAY_SIZE(rds_cm_event_strings), type);
};
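The table above combines __stringify() with designated initializers so each rdma_cm event value indexes its own name, and rds_cm_event_str() maps an event number back to a string for the debug and error messages that follow. rds_str_array() itself is not in this hunk, so the bounds check and "unknown" fallback in the standalone sketch below are assumptions about its behaviour; unlisted enum values simply stay NULL in the table.

#include <stdio.h>

enum demo_event { EV_ADDR_RESOLVED, EV_ROUTE_RESOLVED = 2, EV_ESTABLISHED = 9 };

#define DEMO_EVENT_STRING(foo) [EV_##foo] = #foo

static const char *demo_event_strings[] = {
	DEMO_EVENT_STRING(ADDR_RESOLVED),
	DEMO_EVENT_STRING(ROUTE_RESOLVED),
	DEMO_EVENT_STRING(ESTABLISHED),
};
#undef DEMO_EVENT_STRING

static const char *demo_event_str(unsigned int type)
{
	if (type < sizeof(demo_event_strings) / sizeof(demo_event_strings[0]) &&
	    demo_event_strings[type])
		return demo_event_strings[type];
	return "unknown";
}

int main(void)
{
	/* index 1 has no initializer, so it falls back to "unknown" */
	printf("%s %s %s\n", demo_event_str(0), demo_event_str(1),
	       demo_event_str(9));
	return 0;
}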
int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
struct rdma_cm_event *event) struct rdma_cm_event *event)
{ {
...@@ -44,8 +72,8 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, ...@@ -44,8 +72,8 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
struct rds_transport *trans; struct rds_transport *trans;
int ret = 0; int ret = 0;
rdsdebug("conn %p id %p handling event %u\n", conn, cm_id, rdsdebug("conn %p id %p handling event %u (%s)\n", conn, cm_id,
event->event); event->event, rds_cm_event_str(event->event));
if (cm_id->device->node_type == RDMA_NODE_RNIC) if (cm_id->device->node_type == RDMA_NODE_RNIC)
trans = &rds_iw_transport; trans = &rds_iw_transport;
...@@ -109,7 +137,8 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, ...@@ -109,7 +137,8 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
default: default:
/* things like device disconnect? */ /* things like device disconnect? */
printk(KERN_ERR "RDS: unknown event %u!\n", event->event); printk(KERN_ERR "RDS: unknown event %u (%s)!\n",
event->event, rds_cm_event_str(event->event));
break; break;
} }
...@@ -117,12 +146,13 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, ...@@ -117,12 +146,13 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
if (conn) if (conn)
mutex_unlock(&conn->c_cm_lock); mutex_unlock(&conn->c_cm_lock);
rdsdebug("id %p event %u handling ret %d\n", cm_id, event->event, ret); rdsdebug("id %p event %u (%s) handling ret %d\n", cm_id, event->event,
rds_cm_event_str(event->event), ret);
return ret; return ret;
} }
static int __init rds_rdma_listen_init(void) static int rds_rdma_listen_init(void)
{ {
struct sockaddr_in sin; struct sockaddr_in sin;
struct rdma_cm_id *cm_id; struct rdma_cm_id *cm_id;
...@@ -177,7 +207,7 @@ static void rds_rdma_listen_stop(void) ...@@ -177,7 +207,7 @@ static void rds_rdma_listen_stop(void)
} }
} }
int __init rds_rdma_init(void) int rds_rdma_init(void)
{ {
int ret; int ret;
......
...@@ -80,6 +80,7 @@ enum { ...@@ -80,6 +80,7 @@ enum {
/* Bits for c_flags */ /* Bits for c_flags */
#define RDS_LL_SEND_FULL 0 #define RDS_LL_SEND_FULL 0
#define RDS_RECONNECT_PENDING 1 #define RDS_RECONNECT_PENDING 1
#define RDS_IN_XMIT 2
struct rds_connection { struct rds_connection {
struct hlist_node c_hash_node; struct hlist_node c_hash_node;
...@@ -91,12 +92,13 @@ struct rds_connection { ...@@ -91,12 +92,13 @@ struct rds_connection {
struct rds_cong_map *c_lcong; struct rds_cong_map *c_lcong;
struct rds_cong_map *c_fcong; struct rds_cong_map *c_fcong;
struct mutex c_send_lock; /* protect send ring */
struct rds_message *c_xmit_rm; struct rds_message *c_xmit_rm;
unsigned long c_xmit_sg; unsigned long c_xmit_sg;
unsigned int c_xmit_hdr_off; unsigned int c_xmit_hdr_off;
unsigned int c_xmit_data_off; unsigned int c_xmit_data_off;
unsigned int c_xmit_atomic_sent;
unsigned int c_xmit_rdma_sent; unsigned int c_xmit_rdma_sent;
unsigned int c_xmit_data_sent;
spinlock_t c_lock; /* protect msg queues */ spinlock_t c_lock; /* protect msg queues */
u64 c_next_tx_seq; u64 c_next_tx_seq;
...@@ -116,11 +118,10 @@ struct rds_connection { ...@@ -116,11 +118,10 @@ struct rds_connection {
struct delayed_work c_conn_w; struct delayed_work c_conn_w;
struct work_struct c_down_w; struct work_struct c_down_w;
struct mutex c_cm_lock; /* protect conn state & cm */ struct mutex c_cm_lock; /* protect conn state & cm */
wait_queue_head_t c_waitq;
struct list_head c_map_item; struct list_head c_map_item;
unsigned long c_map_queued; unsigned long c_map_queued;
unsigned long c_map_offset;
unsigned long c_map_bytes;
unsigned int c_unacked_packets; unsigned int c_unacked_packets;
unsigned int c_unacked_bytes; unsigned int c_unacked_bytes;
...@@ -206,6 +207,48 @@ struct rds_incoming { ...@@ -206,6 +207,48 @@ struct rds_incoming {
rds_rdma_cookie_t i_rdma_cookie; rds_rdma_cookie_t i_rdma_cookie;
}; };
struct rds_mr {
struct rb_node r_rb_node;
atomic_t r_refcount;
u32 r_key;
/* A copy of the creation flags */
unsigned int r_use_once:1;
unsigned int r_invalidate:1;
unsigned int r_write:1;
/* This is for RDS_MR_DEAD.
* It would be nice & consistent to make this part of the above
* bit field here, but we need to use test_and_set_bit.
*/
unsigned long r_state;
struct rds_sock *r_sock; /* back pointer to the socket that owns us */
struct rds_transport *r_trans;
void *r_trans_private;
};
/* Flags for mr->r_state */
#define RDS_MR_DEAD 0
static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset)
{
return r_key | (((u64) offset) << 32);
}
static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie)
{
return cookie;
}
static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
{
return cookie >> 32;
}
/* atomic operation types */
#define RDS_ATOMIC_TYPE_CSWP 0
#define RDS_ATOMIC_TYPE_FADD 1
/* /*
* m_sock_item and m_conn_item are on lists that are serialized under * m_sock_item and m_conn_item are on lists that are serialized under
* conn->c_lock. m_sock_item has additional meaning in that once it is empty * conn->c_lock. m_sock_item has additional meaning in that once it is empty
...@@ -258,13 +301,71 @@ struct rds_message { ...@@ -258,13 +301,71 @@ struct rds_message {
* -> rs->rs_lock * -> rs->rs_lock
*/ */
spinlock_t m_rs_lock; spinlock_t m_rs_lock;
wait_queue_head_t m_flush_wait;
struct rds_sock *m_rs; struct rds_sock *m_rs;
struct rds_rdma_op *m_rdma_op;
/* cookie to send to remote, in rds header */
rds_rdma_cookie_t m_rdma_cookie; rds_rdma_cookie_t m_rdma_cookie;
struct rds_mr *m_rdma_mr;
unsigned int m_nents; unsigned int m_used_sgs;
unsigned int m_count; unsigned int m_total_sgs;
struct scatterlist m_sg[0];
void *m_final_op;
struct {
struct rm_atomic_op {
int op_type;
union {
struct {
uint64_t compare;
uint64_t swap;
uint64_t compare_mask;
uint64_t swap_mask;
} op_m_cswp;
struct {
uint64_t add;
uint64_t nocarry_mask;
} op_m_fadd;
};
u32 op_rkey;
u64 op_remote_addr;
unsigned int op_notify:1;
unsigned int op_recverr:1;
unsigned int op_mapped:1;
unsigned int op_silent:1;
unsigned int op_active:1;
struct scatterlist *op_sg;
struct rds_notifier *op_notifier;
struct rds_mr *op_rdma_mr;
} atomic;
struct rm_rdma_op {
u32 op_rkey;
u64 op_remote_addr;
unsigned int op_write:1;
unsigned int op_fence:1;
unsigned int op_notify:1;
unsigned int op_recverr:1;
unsigned int op_mapped:1;
unsigned int op_silent:1;
unsigned int op_active:1;
unsigned int op_bytes;
unsigned int op_nents;
unsigned int op_count;
struct scatterlist *op_sg;
struct rds_notifier *op_notifier;
struct rds_mr *op_rdma_mr;
} rdma;
struct rm_data_op {
unsigned int op_active:1;
unsigned int op_nents;
unsigned int op_count;
struct scatterlist *op_sg;
} data;
};
}; };
/* /*
...@@ -305,10 +406,6 @@ struct rds_notifier { ...@@ -305,10 +406,6 @@ struct rds_notifier {
* transport is responsible for other serialization, including * transport is responsible for other serialization, including
* rds_recv_incoming(). This is called in process context but * rds_recv_incoming(). This is called in process context but
* should try hard not to block. * should try hard not to block.
*
* @xmit_cong_map: This asks the transport to send the local bitmap down the
* given connection. XXX get a better story about the bitmap
* flag and header.
*/ */
#define RDS_TRANS_IB 0 #define RDS_TRANS_IB 0
...@@ -332,13 +429,11 @@ struct rds_transport { ...@@ -332,13 +429,11 @@ struct rds_transport {
void (*xmit_complete)(struct rds_connection *conn); void (*xmit_complete)(struct rds_connection *conn);
int (*xmit)(struct rds_connection *conn, struct rds_message *rm, int (*xmit)(struct rds_connection *conn, struct rds_message *rm,
unsigned int hdr_off, unsigned int sg, unsigned int off); unsigned int hdr_off, unsigned int sg, unsigned int off);
int (*xmit_cong_map)(struct rds_connection *conn, int (*xmit_rdma)(struct rds_connection *conn, struct rm_rdma_op *op);
struct rds_cong_map *map, unsigned long offset); int (*xmit_atomic)(struct rds_connection *conn, struct rm_atomic_op *op);
int (*xmit_rdma)(struct rds_connection *conn, struct rds_rdma_op *op);
int (*recv)(struct rds_connection *conn); int (*recv)(struct rds_connection *conn);
int (*inc_copy_to_user)(struct rds_incoming *inc, struct iovec *iov, int (*inc_copy_to_user)(struct rds_incoming *inc, struct iovec *iov,
size_t size); size_t size);
void (*inc_purge)(struct rds_incoming *inc);
void (*inc_free)(struct rds_incoming *inc); void (*inc_free)(struct rds_incoming *inc);
int (*cm_handle_connect)(struct rdma_cm_id *cm_id, int (*cm_handle_connect)(struct rdma_cm_id *cm_id,
...@@ -367,17 +462,11 @@ struct rds_sock { ...@@ -367,17 +462,11 @@ struct rds_sock {
* bound_addr used for both incoming and outgoing, no INADDR_ANY * bound_addr used for both incoming and outgoing, no INADDR_ANY
* support. * support.
*/ */
struct rb_node rs_bound_node; struct hlist_node rs_bound_node;
__be32 rs_bound_addr; __be32 rs_bound_addr;
__be32 rs_conn_addr; __be32 rs_conn_addr;
__be16 rs_bound_port; __be16 rs_bound_port;
__be16 rs_conn_port; __be16 rs_conn_port;
/*
* This is only used to communicate the transport between bind and
* initiating connections. All other trans use is referenced through
* the connection.
*/
struct rds_transport *rs_transport; struct rds_transport *rs_transport;
/* /*
...@@ -466,8 +555,8 @@ struct rds_statistics { ...@@ -466,8 +555,8 @@ struct rds_statistics {
uint64_t s_recv_ping; uint64_t s_recv_ping;
uint64_t s_send_queue_empty; uint64_t s_send_queue_empty;
uint64_t s_send_queue_full; uint64_t s_send_queue_full;
uint64_t s_send_sem_contention; uint64_t s_send_lock_contention;
uint64_t s_send_sem_queue_raced; uint64_t s_send_lock_queue_raced;
uint64_t s_send_immediate_retry; uint64_t s_send_immediate_retry;
uint64_t s_send_delayed_retry; uint64_t s_send_delayed_retry;
uint64_t s_send_drop_acked; uint64_t s_send_drop_acked;
...@@ -487,6 +576,7 @@ struct rds_statistics { ...@@ -487,6 +576,7 @@ struct rds_statistics {
}; };
/* af_rds.c */ /* af_rds.c */
char *rds_str_array(char **array, size_t elements, size_t index);
void rds_sock_addref(struct rds_sock *rs); void rds_sock_addref(struct rds_sock *rs);
void rds_sock_put(struct rds_sock *rs); void rds_sock_put(struct rds_sock *rs);
void rds_wake_sk_sleep(struct rds_sock *rs); void rds_wake_sk_sleep(struct rds_sock *rs);
...@@ -521,15 +611,17 @@ void rds_cong_exit(void); ...@@ -521,15 +611,17 @@ void rds_cong_exit(void);
struct rds_message *rds_cong_update_alloc(struct rds_connection *conn); struct rds_message *rds_cong_update_alloc(struct rds_connection *conn);
/* conn.c */ /* conn.c */
int __init rds_conn_init(void); int rds_conn_init(void);
void rds_conn_exit(void); void rds_conn_exit(void);
struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr, struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
struct rds_transport *trans, gfp_t gfp); struct rds_transport *trans, gfp_t gfp);
struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr, struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
struct rds_transport *trans, gfp_t gfp); struct rds_transport *trans, gfp_t gfp);
void rds_conn_shutdown(struct rds_connection *conn);
void rds_conn_destroy(struct rds_connection *conn); void rds_conn_destroy(struct rds_connection *conn);
void rds_conn_reset(struct rds_connection *conn); void rds_conn_reset(struct rds_connection *conn);
void rds_conn_drop(struct rds_connection *conn); void rds_conn_drop(struct rds_connection *conn);
void rds_conn_connect_if_down(struct rds_connection *conn);
void rds_for_each_conn_info(struct socket *sock, unsigned int len, void rds_for_each_conn_info(struct socket *sock, unsigned int len,
struct rds_info_iterator *iter, struct rds_info_iterator *iter,
struct rds_info_lengths *lens, struct rds_info_lengths *lens,
...@@ -566,7 +658,8 @@ rds_conn_connecting(struct rds_connection *conn) ...@@ -566,7 +658,8 @@ rds_conn_connecting(struct rds_connection *conn)
/* message.c */ /* message.c */
struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp); struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp);
struct rds_message *rds_message_copy_from_user(struct iovec *first_iov, struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents);
int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov,
size_t total_len); size_t total_len);
struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len); struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len);
void rds_message_populate_header(struct rds_header *hdr, __be16 sport, void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
...@@ -580,7 +673,6 @@ int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *vers ...@@ -580,7 +673,6 @@ int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *vers
int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset); int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset);
int rds_message_inc_copy_to_user(struct rds_incoming *inc, int rds_message_inc_copy_to_user(struct rds_incoming *inc,
struct iovec *first_iov, size_t size); struct iovec *first_iov, size_t size);
void rds_message_inc_purge(struct rds_incoming *inc);
void rds_message_inc_free(struct rds_incoming *inc); void rds_message_inc_free(struct rds_incoming *inc);
void rds_message_addref(struct rds_message *rm); void rds_message_addref(struct rds_message *rm);
void rds_message_put(struct rds_message *rm); void rds_message_put(struct rds_message *rm);
...@@ -636,14 +728,39 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest); ...@@ -636,14 +728,39 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest);
typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack); typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack);
void rds_send_drop_acked(struct rds_connection *conn, u64 ack, void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
is_acked_func is_acked); is_acked_func is_acked);
int rds_send_acked_before(struct rds_connection *conn, u64 seq);
void rds_send_remove_from_sock(struct list_head *messages, int status); void rds_send_remove_from_sock(struct list_head *messages, int status);
int rds_send_pong(struct rds_connection *conn, __be16 dport); int rds_send_pong(struct rds_connection *conn, __be16 dport);
struct rds_message *rds_send_get_message(struct rds_connection *, struct rds_message *rds_send_get_message(struct rds_connection *,
struct rds_rdma_op *); struct rm_rdma_op *);
/* rdma.c */ /* rdma.c */
void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force); void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force);
int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen);
int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen);
int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen);
void rds_rdma_drop_keys(struct rds_sock *rs);
int rds_rdma_extra_size(struct rds_rdma_args *args);
int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
struct cmsghdr *cmsg);
int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
struct cmsghdr *cmsg);
int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
struct cmsghdr *cmsg);
int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
struct cmsghdr *cmsg);
void rds_rdma_free_op(struct rm_rdma_op *ro);
void rds_atomic_free_op(struct rm_atomic_op *ao);
void rds_rdma_send_complete(struct rds_message *rm, int wc_status);
void rds_atomic_send_complete(struct rds_message *rm, int wc_status);
int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
struct cmsghdr *cmsg);
extern void __rds_put_mr_final(struct rds_mr *mr);
static inline void rds_mr_put(struct rds_mr *mr)
{
if (atomic_dec_and_test(&mr->r_refcount))
__rds_put_mr_final(mr);
}
/* stats.c */ /* stats.c */
DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats); DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
...@@ -657,14 +774,14 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats); ...@@ -657,14 +774,14 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
put_cpu(); \ put_cpu(); \
} while (0) } while (0)
#define rds_stats_add(member, count) rds_stats_add_which(rds_stats, member, count) #define rds_stats_add(member, count) rds_stats_add_which(rds_stats, member, count)
int __init rds_stats_init(void); int rds_stats_init(void);
void rds_stats_exit(void); void rds_stats_exit(void);
void rds_stats_info_copy(struct rds_info_iterator *iter, void rds_stats_info_copy(struct rds_info_iterator *iter,
uint64_t *values, const char *const *names, uint64_t *values, const char *const *names,
size_t nr); size_t nr);
/* sysctl.c */ /* sysctl.c */
int __init rds_sysctl_init(void); int rds_sysctl_init(void);
void rds_sysctl_exit(void); void rds_sysctl_exit(void);
extern unsigned long rds_sysctl_sndbuf_min; extern unsigned long rds_sysctl_sndbuf_min;
extern unsigned long rds_sysctl_sndbuf_default; extern unsigned long rds_sysctl_sndbuf_default;
...@@ -678,9 +795,10 @@ extern unsigned long rds_sysctl_trace_flags; ...@@ -678,9 +795,10 @@ extern unsigned long rds_sysctl_trace_flags;
extern unsigned int rds_sysctl_trace_level; extern unsigned int rds_sysctl_trace_level;
/* threads.c */ /* threads.c */
int __init rds_threads_init(void); int rds_threads_init(void);
void rds_threads_exit(void); void rds_threads_exit(void);
extern struct workqueue_struct *rds_wq; extern struct workqueue_struct *rds_wq;
void rds_queue_reconnect(struct rds_connection *conn);
void rds_connect_worker(struct work_struct *); void rds_connect_worker(struct work_struct *);
void rds_shutdown_worker(struct work_struct *); void rds_shutdown_worker(struct work_struct *);
void rds_send_worker(struct work_struct *); void rds_send_worker(struct work_struct *);
...@@ -691,9 +809,10 @@ void rds_connect_complete(struct rds_connection *conn); ...@@ -691,9 +809,10 @@ void rds_connect_complete(struct rds_connection *conn);
int rds_trans_register(struct rds_transport *trans); int rds_trans_register(struct rds_transport *trans);
void rds_trans_unregister(struct rds_transport *trans); void rds_trans_unregister(struct rds_transport *trans);
struct rds_transport *rds_trans_get_preferred(__be32 addr); struct rds_transport *rds_trans_get_preferred(__be32 addr);
void rds_trans_put(struct rds_transport *trans);
unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter, unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
unsigned int avail); unsigned int avail);
int __init rds_trans_init(void); int rds_trans_init(void);
void rds_trans_exit(void); void rds_trans_exit(void);
#endif #endif
...@@ -36,7 +36,6 @@ ...@@ -36,7 +36,6 @@
#include <linux/in.h> #include <linux/in.h>
#include "rds.h" #include "rds.h"
#include "rdma.h"
void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
__be32 saddr) __be32 saddr)
...@@ -210,7 +209,7 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, ...@@ -210,7 +209,7 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
} }
rs = rds_find_bound(daddr, inc->i_hdr.h_dport); rs = rds_find_bound(daddr, inc->i_hdr.h_dport);
if (rs == NULL) { if (!rs) {
rds_stats_inc(s_recv_drop_no_sock); rds_stats_inc(s_recv_drop_no_sock);
goto out; goto out;
} }
...@@ -251,7 +250,7 @@ static int rds_next_incoming(struct rds_sock *rs, struct rds_incoming **inc) ...@@ -251,7 +250,7 @@ static int rds_next_incoming(struct rds_sock *rs, struct rds_incoming **inc)
{ {
unsigned long flags; unsigned long flags;
if (*inc == NULL) { if (!*inc) {
read_lock_irqsave(&rs->rs_recv_lock, flags); read_lock_irqsave(&rs->rs_recv_lock, flags);
if (!list_empty(&rs->rs_recv_queue)) { if (!list_empty(&rs->rs_recv_queue)) {
*inc = list_entry(rs->rs_recv_queue.next, *inc = list_entry(rs->rs_recv_queue.next,
...@@ -334,10 +333,10 @@ int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr) ...@@ -334,10 +333,10 @@ int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr)
if (msghdr) { if (msghdr) {
cmsg.user_token = notifier->n_user_token; cmsg.user_token = notifier->n_user_token;
cmsg.status = notifier->n_status; cmsg.status = notifier->n_status;
err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS, err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS,
sizeof(cmsg), &cmsg); sizeof(cmsg), &cmsg);
if (err) if (err)
break; break;
} }
......
...@@ -37,7 +37,6 @@ ...@@ -37,7 +37,6 @@
#include <linux/list.h> #include <linux/list.h>
#include "rds.h" #include "rds.h"
#include "rdma.h"
/* When transmitting messages in rds_send_xmit, we need to emerge from /* When transmitting messages in rds_send_xmit, we need to emerge from
* time to time and briefly release the CPU. Otherwise the softlock watchdog * time to time and briefly release the CPU. Otherwise the softlock watchdog
...@@ -54,7 +53,8 @@ module_param(send_batch_count, int, 0444); ...@@ -54,7 +53,8 @@ module_param(send_batch_count, int, 0444);
MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue"); MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue");
/* /*
* Reset the send state. Caller must hold c_send_lock when calling here. * Reset the send state. Callers must ensure that this doesn't race with
* rds_send_xmit().
*/ */
void rds_send_reset(struct rds_connection *conn) void rds_send_reset(struct rds_connection *conn)
{ {
...@@ -62,18 +62,22 @@ void rds_send_reset(struct rds_connection *conn) ...@@ -62,18 +62,22 @@ void rds_send_reset(struct rds_connection *conn)
unsigned long flags; unsigned long flags;
if (conn->c_xmit_rm) { if (conn->c_xmit_rm) {
rm = conn->c_xmit_rm;
conn->c_xmit_rm = NULL;
/* Tell the user the RDMA op is no longer mapped by the /* Tell the user the RDMA op is no longer mapped by the
* transport. This isn't entirely true (it's flushed out * transport. This isn't entirely true (it's flushed out
* independently) but as the connection is down, there's * independently) but as the connection is down, there's
* no ongoing RDMA to/from that memory */ * no ongoing RDMA to/from that memory */
rds_message_unmapped(conn->c_xmit_rm); rds_message_unmapped(rm);
rds_message_put(conn->c_xmit_rm); rds_message_put(rm);
conn->c_xmit_rm = NULL;
} }
conn->c_xmit_sg = 0; conn->c_xmit_sg = 0;
conn->c_xmit_hdr_off = 0; conn->c_xmit_hdr_off = 0;
conn->c_xmit_data_off = 0; conn->c_xmit_data_off = 0;
conn->c_xmit_atomic_sent = 0;
conn->c_xmit_rdma_sent = 0; conn->c_xmit_rdma_sent = 0;
conn->c_xmit_data_sent = 0;
conn->c_map_queued = 0; conn->c_map_queued = 0;
...@@ -90,6 +94,25 @@ void rds_send_reset(struct rds_connection *conn) ...@@ -90,6 +94,25 @@ void rds_send_reset(struct rds_connection *conn)
spin_unlock_irqrestore(&conn->c_lock, flags); spin_unlock_irqrestore(&conn->c_lock, flags);
} }
static int acquire_in_xmit(struct rds_connection *conn)
{
return test_and_set_bit(RDS_IN_XMIT, &conn->c_flags) == 0;
}
static void release_in_xmit(struct rds_connection *conn)
{
clear_bit(RDS_IN_XMIT, &conn->c_flags);
smp_mb__after_clear_bit();
/*
* We don't use wait_on_bit()/wake_up_bit() because our waking is in a
* hot path and finding waiters is very rare. We don't want to walk
* the system-wide hashed waitqueue buckets in the fast path only to
* almost never find waiters.
*/
if (waitqueue_active(&conn->c_waitq))
wake_up_all(&conn->c_waitq);
}
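acquire_in_xmit()/release_in_xmit() replace the old c_send_lock mutex with the single RDS_IN_XMIT bit: exactly one task feeds a connection's send queue, a losing caller backs off instead of blocking, and rds_send_xmit() re-checks the queue after clearing the bit so a message queued during the race is not stranded. The userspace sketch below shows only that acquire/clear/re-check ordering; the kernel code additionally uses test_and_set_bit()/clear_bit() on c_flags, explicit memory barriers, and the c_waitq wakeup, which the sketch omits.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool in_xmit;	/* stands in for RDS_IN_XMIT in c_flags */
static atomic_int  queued;	/* stands in for conn->c_send_queue */

static bool acquire_in_xmit(void)
{
	return !atomic_exchange(&in_xmit, true);
}

static void release_in_xmit(void)
{
	/* seq_cst store; stands in for clear_bit() plus the barriers the
	 * kernel issues before re-checking the send queue */
	atomic_store(&in_xmit, false);
}

static void send_xmit(void)
{
restart:
	if (!acquire_in_xmit())
		return;		/* someone else is feeding the connection */

	while (atomic_load(&queued) > 0)
		atomic_fetch_sub(&queued, 1);	/* "transmit" one message */

	release_in_xmit();

	/* A message queued between the last check and the release would
	 * otherwise sit on the queue with nobody in xmit: re-check. */
	if (atomic_load(&queued) > 0)
		goto restart;
}

int main(void)
{
	atomic_store(&queued, 3);
	send_xmit();
	printf("left on queue: %d\n", atomic_load(&queued));
	return 0;
}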
/* /*
 * We're making the conscious trade-off here to only send one message * down the connection at a time.
* down the connection at a time. * down the connection at a time.
...@@ -109,102 +132,69 @@ int rds_send_xmit(struct rds_connection *conn) ...@@ -109,102 +132,69 @@ int rds_send_xmit(struct rds_connection *conn)
struct rds_message *rm; struct rds_message *rm;
unsigned long flags; unsigned long flags;
unsigned int tmp; unsigned int tmp;
unsigned int send_quota = send_batch_count;
struct scatterlist *sg; struct scatterlist *sg;
int ret = 0; int ret = 0;
int was_empty = 0;
LIST_HEAD(to_be_dropped); LIST_HEAD(to_be_dropped);
restart:
/* /*
* sendmsg calls here after having queued its message on the send * sendmsg calls here after having queued its message on the send
* queue. We only have one task feeding the connection at a time. If * queue. We only have one task feeding the connection at a time. If
* another thread is already feeding the queue then we back off. This * another thread is already feeding the queue then we back off. This
* avoids blocking the caller and trading per-connection data between * avoids blocking the caller and trading per-connection data between
* caches per message. * caches per message.
*
* The sem holder will issue a retry if they notice that someone queued
* a message after they stopped walking the send queue but before they
* dropped the sem.
*/ */
if (!mutex_trylock(&conn->c_send_lock)) { if (!acquire_in_xmit(conn)) {
rds_stats_inc(s_send_sem_contention); rds_stats_inc(s_send_lock_contention);
ret = -ENOMEM; ret = -ENOMEM;
goto out; goto out;
} }
/*
* rds_conn_shutdown() sets the conn state and then tests RDS_IN_XMIT,
* we do the opposite to avoid races.
*/
if (!rds_conn_up(conn)) {
release_in_xmit(conn);
ret = 0;
goto out;
}
if (conn->c_trans->xmit_prepare) if (conn->c_trans->xmit_prepare)
conn->c_trans->xmit_prepare(conn); conn->c_trans->xmit_prepare(conn);
/* /*
* spin trying to push headers and data down the connection until * spin trying to push headers and data down the connection until
* the connection doens't make forward progress. * the connection doesn't make forward progress.
*/ */
while (--send_quota) { while (1) {
/*
* See if need to send a congestion map update if we're
* between sending messages. The send_sem protects our sole
* use of c_map_offset and _bytes.
* Note this is used only by transports that define a special
* xmit_cong_map function. For all others, we create allocate
* a cong_map message and treat it just like any other send.
*/
if (conn->c_map_bytes) {
ret = conn->c_trans->xmit_cong_map(conn, conn->c_lcong,
conn->c_map_offset);
if (ret <= 0)
break;
conn->c_map_offset += ret;
conn->c_map_bytes -= ret;
if (conn->c_map_bytes)
continue;
}
/* If we're done sending the current message, clear the
* offset and S/G temporaries.
*/
rm = conn->c_xmit_rm; rm = conn->c_xmit_rm;
if (rm != NULL &&
conn->c_xmit_hdr_off == sizeof(struct rds_header) &&
conn->c_xmit_sg == rm->m_nents) {
conn->c_xmit_rm = NULL;
conn->c_xmit_sg = 0;
conn->c_xmit_hdr_off = 0;
conn->c_xmit_data_off = 0;
conn->c_xmit_rdma_sent = 0;
/* Release the reference to the previous message. */ /*
rds_message_put(rm); * If between sending messages, we can send a pending congestion
rm = NULL; * map update.
}
/* If we're asked to send a cong map update, do so.
*/ */
if (rm == NULL && test_and_clear_bit(0, &conn->c_map_queued)) { if (!rm && test_and_clear_bit(0, &conn->c_map_queued)) {
if (conn->c_trans->xmit_cong_map != NULL) {
conn->c_map_offset = 0;
conn->c_map_bytes = sizeof(struct rds_header) +
RDS_CONG_MAP_BYTES;
continue;
}
rm = rds_cong_update_alloc(conn); rm = rds_cong_update_alloc(conn);
if (IS_ERR(rm)) { if (IS_ERR(rm)) {
ret = PTR_ERR(rm); ret = PTR_ERR(rm);
break; break;
} }
rm->data.op_active = 1;
conn->c_xmit_rm = rm; conn->c_xmit_rm = rm;
} }
/* /*
* Grab the next message from the send queue, if there is one. * If not already working on one, grab the next message.
* *
 * c_xmit_rm holds a ref while we're sending this message down * the connection. We can use this ref while holding the
 * the connection. We can use this ref while holding the * send_sem.. rds_send_reset() is serialized with it.
* send_sem.. rds_send_reset() is serialized with it. * send_sem.. rds_send_reset() is serialized with it.
*/ */
if (rm == NULL) { if (!rm) {
unsigned int len; unsigned int len;
spin_lock_irqsave(&conn->c_lock, flags); spin_lock_irqsave(&conn->c_lock, flags);
...@@ -224,10 +214,8 @@ int rds_send_xmit(struct rds_connection *conn) ...@@ -224,10 +214,8 @@ int rds_send_xmit(struct rds_connection *conn)
spin_unlock_irqrestore(&conn->c_lock, flags); spin_unlock_irqrestore(&conn->c_lock, flags);
if (rm == NULL) { if (!rm)
was_empty = 1;
break; break;
}
/* Unfortunately, the way Infiniband deals with /* Unfortunately, the way Infiniband deals with
* RDMA to a bad MR key is by moving the entire * RDMA to a bad MR key is by moving the entire
...@@ -236,13 +224,12 @@ int rds_send_xmit(struct rds_connection *conn) ...@@ -236,13 +224,12 @@ int rds_send_xmit(struct rds_connection *conn)
* connection. * connection.
* Therefore, we never retransmit messages with RDMA ops. * Therefore, we never retransmit messages with RDMA ops.
*/ */
if (rm->m_rdma_op && if (rm->rdma.op_active &&
test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) { test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) {
spin_lock_irqsave(&conn->c_lock, flags); spin_lock_irqsave(&conn->c_lock, flags);
if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags))
list_move(&rm->m_conn_item, &to_be_dropped); list_move(&rm->m_conn_item, &to_be_dropped);
spin_unlock_irqrestore(&conn->c_lock, flags); spin_unlock_irqrestore(&conn->c_lock, flags);
rds_message_put(rm);
continue; continue;
} }
...@@ -263,23 +250,55 @@ int rds_send_xmit(struct rds_connection *conn) ...@@ -263,23 +250,55 @@ int rds_send_xmit(struct rds_connection *conn)
conn->c_xmit_rm = rm; conn->c_xmit_rm = rm;
} }
/* /* The transport either sends the whole rdma or none of it */
* Try and send an rdma message. Let's see if we can if (rm->rdma.op_active && !conn->c_xmit_rdma_sent) {
* keep this simple and require that the transport either rm->m_final_op = &rm->rdma;
* send the whole rdma or none of it. ret = conn->c_trans->xmit_rdma(conn, &rm->rdma);
*/
if (rm->m_rdma_op && !conn->c_xmit_rdma_sent) {
ret = conn->c_trans->xmit_rdma(conn, rm->m_rdma_op);
if (ret) if (ret)
break; break;
conn->c_xmit_rdma_sent = 1; conn->c_xmit_rdma_sent = 1;
/* The transport owns the mapped memory for now. /* The transport owns the mapped memory for now.
* You can't unmap it while it's on the send queue */ * You can't unmap it while it's on the send queue */
set_bit(RDS_MSG_MAPPED, &rm->m_flags); set_bit(RDS_MSG_MAPPED, &rm->m_flags);
} }
if (conn->c_xmit_hdr_off < sizeof(struct rds_header) || if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) {
conn->c_xmit_sg < rm->m_nents) { rm->m_final_op = &rm->atomic;
ret = conn->c_trans->xmit_atomic(conn, &rm->atomic);
if (ret)
break;
conn->c_xmit_atomic_sent = 1;
/* The transport owns the mapped memory for now.
* You can't unmap it while it's on the send queue */
set_bit(RDS_MSG_MAPPED, &rm->m_flags);
}
/*
* A number of cases require an RDS header to be sent
* even if there is no data.
* We permit 0-byte sends; rds-ping depends on this.
* However, if there are exclusively attached silent ops,
* we skip the hdr/data send, to enable silent operation.
*/
if (rm->data.op_nents == 0) {
int ops_present;
int all_ops_are_silent = 1;
ops_present = (rm->atomic.op_active || rm->rdma.op_active);
if (rm->atomic.op_active && !rm->atomic.op_silent)
all_ops_are_silent = 0;
if (rm->rdma.op_active && !rm->rdma.op_silent)
all_ops_are_silent = 0;
if (ops_present && all_ops_are_silent
&& !rm->m_rdma_cookie)
rm->data.op_active = 0;
}
if (rm->data.op_active && !conn->c_xmit_data_sent) {
rm->m_final_op = &rm->data;
ret = conn->c_trans->xmit(conn, rm, ret = conn->c_trans->xmit(conn, rm,
conn->c_xmit_hdr_off, conn->c_xmit_hdr_off,
conn->c_xmit_sg, conn->c_xmit_sg,
...@@ -295,7 +314,7 @@ int rds_send_xmit(struct rds_connection *conn) ...@@ -295,7 +314,7 @@ int rds_send_xmit(struct rds_connection *conn)
ret -= tmp; ret -= tmp;
} }
sg = &rm->m_sg[conn->c_xmit_sg]; sg = &rm->data.op_sg[conn->c_xmit_sg];
while (ret) { while (ret) {
tmp = min_t(int, ret, sg->length - tmp = min_t(int, ret, sg->length -
conn->c_xmit_data_off); conn->c_xmit_data_off);
...@@ -306,49 +325,63 @@ int rds_send_xmit(struct rds_connection *conn) ...@@ -306,49 +325,63 @@ int rds_send_xmit(struct rds_connection *conn)
sg++; sg++;
conn->c_xmit_sg++; conn->c_xmit_sg++;
BUG_ON(ret != 0 && BUG_ON(ret != 0 &&
conn->c_xmit_sg == rm->m_nents); conn->c_xmit_sg == rm->data.op_nents);
} }
} }
if (conn->c_xmit_hdr_off == sizeof(struct rds_header) &&
(conn->c_xmit_sg == rm->data.op_nents))
conn->c_xmit_data_sent = 1;
} }
}
/* Nuke any messages we decided not to retransmit. */ /*
if (!list_empty(&to_be_dropped)) * A rm will only take multiple times through this loop
rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED); * if there is a data op. Thus, if the data is sent (or there was
* none), then we're done with the rm.
*/
if (!rm->data.op_active || conn->c_xmit_data_sent) {
conn->c_xmit_rm = NULL;
conn->c_xmit_sg = 0;
conn->c_xmit_hdr_off = 0;
conn->c_xmit_data_off = 0;
conn->c_xmit_rdma_sent = 0;
conn->c_xmit_atomic_sent = 0;
conn->c_xmit_data_sent = 0;
rds_message_put(rm);
}
}
if (conn->c_trans->xmit_complete) if (conn->c_trans->xmit_complete)
conn->c_trans->xmit_complete(conn); conn->c_trans->xmit_complete(conn);
/* release_in_xmit(conn);
* We might be racing with another sender who queued a message but
* backed off on noticing that we held the c_send_lock. If we check
* for queued messages after dropping the sem then either we'll
* see the queued message or the queuer will get the sem. If we
* notice the queued message then we trigger an immediate retry.
*
* We need to be careful only to do this when we stopped processing
* the send queue because it was empty. It's the only way we
* stop processing the loop when the transport hasn't taken
* responsibility for forward progress.
*/
mutex_unlock(&conn->c_send_lock);
if (conn->c_map_bytes || (send_quota == 0 && !was_empty)) { /* Nuke any messages we decided not to retransmit. */
/* We exhausted the send quota, but there's work left to if (!list_empty(&to_be_dropped)) {
* do. Return and (re-)schedule the send worker. /* irqs on here, so we can put(), unlike above */
*/ list_for_each_entry(rm, &to_be_dropped, m_conn_item)
ret = -EAGAIN; rds_message_put(rm);
rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED);
} }
if (ret == 0 && was_empty) { /*
/* A simple bit test would be way faster than taking the * Other senders can queue a message after we last test the send queue
* spin lock */ * but before we clear RDS_IN_XMIT. In that case they'd back off and
spin_lock_irqsave(&conn->c_lock, flags); * not try and send their newly queued message. We need to check the
* send queue after having cleared RDS_IN_XMIT so that their message
* doesn't get stuck on the send queue.
*
* If the transport cannot continue (i.e ret != 0), then it must
* call us when more room is available, such as from the tx
* completion handler.
*/
if (ret == 0) {
smp_mb();
if (!list_empty(&conn->c_send_queue)) { if (!list_empty(&conn->c_send_queue)) {
rds_stats_inc(s_send_sem_queue_raced); rds_stats_inc(s_send_lock_queue_raced);
ret = -EAGAIN; goto restart;
} }
spin_unlock_irqrestore(&conn->c_lock, flags);
} }
out: out:
return ret; return ret;
...@@ -376,52 +409,60 @@ static inline int rds_send_is_acked(struct rds_message *rm, u64 ack, ...@@ -376,52 +409,60 @@ static inline int rds_send_is_acked(struct rds_message *rm, u64 ack,
} }
/* /*
* Returns true if there are no messages on the send and retransmit queues * This is pretty similar to what happens below in the ACK
* which have a sequence number greater than or equal to the given sequence * handling code - except that we call here as soon as we get
* number. * the IB send completion on the RDMA op and the accompanying
* message.
*/ */
int rds_send_acked_before(struct rds_connection *conn, u64 seq) void rds_rdma_send_complete(struct rds_message *rm, int status)
{ {
struct rds_message *rm, *tmp; struct rds_sock *rs = NULL;
int ret = 1; struct rm_rdma_op *ro;
struct rds_notifier *notifier;
unsigned long flags;
spin_lock(&conn->c_lock); spin_lock_irqsave(&rm->m_rs_lock, flags);
list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) { ro = &rm->rdma;
if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq) if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) &&
ret = 0; ro->op_active && ro->op_notify && ro->op_notifier) {
break; notifier = ro->op_notifier;
} rs = rm->m_rs;
sock_hold(rds_rs_to_sk(rs));
list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) { notifier->n_status = status;
if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq) spin_lock(&rs->rs_lock);
ret = 0; list_add_tail(&notifier->n_list, &rs->rs_notify_queue);
break; spin_unlock(&rs->rs_lock);
ro->op_notifier = NULL;
} }
spin_unlock(&conn->c_lock); spin_unlock_irqrestore(&rm->m_rs_lock, flags);
return ret; if (rs) {
rds_wake_sk_sleep(rs);
sock_put(rds_rs_to_sk(rs));
}
} }
EXPORT_SYMBOL_GPL(rds_rdma_send_complete);
/* /*
* This is pretty similar to what happens below in the ACK * Just like above, except looks at atomic op
* handling code - except that we call here as soon as we get
* the IB send completion on the RDMA op and the accompanying
* message.
*/ */
void rds_rdma_send_complete(struct rds_message *rm, int status) void rds_atomic_send_complete(struct rds_message *rm, int status)
{ {
struct rds_sock *rs = NULL; struct rds_sock *rs = NULL;
struct rds_rdma_op *ro; struct rm_atomic_op *ao;
struct rds_notifier *notifier; struct rds_notifier *notifier;
unsigned long flags;
spin_lock(&rm->m_rs_lock); spin_lock_irqsave(&rm->m_rs_lock, flags);
ro = rm->m_rdma_op; ao = &rm->atomic;
if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) && if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags)
ro && ro->r_notify && ro->r_notifier) { && ao->op_active && ao->op_notify && ao->op_notifier) {
notifier = ro->r_notifier; notifier = ao->op_notifier;
rs = rm->m_rs; rs = rm->m_rs;
sock_hold(rds_rs_to_sk(rs)); sock_hold(rds_rs_to_sk(rs));
...@@ -430,17 +471,17 @@ void rds_rdma_send_complete(struct rds_message *rm, int status) ...@@ -430,17 +471,17 @@ void rds_rdma_send_complete(struct rds_message *rm, int status)
list_add_tail(&notifier->n_list, &rs->rs_notify_queue); list_add_tail(&notifier->n_list, &rs->rs_notify_queue);
spin_unlock(&rs->rs_lock); spin_unlock(&rs->rs_lock);
ro->r_notifier = NULL; ao->op_notifier = NULL;
} }
spin_unlock(&rm->m_rs_lock); spin_unlock_irqrestore(&rm->m_rs_lock, flags);
if (rs) { if (rs) {
rds_wake_sk_sleep(rs); rds_wake_sk_sleep(rs);
sock_put(rds_rs_to_sk(rs)); sock_put(rds_rs_to_sk(rs));
} }
} }
EXPORT_SYMBOL_GPL(rds_rdma_send_complete); EXPORT_SYMBOL_GPL(rds_atomic_send_complete);
/* /*
* This is the same as rds_rdma_send_complete except we * This is the same as rds_rdma_send_complete except we
...@@ -448,15 +489,23 @@ EXPORT_SYMBOL_GPL(rds_rdma_send_complete); ...@@ -448,15 +489,23 @@ EXPORT_SYMBOL_GPL(rds_rdma_send_complete);
* socket, socket lock) and can just move the notifier. * socket, socket lock) and can just move the notifier.
*/ */
static inline void static inline void
__rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status) __rds_send_complete(struct rds_sock *rs, struct rds_message *rm, int status)
{ {
struct rds_rdma_op *ro; struct rm_rdma_op *ro;
struct rm_atomic_op *ao;
ro = &rm->rdma;
if (ro->op_active && ro->op_notify && ro->op_notifier) {
ro->op_notifier->n_status = status;
list_add_tail(&ro->op_notifier->n_list, &rs->rs_notify_queue);
ro->op_notifier = NULL;
}
ro = rm->m_rdma_op; ao = &rm->atomic;
if (ro && ro->r_notify && ro->r_notifier) { if (ao->op_active && ao->op_notify && ao->op_notifier) {
ro->r_notifier->n_status = status; ao->op_notifier->n_status = status;
list_add_tail(&ro->r_notifier->n_list, &rs->rs_notify_queue); list_add_tail(&ao->op_notifier->n_list, &rs->rs_notify_queue);
ro->r_notifier = NULL; ao->op_notifier = NULL;
} }
/* No need to wake the app - caller does this */ /* No need to wake the app - caller does this */
...@@ -468,7 +517,7 @@ __rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status ...@@ -468,7 +517,7 @@ __rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status
* So speed is not an issue here. * So speed is not an issue here.
*/ */
struct rds_message *rds_send_get_message(struct rds_connection *conn, struct rds_message *rds_send_get_message(struct rds_connection *conn,
struct rds_rdma_op *op) struct rm_rdma_op *op)
{ {
struct rds_message *rm, *tmp, *found = NULL; struct rds_message *rm, *tmp, *found = NULL;
unsigned long flags; unsigned long flags;
...@@ -476,7 +525,7 @@ struct rds_message *rds_send_get_message(struct rds_connection *conn, ...@@ -476,7 +525,7 @@ struct rds_message *rds_send_get_message(struct rds_connection *conn,
spin_lock_irqsave(&conn->c_lock, flags); spin_lock_irqsave(&conn->c_lock, flags);
list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) { list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
if (rm->m_rdma_op == op) { if (&rm->rdma == op) {
atomic_inc(&rm->m_refcount); atomic_inc(&rm->m_refcount);
found = rm; found = rm;
goto out; goto out;
...@@ -484,7 +533,7 @@ struct rds_message *rds_send_get_message(struct rds_connection *conn, ...@@ -484,7 +533,7 @@ struct rds_message *rds_send_get_message(struct rds_connection *conn,
} }
list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) { list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
if (rm->m_rdma_op == op) { if (&rm->rdma == op) {
atomic_inc(&rm->m_refcount); atomic_inc(&rm->m_refcount);
found = rm; found = rm;
break; break;
...@@ -544,19 +593,20 @@ void rds_send_remove_from_sock(struct list_head *messages, int status) ...@@ -544,19 +593,20 @@ void rds_send_remove_from_sock(struct list_head *messages, int status)
spin_lock(&rs->rs_lock); spin_lock(&rs->rs_lock);
if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) { if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) {
struct rds_rdma_op *ro = rm->m_rdma_op; struct rm_rdma_op *ro = &rm->rdma;
struct rds_notifier *notifier; struct rds_notifier *notifier;
list_del_init(&rm->m_sock_item); list_del_init(&rm->m_sock_item);
rds_send_sndbuf_remove(rs, rm); rds_send_sndbuf_remove(rs, rm);
if (ro && ro->r_notifier && (status || ro->r_notify)) { if (ro->op_active && ro->op_notifier &&
notifier = ro->r_notifier; (ro->op_notify || (ro->op_recverr && status))) {
notifier = ro->op_notifier;
list_add_tail(&notifier->n_list, list_add_tail(&notifier->n_list,
&rs->rs_notify_queue); &rs->rs_notify_queue);
if (!notifier->n_status) if (!notifier->n_status)
notifier->n_status = status; notifier->n_status = status;
rm->m_rdma_op->r_notifier = NULL; rm->rdma.op_notifier = NULL;
} }
was_on_sock = 1; was_on_sock = 1;
rm->m_rs = NULL; rm->m_rs = NULL;
...@@ -619,9 +669,8 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) ...@@ -619,9 +669,8 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
{ {
struct rds_message *rm, *tmp; struct rds_message *rm, *tmp;
struct rds_connection *conn; struct rds_connection *conn;
unsigned long flags, flags2; unsigned long flags;
LIST_HEAD(list); LIST_HEAD(list);
int wake = 0;
/* get all the messages we're dropping under the rs lock */ /* get all the messages we're dropping under the rs lock */
spin_lock_irqsave(&rs->rs_lock, flags); spin_lock_irqsave(&rs->rs_lock, flags);
...@@ -631,59 +680,54 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) ...@@ -631,59 +680,54 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
dest->sin_port != rm->m_inc.i_hdr.h_dport)) dest->sin_port != rm->m_inc.i_hdr.h_dport))
continue; continue;
wake = 1;
list_move(&rm->m_sock_item, &list); list_move(&rm->m_sock_item, &list);
rds_send_sndbuf_remove(rs, rm); rds_send_sndbuf_remove(rs, rm);
clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags); clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
} }
/* order flag updates with the rs lock */ /* order flag updates with the rs lock */
if (wake) smp_mb__after_clear_bit();
smp_mb__after_clear_bit();
spin_unlock_irqrestore(&rs->rs_lock, flags); spin_unlock_irqrestore(&rs->rs_lock, flags);
conn = NULL; if (list_empty(&list))
return;
/* now remove the messages from the conn list as needed */ /* Remove the messages from the conn */
list_for_each_entry(rm, &list, m_sock_item) { list_for_each_entry(rm, &list, m_sock_item) {
/* We do this here rather than in the loop above, so that
* we don't have to nest m_rs_lock under rs->rs_lock */
spin_lock_irqsave(&rm->m_rs_lock, flags2);
/* If this is a RDMA operation, notify the app. */
spin_lock(&rs->rs_lock);
__rds_rdma_send_complete(rs, rm, RDS_RDMA_CANCELED);
spin_unlock(&rs->rs_lock);
rm->m_rs = NULL;
spin_unlock_irqrestore(&rm->m_rs_lock, flags2);
conn = rm->m_inc.i_conn;
spin_lock_irqsave(&conn->c_lock, flags);
/* /*
* If we see this flag cleared then we're *sure* that someone * Maybe someone else beat us to removing rm from the conn.
* else beat us to removing it from the conn. If we race * If we race with their flag update we'll get the lock and
* with their flag update we'll get the lock and then really * then really see that the flag has been cleared.
* see that the flag has been cleared.
*/ */
if (!test_bit(RDS_MSG_ON_CONN, &rm->m_flags)) if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) {
spin_unlock_irqrestore(&conn->c_lock, flags);
continue; continue;
if (conn != rm->m_inc.i_conn) {
if (conn)
spin_unlock_irqrestore(&conn->c_lock, flags);
conn = rm->m_inc.i_conn;
spin_lock_irqsave(&conn->c_lock, flags);
} }
list_del_init(&rm->m_conn_item);
spin_unlock_irqrestore(&conn->c_lock, flags);
if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) { /*
list_del_init(&rm->m_conn_item); * Couldn't grab m_rs_lock in top loop (lock ordering),
rds_message_put(rm); * but we can now.
} */
} spin_lock_irqsave(&rm->m_rs_lock, flags);
if (conn) spin_lock(&rs->rs_lock);
spin_unlock_irqrestore(&conn->c_lock, flags); __rds_send_complete(rs, rm, RDS_RDMA_CANCELED);
spin_unlock(&rs->rs_lock);
if (wake) rm->m_rs = NULL;
rds_wake_sk_sleep(rs); spin_unlock_irqrestore(&rm->m_rs_lock, flags);
rds_message_put(rm);
}
rds_wake_sk_sleep(rs);
while (!list_empty(&list)) { while (!list_empty(&list)) {
rm = list_entry(list.next, struct rds_message, m_sock_item); rm = list_entry(list.next, struct rds_message, m_sock_item);
...@@ -763,6 +807,63 @@ static int rds_send_queue_rm(struct rds_sock *rs, struct rds_connection *conn, ...@@ -763,6 +807,63 @@ static int rds_send_queue_rm(struct rds_sock *rs, struct rds_connection *conn,
return *queued; return *queued;
} }
/*
* rds_message is getting to be quite complicated, and we'd like to allocate
* it all in one go. This figures out how big it needs to be up front.
*/
static int rds_rm_size(struct msghdr *msg, int data_len)
{
struct cmsghdr *cmsg;
int size = 0;
int cmsg_groups = 0;
int retval;
for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
if (!CMSG_OK(msg, cmsg))
return -EINVAL;
if (cmsg->cmsg_level != SOL_RDS)
continue;
switch (cmsg->cmsg_type) {
case RDS_CMSG_RDMA_ARGS:
cmsg_groups |= 1;
retval = rds_rdma_extra_size(CMSG_DATA(cmsg));
if (retval < 0)
return retval;
size += retval;
break;
case RDS_CMSG_RDMA_DEST:
case RDS_CMSG_RDMA_MAP:
cmsg_groups |= 2;
/* these are valid but do not add any size */
break;
case RDS_CMSG_ATOMIC_CSWP:
case RDS_CMSG_ATOMIC_FADD:
case RDS_CMSG_MASKED_ATOMIC_CSWP:
case RDS_CMSG_MASKED_ATOMIC_FADD:
cmsg_groups |= 1;
size += sizeof(struct scatterlist);
break;
default:
return -EINVAL;
}
}
size += ceil(data_len, PAGE_SIZE) * sizeof(struct scatterlist);
/* Ensure (DEST, MAP) are never used with (ARGS, ATOMIC) */
if (cmsg_groups == 3)
return -EINVAL;
return size;
}
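rds_rm_size() walks the control messages once and sizes the whole rds_message up front: ceil(data_len, PAGE_SIZE) scatterlist entries for the payload plus whatever each RDMA or atomic cmsg adds (for example, with 4 KiB pages a 9000-byte payload needs three data entries, and an attached atomic cmsg adds one more). The cmsg_groups bitmask also rejects a send that mixes RDMA_ARGS/ATOMIC cmsgs (group 1) with RDMA_DEST/RDMA_MAP cmsgs (group 2). A toy sketch of that exclusion check, with illustrative names:

#include <stdio.h>

enum { GRP_ARGS_ATOMIC = 1, GRP_DEST_MAP = 2 };

/* Mirrors the cmsg_groups accumulation above: OR each cmsg's group bit
 * together and reject the send if both groups ended up present. */
static int check_cmsg_groups(const int *grps, int n)
{
	int groups = 0;

	for (int i = 0; i < n; i++)
		groups |= grps[i];

	return (groups == (GRP_ARGS_ATOMIC | GRP_DEST_MAP)) ? -1 : 0;
}

int main(void)
{
	int ok[]  = { GRP_ARGS_ATOMIC, GRP_ARGS_ATOMIC };
	int bad[] = { GRP_ARGS_ATOMIC, GRP_DEST_MAP };

	printf("args+atomic: %d\n", check_cmsg_groups(ok, 2));	/* 0 */
	printf("args+map:    %d\n", check_cmsg_groups(bad, 2));	/* -1 */
	return 0;
}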
static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
struct msghdr *msg, int *allocated_mr) struct msghdr *msg, int *allocated_mr)
{ {
...@@ -777,7 +878,7 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, ...@@ -777,7 +878,7 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
continue; continue;
/* As a side effect, RDMA_DEST and RDMA_MAP will set /* As a side effect, RDMA_DEST and RDMA_MAP will set
* rm->m_rdma_cookie and rm->m_rdma_mr. * rm->rdma.m_rdma_cookie and rm->rdma.m_rdma_mr.
*/ */
switch (cmsg->cmsg_type) { switch (cmsg->cmsg_type) {
case RDS_CMSG_RDMA_ARGS: case RDS_CMSG_RDMA_ARGS:
...@@ -793,6 +894,12 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, ...@@ -793,6 +894,12 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
if (!ret) if (!ret)
*allocated_mr = 1; *allocated_mr = 1;
break; break;
case RDS_CMSG_ATOMIC_CSWP:
case RDS_CMSG_ATOMIC_FADD:
case RDS_CMSG_MASKED_ATOMIC_CSWP:
case RDS_CMSG_MASKED_ATOMIC_FADD:
ret = rds_cmsg_atomic(rs, rm, cmsg);
break;
default: default:
return -EINVAL; return -EINVAL;
...@@ -850,13 +957,26 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, ...@@ -850,13 +957,26 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
goto out; goto out;
} }
rm = rds_message_copy_from_user(msg->msg_iov, payload_len); /* size of rm including all sgs */
if (IS_ERR(rm)) { ret = rds_rm_size(msg, payload_len);
ret = PTR_ERR(rm); if (ret < 0)
rm = NULL; goto out;
rm = rds_message_alloc(ret, GFP_KERNEL);
if (!rm) {
ret = -ENOMEM;
goto out; goto out;
} }
/* Attach data to the rm */
if (payload_len) {
rm->data.op_sg = rds_message_alloc_sgs(rm, ceil(payload_len, PAGE_SIZE));
ret = rds_message_copy_from_user(rm, msg->msg_iov, payload_len);
if (ret)
goto out;
}
rm->data.op_active = 1;
rm->m_daddr = daddr; rm->m_daddr = daddr;
/* rds_conn_create has a spinlock that runs with IRQ off. /* rds_conn_create has a spinlock that runs with IRQ off.
...@@ -879,22 +999,23 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, ...@@ -879,22 +999,23 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
if (ret) if (ret)
goto out; goto out;
if ((rm->m_rdma_cookie || rm->m_rdma_op) && if (rm->rdma.op_active && !conn->c_trans->xmit_rdma) {
conn->c_trans->xmit_rdma == NULL) {
if (printk_ratelimit()) if (printk_ratelimit())
printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n", printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n",
rm->m_rdma_op, conn->c_trans->xmit_rdma); &rm->rdma, conn->c_trans->xmit_rdma);
ret = -EOPNOTSUPP; ret = -EOPNOTSUPP;
goto out; goto out;
} }
/* If the connection is down, trigger a connect. We may if (rm->atomic.op_active && !conn->c_trans->xmit_atomic) {
* have scheduled a delayed reconnect however - in this case if (printk_ratelimit())
* we should not interfere. printk(KERN_NOTICE "atomic_op %p conn xmit_atomic %p\n",
*/ &rm->atomic, conn->c_trans->xmit_atomic);
if (rds_conn_state(conn) == RDS_CONN_DOWN && ret = -EOPNOTSUPP;
!test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags)) goto out;
queue_delayed_work(rds_wq, &conn->c_conn_w, 0); }
rds_conn_connect_if_down(conn);
ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs); ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs);
if (ret) { if (ret) {
...@@ -938,7 +1059,7 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, ...@@ -938,7 +1059,7 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
rds_stats_inc(s_send_queued); rds_stats_inc(s_send_queued);
if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags)) if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags))
rds_send_worker(&conn->c_send_w.work); rds_send_xmit(conn);
rds_message_put(rm); rds_message_put(rm);
return payload_len; return payload_len;
...@@ -966,20 +1087,15 @@ rds_send_pong(struct rds_connection *conn, __be16 dport) ...@@ -966,20 +1087,15 @@ rds_send_pong(struct rds_connection *conn, __be16 dport)
int ret = 0; int ret = 0;
rm = rds_message_alloc(0, GFP_ATOMIC); rm = rds_message_alloc(0, GFP_ATOMIC);
if (rm == NULL) { if (!rm) {
ret = -ENOMEM; ret = -ENOMEM;
goto out; goto out;
} }
rm->m_daddr = conn->c_faddr; rm->m_daddr = conn->c_faddr;
rm->data.op_active = 1;
/* If the connection is down, trigger a connect. We may rds_conn_connect_if_down(conn);
* have scheduled a delayed reconnect however - in this case
* we should not interfere.
*/
if (rds_conn_state(conn) == RDS_CONN_DOWN &&
!test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
ret = rds_cong_wait(conn->c_fcong, dport, 1, NULL); ret = rds_cong_wait(conn->c_fcong, dport, 1, NULL);
if (ret) if (ret)
...@@ -999,7 +1115,9 @@ rds_send_pong(struct rds_connection *conn, __be16 dport) ...@@ -999,7 +1115,9 @@ rds_send_pong(struct rds_connection *conn, __be16 dport)
rds_stats_inc(s_send_queued); rds_stats_inc(s_send_queued);
rds_stats_inc(s_send_pong); rds_stats_inc(s_send_pong);
queue_delayed_work(rds_wq, &conn->c_send_w, 0); if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags))
rds_send_xmit(conn);
rds_message_put(rm); rds_message_put(rm);
return 0; return 0;
......
...@@ -57,8 +57,8 @@ static const char *const rds_stat_names[] = { ...@@ -57,8 +57,8 @@ static const char *const rds_stat_names[] = {
"recv_ping", "recv_ping",
"send_queue_empty", "send_queue_empty",
"send_queue_full", "send_queue_full",
"send_sem_contention", "send_lock_contention",
"send_sem_queue_raced", "send_lock_queue_raced",
"send_immediate_retry", "send_immediate_retry",
"send_delayed_retry", "send_delayed_retry",
"send_drop_acked", "send_drop_acked",
...@@ -143,7 +143,7 @@ void rds_stats_exit(void) ...@@ -143,7 +143,7 @@ void rds_stats_exit(void)
rds_info_deregister_func(RDS_INFO_COUNTERS, rds_stats_info); rds_info_deregister_func(RDS_INFO_COUNTERS, rds_stats_info);
} }
int __init rds_stats_init(void) int rds_stats_init(void)
{ {
rds_info_register_func(RDS_INFO_COUNTERS, rds_stats_info); rds_info_register_func(RDS_INFO_COUNTERS, rds_stats_info);
return 0; return 0;
......
...@@ -105,13 +105,13 @@ void rds_sysctl_exit(void) ...@@ -105,13 +105,13 @@ void rds_sysctl_exit(void)
unregister_sysctl_table(rds_sysctl_reg_table); unregister_sysctl_table(rds_sysctl_reg_table);
} }
int __init rds_sysctl_init(void) int rds_sysctl_init(void)
{ {
rds_sysctl_reconnect_min = msecs_to_jiffies(1); rds_sysctl_reconnect_min = msecs_to_jiffies(1);
rds_sysctl_reconnect_min_jiffies = rds_sysctl_reconnect_min; rds_sysctl_reconnect_min_jiffies = rds_sysctl_reconnect_min;
rds_sysctl_reg_table = register_sysctl_paths(rds_sysctl_path, rds_sysctl_rds_table); rds_sysctl_reg_table = register_sysctl_paths(rds_sysctl_path, rds_sysctl_rds_table);
if (rds_sysctl_reg_table == NULL) if (!rds_sysctl_reg_table)
return -ENOMEM; return -ENOMEM;
return 0; return 0;
} }
...@@ -200,7 +200,7 @@ static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp) ...@@ -200,7 +200,7 @@ static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp)
struct rds_tcp_connection *tc; struct rds_tcp_connection *tc;
tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp); tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp);
if (tc == NULL) if (!tc)
return -ENOMEM; return -ENOMEM;
tc->t_sock = NULL; tc->t_sock = NULL;
...@@ -258,7 +258,6 @@ struct rds_transport rds_tcp_transport = { ...@@ -258,7 +258,6 @@ struct rds_transport rds_tcp_transport = {
.laddr_check = rds_tcp_laddr_check, .laddr_check = rds_tcp_laddr_check,
.xmit_prepare = rds_tcp_xmit_prepare, .xmit_prepare = rds_tcp_xmit_prepare,
.xmit_complete = rds_tcp_xmit_complete, .xmit_complete = rds_tcp_xmit_complete,
.xmit_cong_map = rds_tcp_xmit_cong_map,
.xmit = rds_tcp_xmit, .xmit = rds_tcp_xmit,
.recv = rds_tcp_recv, .recv = rds_tcp_recv,
.conn_alloc = rds_tcp_conn_alloc, .conn_alloc = rds_tcp_conn_alloc,
...@@ -266,7 +265,6 @@ struct rds_transport rds_tcp_transport = { ...@@ -266,7 +265,6 @@ struct rds_transport rds_tcp_transport = {
.conn_connect = rds_tcp_conn_connect, .conn_connect = rds_tcp_conn_connect,
.conn_shutdown = rds_tcp_conn_shutdown, .conn_shutdown = rds_tcp_conn_shutdown,
.inc_copy_to_user = rds_tcp_inc_copy_to_user, .inc_copy_to_user = rds_tcp_inc_copy_to_user,
.inc_purge = rds_tcp_inc_purge,
.inc_free = rds_tcp_inc_free, .inc_free = rds_tcp_inc_free,
.stats_info_copy = rds_tcp_stats_info_copy, .stats_info_copy = rds_tcp_stats_info_copy,
.exit = rds_tcp_exit, .exit = rds_tcp_exit,
...@@ -276,14 +274,14 @@ struct rds_transport rds_tcp_transport = { ...@@ -276,14 +274,14 @@ struct rds_transport rds_tcp_transport = {
.t_prefer_loopback = 1, .t_prefer_loopback = 1,
}; };
int __init rds_tcp_init(void) int rds_tcp_init(void)
{ {
int ret; int ret;
rds_tcp_conn_slab = kmem_cache_create("rds_tcp_connection", rds_tcp_conn_slab = kmem_cache_create("rds_tcp_connection",
sizeof(struct rds_tcp_connection), sizeof(struct rds_tcp_connection),
0, 0, NULL); 0, 0, NULL);
if (rds_tcp_conn_slab == NULL) { if (!rds_tcp_conn_slab) {
ret = -ENOMEM; ret = -ENOMEM;
goto out; goto out;
} }
......
...@@ -43,7 +43,7 @@ struct rds_tcp_statistics { ...@@ -43,7 +43,7 @@ struct rds_tcp_statistics {
}; };
/* tcp.c */ /* tcp.c */
int __init rds_tcp_init(void); int rds_tcp_init(void);
void rds_tcp_exit(void); void rds_tcp_exit(void);
void rds_tcp_tune(struct socket *sock); void rds_tcp_tune(struct socket *sock);
void rds_tcp_nonagle(struct socket *sock); void rds_tcp_nonagle(struct socket *sock);
...@@ -61,16 +61,15 @@ void rds_tcp_conn_shutdown(struct rds_connection *conn); ...@@ -61,16 +61,15 @@ void rds_tcp_conn_shutdown(struct rds_connection *conn);
void rds_tcp_state_change(struct sock *sk); void rds_tcp_state_change(struct sock *sk);
/* tcp_listen.c */ /* tcp_listen.c */
int __init rds_tcp_listen_init(void); int rds_tcp_listen_init(void);
void rds_tcp_listen_stop(void); void rds_tcp_listen_stop(void);
void rds_tcp_listen_data_ready(struct sock *sk, int bytes); void rds_tcp_listen_data_ready(struct sock *sk, int bytes);
/* tcp_recv.c */ /* tcp_recv.c */
int __init rds_tcp_recv_init(void); int rds_tcp_recv_init(void);
void rds_tcp_recv_exit(void); void rds_tcp_recv_exit(void);
void rds_tcp_data_ready(struct sock *sk, int bytes); void rds_tcp_data_ready(struct sock *sk, int bytes);
int rds_tcp_recv(struct rds_connection *conn); int rds_tcp_recv(struct rds_connection *conn);
void rds_tcp_inc_purge(struct rds_incoming *inc);
void rds_tcp_inc_free(struct rds_incoming *inc); void rds_tcp_inc_free(struct rds_incoming *inc);
int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
size_t size); size_t size);
...@@ -81,8 +80,6 @@ void rds_tcp_xmit_complete(struct rds_connection *conn); ...@@ -81,8 +80,6 @@ void rds_tcp_xmit_complete(struct rds_connection *conn);
int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
unsigned int hdr_off, unsigned int sg, unsigned int off); unsigned int hdr_off, unsigned int sg, unsigned int off);
void rds_tcp_write_space(struct sock *sk); void rds_tcp_write_space(struct sock *sk);
int rds_tcp_xmit_cong_map(struct rds_connection *conn,
struct rds_cong_map *map, unsigned long offset);
/* tcp_stats.c */ /* tcp_stats.c */
DECLARE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats); DECLARE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats);
......
...@@ -45,7 +45,7 @@ void rds_tcp_state_change(struct sock *sk) ...@@ -45,7 +45,7 @@ void rds_tcp_state_change(struct sock *sk)
read_lock(&sk->sk_callback_lock); read_lock(&sk->sk_callback_lock);
conn = sk->sk_user_data; conn = sk->sk_user_data;
if (conn == NULL) { if (!conn) {
state_change = sk->sk_state_change; state_change = sk->sk_state_change;
goto out; goto out;
} }
......
...@@ -116,7 +116,7 @@ void rds_tcp_listen_data_ready(struct sock *sk, int bytes) ...@@ -116,7 +116,7 @@ void rds_tcp_listen_data_ready(struct sock *sk, int bytes)
read_lock(&sk->sk_callback_lock); read_lock(&sk->sk_callback_lock);
ready = sk->sk_user_data; ready = sk->sk_user_data;
if (ready == NULL) { /* check for teardown race */ if (!ready) { /* check for teardown race */
ready = sk->sk_data_ready; ready = sk->sk_data_ready;
goto out; goto out;
} }
...@@ -135,7 +135,7 @@ void rds_tcp_listen_data_ready(struct sock *sk, int bytes) ...@@ -135,7 +135,7 @@ void rds_tcp_listen_data_ready(struct sock *sk, int bytes)
ready(sk, bytes); ready(sk, bytes);
} }
int __init rds_tcp_listen_init(void) int rds_tcp_listen_init(void)
{ {
struct sockaddr_in sin; struct sockaddr_in sin;
struct socket *sock = NULL; struct socket *sock = NULL;
...@@ -178,7 +178,7 @@ void rds_tcp_listen_stop(void) ...@@ -178,7 +178,7 @@ void rds_tcp_listen_stop(void)
struct socket *sock = rds_tcp_listen_sock; struct socket *sock = rds_tcp_listen_sock;
struct sock *sk; struct sock *sk;
if (sock == NULL) if (!sock)
return; return;
sk = sock->sk; sk = sock->sk;
......
...@@ -39,7 +39,7 @@ ...@@ -39,7 +39,7 @@
static struct kmem_cache *rds_tcp_incoming_slab; static struct kmem_cache *rds_tcp_incoming_slab;
void rds_tcp_inc_purge(struct rds_incoming *inc) static void rds_tcp_inc_purge(struct rds_incoming *inc)
{ {
struct rds_tcp_incoming *tinc; struct rds_tcp_incoming *tinc;
tinc = container_of(inc, struct rds_tcp_incoming, ti_inc); tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
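With rds_tcp_inc_purge() dropped from the transport ops and from tcp.h above and made static here, its only remaining caller is presumably rds_tcp_inc_free() in the same file. A hedged sketch of that assumed pairing:

/* Assumed body; only the purge call and the slab are confirmed by this diff. */
void rds_tcp_inc_free(struct rds_incoming *inc)
{
	struct rds_tcp_incoming *tinc;

	tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
	rds_tcp_inc_purge(inc);			/* drop any queued skbs */
	kmem_cache_free(rds_tcp_incoming_slab, tinc);
}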
...@@ -190,10 +190,10 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, ...@@ -190,10 +190,10 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
* processing. * processing.
*/ */
while (left) { while (left) {
if (tinc == NULL) { if (!tinc) {
tinc = kmem_cache_alloc(rds_tcp_incoming_slab, tinc = kmem_cache_alloc(rds_tcp_incoming_slab,
arg->gfp); arg->gfp);
if (tinc == NULL) { if (!tinc) {
desc->error = -ENOMEM; desc->error = -ENOMEM;
goto out; goto out;
} }
...@@ -229,7 +229,7 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, ...@@ -229,7 +229,7 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
if (left && tc->t_tinc_data_rem) { if (left && tc->t_tinc_data_rem) {
clone = skb_clone(skb, arg->gfp); clone = skb_clone(skb, arg->gfp);
if (clone == NULL) { if (!clone) {
desc->error = -ENOMEM; desc->error = -ENOMEM;
goto out; goto out;
} }
...@@ -326,7 +326,7 @@ void rds_tcp_data_ready(struct sock *sk, int bytes) ...@@ -326,7 +326,7 @@ void rds_tcp_data_ready(struct sock *sk, int bytes)
read_lock(&sk->sk_callback_lock); read_lock(&sk->sk_callback_lock);
conn = sk->sk_user_data; conn = sk->sk_user_data;
if (conn == NULL) { /* check for teardown race */ if (!conn) { /* check for teardown race */
ready = sk->sk_data_ready; ready = sk->sk_data_ready;
goto out; goto out;
} }
...@@ -342,12 +342,12 @@ void rds_tcp_data_ready(struct sock *sk, int bytes) ...@@ -342,12 +342,12 @@ void rds_tcp_data_ready(struct sock *sk, int bytes)
ready(sk, bytes); ready(sk, bytes);
} }
int __init rds_tcp_recv_init(void) int rds_tcp_recv_init(void)
{ {
rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming", rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming",
sizeof(struct rds_tcp_incoming), sizeof(struct rds_tcp_incoming),
0, 0, NULL); 0, 0, NULL);
if (rds_tcp_incoming_slab == NULL) if (!rds_tcp_incoming_slab)
return -ENOMEM; return -ENOMEM;
return 0; return 0;
} }
......
...@@ -76,56 +76,6 @@ int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len) ...@@ -76,56 +76,6 @@ int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len)
return kernel_sendmsg(sock, &msg, &vec, 1, vec.iov_len); return kernel_sendmsg(sock, &msg, &vec, 1, vec.iov_len);
} }
/* the core send_sem serializes this with other xmit and shutdown */
int rds_tcp_xmit_cong_map(struct rds_connection *conn,
struct rds_cong_map *map, unsigned long offset)
{
static struct rds_header rds_tcp_map_header = {
.h_flags = RDS_FLAG_CONG_BITMAP,
};
struct rds_tcp_connection *tc = conn->c_transport_data;
unsigned long i;
int ret;
int copied = 0;
/* Some compilers claim cpu_to_be32(constant) isn't a constant, so set h_len at run time. */
rds_tcp_map_header.h_len = cpu_to_be32(RDS_CONG_MAP_BYTES);
if (offset < sizeof(struct rds_header)) {
ret = rds_tcp_sendmsg(tc->t_sock,
(void *)&rds_tcp_map_header + offset,
sizeof(struct rds_header) - offset);
if (ret <= 0)
return ret;
offset += ret;
copied = ret;
if (offset < sizeof(struct rds_header))
return ret;
}
offset -= sizeof(struct rds_header);
i = offset / PAGE_SIZE;
offset = offset % PAGE_SIZE;
BUG_ON(i >= RDS_CONG_MAP_PAGES);
do {
ret = tc->t_sock->ops->sendpage(tc->t_sock,
virt_to_page(map->m_page_addrs[i]),
offset, PAGE_SIZE - offset,
MSG_DONTWAIT);
if (ret <= 0)
break;
copied += ret;
offset += ret;
if (offset == PAGE_SIZE) {
offset = 0;
i++;
}
} while (i < RDS_CONG_MAP_PAGES);
return copied ? copied : ret;
}
/* the core send_sem serializes this with other xmit and shutdown */ /* the core send_sem serializes this with other xmit and shutdown */
int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
unsigned int hdr_off, unsigned int sg, unsigned int off) unsigned int hdr_off, unsigned int sg, unsigned int off)
...@@ -166,21 +116,21 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, ...@@ -166,21 +116,21 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
goto out; goto out;
} }
while (sg < rm->m_nents) { while (sg < rm->data.op_nents) {
ret = tc->t_sock->ops->sendpage(tc->t_sock, ret = tc->t_sock->ops->sendpage(tc->t_sock,
sg_page(&rm->m_sg[sg]), sg_page(&rm->data.op_sg[sg]),
rm->m_sg[sg].offset + off, rm->data.op_sg[sg].offset + off,
rm->m_sg[sg].length - off, rm->data.op_sg[sg].length - off,
MSG_DONTWAIT|MSG_NOSIGNAL); MSG_DONTWAIT|MSG_NOSIGNAL);
rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->m_sg[sg]), rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->data.op_sg[sg]),
rm->m_sg[sg].offset + off, rm->m_sg[sg].length - off, rm->data.op_sg[sg].offset + off, rm->data.op_sg[sg].length - off,
ret); ret);
if (ret <= 0) if (ret <= 0)
break; break;
off += ret; off += ret;
done += ret; done += ret;
if (off == rm->m_sg[sg].length) { if (off == rm->data.op_sg[sg].length) {
off = 0; off = 0;
sg++; sg++;
} }
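The payload scatterlist moves from fields directly on struct rds_message (m_nents, m_sg) to an embedded data operation, rm->data. Only op_nents and op_sg are visible in this hunk; a hypothetical sketch of the shape this implies:

#include <linux/scatterlist.h>

/* Hypothetical: field names beyond op_nents/op_sg are not from this patch. */
struct rds_message {
	/* other message state elided */
	struct {
		unsigned int		op_nents;	/* entries used in op_sg */
		struct scatterlist	*op_sg;		/* payload pages */
	} data;
};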
...@@ -226,7 +176,7 @@ void rds_tcp_write_space(struct sock *sk) ...@@ -226,7 +176,7 @@ void rds_tcp_write_space(struct sock *sk)
read_lock(&sk->sk_callback_lock); read_lock(&sk->sk_callback_lock);
conn = sk->sk_user_data; conn = sk->sk_user_data;
if (conn == NULL) { if (!conn) {
write_space = sk->sk_write_space; write_space = sk->sk_write_space;
goto out; goto out;
} }
......
...@@ -61,7 +61,7 @@ ...@@ -61,7 +61,7 @@
* *
* Transition to state DISCONNECTING/DOWN: * Transition to state DISCONNECTING/DOWN:
* - Inside the shutdown worker; synchronizes with xmit path * - Inside the shutdown worker; synchronizes with xmit path
* through c_send_lock, and with connection management callbacks * through RDS_IN_XMIT, and with connection management callbacks
* via c_cm_lock. * via c_cm_lock.
* *
* For receive callbacks, we rely on the underlying transport * For receive callbacks, we rely on the underlying transport
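The shutdown path now serializes against the transmit path with a flag bit rather than c_send_lock. A minimal sketch of the usual claim/release pattern such a flag implies; the helper names and the c_waitq wait queue are assumptions, not confirmed by this hunk:

/* Assumed helpers: a single sender claims the xmit path atomically;
 * shutdown waits for the bit to clear instead of taking a mutex. */
static int acquire_in_xmit(struct rds_connection *conn)
{
	return test_and_set_bit(RDS_IN_XMIT, &conn->c_flags) == 0;
}

static void release_in_xmit(struct rds_connection *conn)
{
	clear_bit(RDS_IN_XMIT, &conn->c_flags);
	smp_mb__after_clear_bit();
	/* wake anyone (e.g. the shutdown worker) waiting on the bit */
	if (waitqueue_active(&conn->c_waitq))
		wake_up_all(&conn->c_waitq);
}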
...@@ -110,7 +110,7 @@ EXPORT_SYMBOL_GPL(rds_connect_complete); ...@@ -110,7 +110,7 @@ EXPORT_SYMBOL_GPL(rds_connect_complete);
* We should *always* start with a random backoff; otherwise a broken connection * We should *always* start with a random backoff; otherwise a broken connection
* will always take several iterations to be re-established. * will always take several iterations to be re-established.
*/ */
static void rds_queue_reconnect(struct rds_connection *conn) void rds_queue_reconnect(struct rds_connection *conn)
{ {
unsigned long rand; unsigned long rand;
...@@ -156,58 +156,6 @@ void rds_connect_worker(struct work_struct *work) ...@@ -156,58 +156,6 @@ void rds_connect_worker(struct work_struct *work)
} }
} }
void rds_shutdown_worker(struct work_struct *work)
{
struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w);
/* shut it down unless it's down already */
if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) {
/*
* Quiesce the connection mgmt handlers before we start tearing
* things down. We don't hold the mutex for the entire
* duration of the shutdown operation, or we could deadlock
* with the CM handler. Instead, the CM event handler is
* expected to check for state DISCONNECTING.
*/
mutex_lock(&conn->c_cm_lock);
if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING) &&
!rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) {
rds_conn_error(conn, "shutdown called in state %d\n",
atomic_read(&conn->c_state));
mutex_unlock(&conn->c_cm_lock);
return;
}
mutex_unlock(&conn->c_cm_lock);
mutex_lock(&conn->c_send_lock);
conn->c_trans->conn_shutdown(conn);
rds_conn_reset(conn);
mutex_unlock(&conn->c_send_lock);
if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) {
/* This can happen - e.g. when we're in the middle of tearing
* down the connection, and someone unloads the rds module.
* Quite reproducible with loopback connections.
* Mostly harmless.
*/
rds_conn_error(conn,
"%s: failed to transition to state DOWN, "
"current state is %d\n",
__func__,
atomic_read(&conn->c_state));
return;
}
}
/* Then reconnect if it's still live.
* The passive side of an IB loopback connection is never added
* to the conn hash, so we never trigger a reconnect on this
* conn - the reconnect is always triggered by the active peer. */
cancel_delayed_work(&conn->c_conn_w);
if (!hlist_unhashed(&conn->c_hash_node))
rds_queue_reconnect(conn);
}
void rds_send_worker(struct work_struct *work) void rds_send_worker(struct work_struct *work)
{ {
struct rds_connection *conn = container_of(work, struct rds_connection, c_send_w.work); struct rds_connection *conn = container_of(work, struct rds_connection, c_send_w.work);
...@@ -252,15 +200,22 @@ void rds_recv_worker(struct work_struct *work) ...@@ -252,15 +200,22 @@ void rds_recv_worker(struct work_struct *work)
} }
} }
void rds_shutdown_worker(struct work_struct *work)
{
struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w);
rds_conn_shutdown(conn);
}
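The worker is now a thin wrapper, so the state machine removed above presumably moved into rds_conn_shutdown() in connection.c. A compressed, hedged sketch of that assumed destination, built only from the removed worker body plus the RDS_IN_XMIT flag noted earlier (the c_waitq wait is an assumption):

/* Sketch only: reconstructed from the removed rds_shutdown_worker() body. */
void rds_conn_shutdown(struct rds_connection *conn)
{
	if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) {
		mutex_lock(&conn->c_cm_lock);
		if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING) &&
		    !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) {
			rds_conn_error(conn, "shutdown called in state %d\n",
				       atomic_read(&conn->c_state));
			mutex_unlock(&conn->c_cm_lock);
			return;
		}
		mutex_unlock(&conn->c_cm_lock);

		/* c_send_lock is gone; wait out any in-flight transmit instead */
		wait_event(conn->c_waitq,
			   !test_bit(RDS_IN_XMIT, &conn->c_flags));

		conn->c_trans->conn_shutdown(conn);
		rds_conn_reset(conn);

		if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN))
			return;
	}

	/* reconnect only if the conn is still hashed (active side) */
	if (!hlist_unhashed(&conn->c_hash_node))
		rds_queue_reconnect(conn);
}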
void rds_threads_exit(void) void rds_threads_exit(void)
{ {
destroy_workqueue(rds_wq); destroy_workqueue(rds_wq);
} }
int __init rds_threads_init(void) int rds_threads_init(void)
{ {
rds_wq = create_workqueue("krdsd"); rds_wq = create_singlethread_workqueue("krdsd");
if (rds_wq == NULL) if (!rds_wq)
return -ENOMEM; return -ENOMEM;
return 0; return 0;
......
...@@ -71,19 +71,28 @@ void rds_trans_unregister(struct rds_transport *trans) ...@@ -71,19 +71,28 @@ void rds_trans_unregister(struct rds_transport *trans)
} }
EXPORT_SYMBOL_GPL(rds_trans_unregister); EXPORT_SYMBOL_GPL(rds_trans_unregister);
void rds_trans_put(struct rds_transport *trans)
{
if (trans && trans->t_owner)
module_put(trans->t_owner);
}
struct rds_transport *rds_trans_get_preferred(__be32 addr) struct rds_transport *rds_trans_get_preferred(__be32 addr)
{ {
struct rds_transport *ret = NULL; struct rds_transport *ret = NULL;
int i; struct rds_transport *trans;
unsigned int i;
if (IN_LOOPBACK(ntohl(addr))) if (IN_LOOPBACK(ntohl(addr)))
return &rds_loop_transport; return &rds_loop_transport;
down_read(&rds_trans_sem); down_read(&rds_trans_sem);
for (i = 0; i < RDS_TRANS_COUNT; i++) for (i = 0; i < RDS_TRANS_COUNT; i++) {
{ trans = transports[i];
if (transports[i] && (transports[i]->laddr_check(addr) == 0)) {
ret = transports[i]; if (trans && (trans->laddr_check(addr) == 0) &&
(!trans->t_owner || try_module_get(trans->t_owner))) {
ret = trans;
break; break;
} }
} }
......
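rds_trans_get_preferred() now pins the transport's module, so callers must balance it with the new rds_trans_put(). A hypothetical caller showing the pairing (example_probe is illustrative, not part of the patch):

/* Hypothetical caller; only the get/put pairing is the point here. */
static int example_probe(__be32 addr)
{
	struct rds_transport *trans;

	trans = rds_trans_get_preferred(addr);	/* may take a ref on t_owner */
	if (!trans)
		return -EADDRNOTAVAIL;

	/* ... use the transport ... */

	rds_trans_put(trans);			/* drops the module ref, if any */
	return 0;
}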
#ifndef _LINUX_XLIST_H
#define _LINUX_XLIST_H
#include <linux/stddef.h>
#include <linux/poison.h>
#include <linux/prefetch.h>
#include <asm/system.h>
struct xlist_head {
struct xlist_head *next;
};
static inline void INIT_XLIST_HEAD(struct xlist_head *list)
{
list->next = NULL;
}
static inline int xlist_empty(struct xlist_head *head)
{
return head->next == NULL;
}
static inline void xlist_add(struct xlist_head *new, struct xlist_head *tail,
struct xlist_head *head)
{
struct xlist_head *cur;
struct xlist_head *check;
while (1) {
cur = head->next;
tail->next = cur;
check = cmpxchg(&head->next, cur, new);
if (check == cur)
break;
}
}
static inline struct xlist_head *xlist_del_head(struct xlist_head *head)
{
struct xlist_head *cur;
struct xlist_head *check;
struct xlist_head *next;
while (1) {
cur = head->next;
if (!cur)
goto out;
next = cur->next;
check = cmpxchg(&head->next, cur, next);
if (check == cur)
goto out;
}
out:
return cur;
}
static inline struct xlist_head *xlist_del_head_fast(struct xlist_head *head)
{
struct xlist_head *cur;
cur = head->next;
if (!cur)
return NULL;
head->next = cur->next;
return cur;
}
static inline void xlist_splice(struct xlist_head *list,
struct xlist_head *head)
{
struct xlist_head *cur;
WARN_ON(head->next);
cur = xchg(&list->next, NULL);
head->next = cur;
}
#endif
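The new xlist is a minimal lock-free LIFO: producers push with a cmpxchg loop, consumers pop the head, and xlist_splice() steals a whole chain at once. A small illustrative usage sketch; struct frag and frag_pool are made-up names, not from the patch:

/* Illustrative only. */
struct frag {
	struct xlist_head node;
	void *data;
};

static struct xlist_head frag_pool;	/* INIT_XLIST_HEAD(&frag_pool) at init */

static void frag_free(struct frag *f)
{
	/* push a single entry: new and tail are the same node */
	xlist_add(&f->node, &f->node, &frag_pool);
}

static struct frag *frag_alloc(void)
{
	struct xlist_head *x = xlist_del_head(&frag_pool);

	return x ? container_of(x, struct frag, node) : NULL;
}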