Commit cf0ac2b8, authored Sep 09, 2010 by David S. Miller

Merge branch 'for-davem' of git://oss.oracle.com/git/agrover/linux-2.6

Parents: f27e21a8, 905d64c8

Changes: 42 changed files with 2614 additions and 1613 deletions (+2614, -1613).
include/linux/Kbuild        +1    -0
include/linux/rds.h         +65   -41
net/rds/af_rds.c            +22   -4
net/rds/bind.c              +42   -40
net/rds/cong.c              +4    -4
net/rds/connection.c        +114  -43
net/rds/ib.c                +154  -40
net/rds/ib.h                +49   -51
net/rds/ib_cm.c             +126  -57
net/rds/ib_rdma.c           +245  -69
net/rds/ib_recv.c           +332  -217
net/rds/ib_send.c           +408  -274
net/rds/ib_stats.c          +2    -0
net/rds/ib_sysctl.c         +2    -15
net/rds/info.c              +6    -6
net/rds/iw.c                +1    -3
net/rds/iw.h                +5    -6
net/rds/iw_cm.c             +7    -7
net/rds/iw_rdma.c           +0    -1
net/rds/iw_recv.c           +12   -12
net/rds/iw_send.c           +47   -46
net/rds/iw_sysctl.c         +2    -2
net/rds/loop.c              +17   -14
net/rds/message.c           +62   -56
net/rds/page.c              +3    -2
net/rds/rdma.c              +226  -113
net/rds/rdma.h              +0    -85
net/rds/rdma_transport.c    +36   -6
net/rds/rds.h               +153  -34
net/rds/recv.c              +4    -5
net/rds/send.c              +331  -213
net/rds/stats.c             +3    -3
net/rds/sysctl.c            +2    -2
net/rds/tcp.c               +3    -5
net/rds/tcp.h               +3    -6
net/rds/tcp_connect.c       +1    -1
net/rds/tcp_listen.c        +3    -3
net/rds/tcp_recv.c          +7    -7
net/rds/tcp_send.c          +8    -58
net/rds/threads.c           +12   -57
net/rds/transport.c         +14   -5
net/rds/xlist.h             +80   -0
include/linux/Kbuild

@@ -302,6 +302,7 @@ header-y += quota.h
 header-y += radeonfb.h
 header-y += random.h
 header-y += raw.h
+header-y += rds.h
 header-y += reboot.h
 header-y += reiserfs_fs.h
 header-y += reiserfs_xattr.h
include/linux/rds.h

@@ -73,6 +73,10 @@
 #define RDS_CMSG_RDMA_MAP		3
 #define RDS_CMSG_RDMA_STATUS		4
 #define RDS_CMSG_CONG_UPDATE		5
+#define RDS_CMSG_ATOMIC_FADD		6
+#define RDS_CMSG_ATOMIC_CSWP		7
+#define RDS_CMSG_MASKED_ATOMIC_FADD	8
+#define RDS_CMSG_MASKED_ATOMIC_CSWP	9

 #define RDS_INFO_FIRST			10000
 #define RDS_INFO_COUNTERS		10000
@@ -89,9 +93,9 @@
 #define RDS_INFO_LAST			10010

 struct rds_info_counter {
-	u_int8_t	name[32];
-	u_int64_t	value;
-} __packed;
+	uint8_t		name[32];
+	uint64_t	value;
+} __attribute__((packed));

 #define RDS_INFO_CONNECTION_FLAG_SENDING	0x01
 #define RDS_INFO_CONNECTION_FLAG_CONNECTING	0x02
@@ -100,56 +104,48 @@ struct rds_info_counter {
 #define TRANSNAMSIZ	16

 struct rds_info_connection {
-	u_int64_t	next_tx_seq;
-	u_int64_t	next_rx_seq;
+	uint64_t	next_tx_seq;
+	uint64_t	next_rx_seq;
 	__be32		laddr;
 	__be32		faddr;
-	u_int8_t	transport[TRANSNAMSIZ];		/* null term ascii */
-	u_int8_t	flags;
-} __packed;
-
-struct rds_info_flow {
-	__be32		laddr;
-	__be32		faddr;
-	u_int32_t	bytes;
-	__be16		lport;
-	__be16		fport;
-} __packed;
+	uint8_t		transport[TRANSNAMSIZ];		/* null term ascii */
+	uint8_t		flags;
+} __attribute__((packed));

 #define RDS_INFO_MESSAGE_FLAG_ACK	0x01
 #define RDS_INFO_MESSAGE_FLAG_FAST_ACK	0x02

 struct rds_info_message {
-	u_int64_t	seq;
-	u_int32_t	len;
+	uint64_t	seq;
+	uint32_t	len;
 	__be32		laddr;
 	__be32		faddr;
 	__be16		lport;
 	__be16		fport;
-	u_int8_t	flags;
-} __packed;
+	uint8_t		flags;
+} __attribute__((packed));

 struct rds_info_socket {
-	u_int32_t	sndbuf;
+	uint32_t	sndbuf;
 	__be32		bound_addr;
 	__be32		connected_addr;
 	__be16		bound_port;
 	__be16		connected_port;
-	u_int32_t	rcvbuf;
-	u_int64_t	inum;
-} __packed;
+	uint32_t	rcvbuf;
+	uint64_t	inum;
+} __attribute__((packed));

 struct rds_info_tcp_socket {
 	__be32		local_addr;
 	__be16		local_port;
 	__be32		peer_addr;
 	__be16		peer_port;
-	u_int64_t	hdr_rem;
-	u_int64_t	data_rem;
-	u_int32_t	last_sent_nxt;
-	u_int32_t	last_expected_una;
-	u_int32_t	last_seen_una;
-} __packed;
+	uint64_t	hdr_rem;
+	uint64_t	data_rem;
+	uint32_t	last_sent_nxt;
+	uint32_t	last_expected_una;
+	uint32_t	last_seen_una;
+} __attribute__((packed));

 #define RDS_IB_GID_LEN	16
 struct rds_info_rdma_connection {
@@ -203,42 +199,69 @@ struct rds_info_rdma_connection {
  * (so that the application does not have to worry about
  * alignment).
  */
-typedef u_int64_t	rds_rdma_cookie_t;
+typedef uint64_t	rds_rdma_cookie_t;

 struct rds_iovec {
-	u_int64_t	addr;
-	u_int64_t	bytes;
+	uint64_t	addr;
+	uint64_t	bytes;
 };

 struct rds_get_mr_args {
 	struct rds_iovec vec;
-	u_int64_t	cookie_addr;
+	uint64_t	cookie_addr;
 	uint64_t	flags;
 };

 struct rds_get_mr_for_dest_args {
 	struct sockaddr_storage dest_addr;
 	struct rds_iovec	vec;
-	u_int64_t	cookie_addr;
+	uint64_t	cookie_addr;
 	uint64_t	flags;
 };

 struct rds_free_mr_args {
 	rds_rdma_cookie_t cookie;
-	u_int64_t	flags;
+	uint64_t	flags;
 };

 struct rds_rdma_args {
 	rds_rdma_cookie_t cookie;
 	struct rds_iovec remote_vec;
-	u_int64_t	local_vec_addr;
-	u_int64_t	nr_local;
-	u_int64_t	flags;
-	u_int64_t	user_token;
+	uint64_t	local_vec_addr;
+	uint64_t	nr_local;
+	uint64_t	flags;
+	uint64_t	user_token;
 };

+struct rds_atomic_args {
+	rds_rdma_cookie_t cookie;
+	uint64_t	local_addr;
+	uint64_t	remote_addr;
+	union {
+		struct {
+			uint64_t	compare;
+			uint64_t	swap;
+		} cswp;
+		struct {
+			uint64_t	add;
+		} fadd;
+		struct {
+			uint64_t	compare;
+			uint64_t	swap;
+			uint64_t	compare_mask;
+			uint64_t	swap_mask;
+		} m_cswp;
+		struct {
+			uint64_t	add;
+			uint64_t	nocarry_mask;
+		} m_fadd;
+	};
+	uint64_t	flags;
+	uint64_t	user_token;
+};
+
 struct rds_rdma_notify {
-	u_int64_t	user_token;
+	uint64_t	user_token;
 	int32_t		status;
 };
@@ -257,5 +280,6 @@ struct rds_rdma_notify {
 #define RDS_RDMA_USE_ONCE	0x0008	/* free MR after use */
 #define RDS_RDMA_DONTWAIT	0x0010	/* Don't wait in SET_BARRIER */
 #define RDS_RDMA_NOTIFY_ME	0x0020	/* Notify when operation completes */
+#define RDS_RDMA_SILENT		0x0040	/* Do not interrupt remote */

 #endif /* IB_RDS_H */
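The new RDS_CMSG_ATOMIC_* types and struct rds_atomic_args above are consumed as ancillary data on an RDS socket. The following is a rough userspace sketch, not part of this commit: the SOL_RDS socket level, the out-of-band exchange of the MR cookie and remote buffer address, and the small dummy payload are all assumptions made for the example.

    /* Hedged sketch only: issue the new fetch-and-add (RDS_CMSG_ATOMIC_FADD)
     * from userspace by attaching struct rds_atomic_args to sendmsg().
     * Assumes SOL_RDS is exported by linux/rds.h and that the peer's MR
     * cookie and remote buffer address were learned out of band. */
    #include <stdint.h>
    #include <string.h>
    #include <sys/socket.h>
    #include <sys/uio.h>
    #include <netinet/in.h>
    #include <linux/rds.h>

    static ssize_t rds_fetch_add(int fd, struct sockaddr_in *peer,
                                 rds_rdma_cookie_t cookie, uint64_t remote_addr,
                                 uint64_t *old_val, uint64_t add)
    {
        struct rds_atomic_args args;
        char payload = 0;                        /* dummy byte carried with the op */
        struct iovec iov = { &payload, sizeof(payload) };
        char cbuf[CMSG_SPACE(sizeof(args))];
        struct msghdr msg;
        struct cmsghdr *cmsg;

        memset(&args, 0, sizeof(args));
        args.cookie      = cookie;               /* MR cookie from the peer */
        args.remote_addr = remote_addr;          /* peer buffer to operate on */
        args.local_addr  = (uint64_t)(unsigned long)old_val; /* fetched value lands here */
        args.fadd.add    = add;
        args.flags       = RDS_RDMA_NOTIFY_ME;   /* completion notification, as defined above */
        args.user_token  = 1;

        memset(&msg, 0, sizeof(msg));
        msg.msg_name       = peer;
        msg.msg_namelen    = sizeof(*peer);
        msg.msg_iov        = &iov;
        msg.msg_iovlen     = 1;
        msg.msg_control    = cbuf;
        msg.msg_controllen = sizeof(cbuf);

        cmsg = CMSG_FIRSTHDR(&msg);
        cmsg->cmsg_level = SOL_RDS;              /* assumed socket-level constant */
        cmsg->cmsg_type  = RDS_CMSG_ATOMIC_FADD;
        cmsg->cmsg_len   = CMSG_LEN(sizeof(args));
        memcpy(CMSG_DATA(cmsg), &args, sizeof(args));

        return sendmsg(fd, &msg, 0);
    }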
net/rds/af_rds.c

@@ -39,7 +39,15 @@
 #include <net/sock.h>

 #include "rds.h"
 #include "rdma.h"

+char *rds_str_array(char **array, size_t elements, size_t index)
+{
+	if ((index < elements) && array[index])
+		return array[index];
+	else
+		return "unknown";
+}
+EXPORT_SYMBOL(rds_str_array);
+
 /* this is just used for stats gathering :/ */
 static DEFINE_SPINLOCK(rds_sock_lock);
@@ -62,7 +70,7 @@ static int rds_release(struct socket *sock)
 	struct rds_sock *rs;
 	unsigned long flags;

-	if (sk == NULL)
+	if (!sk)
 		goto out;

 	rs = rds_sk_to_rs(sk);
@@ -73,7 +81,15 @@ static int rds_release(struct socket *sock)
 	 * with the socket. */
 	rds_clear_recv_queue(rs);
 	rds_cong_remove_socket(rs);
+
+	/*
+	 * the binding lookup hash uses rcu, we need to
+	 * make sure we sychronize_rcu before we free our
+	 * entry
+	 */
 	rds_remove_bound(rs);
+	synchronize_rcu();
+
 	rds_send_drop_to(rs, NULL);
 	rds_rdma_drop_keys(rs);
 	rds_notify_queue_get(rs, NULL);
@@ -83,6 +99,8 @@ static int rds_release(struct socket *sock)
 	rds_sock_count--;
 	spin_unlock_irqrestore(&rds_sock_lock, flags);

+	rds_trans_put(rs->rs_transport);
+
 	sock->sk = NULL;
 	sock_put(sk);
 out:
@@ -514,7 +532,7 @@ static void rds_sock_info(struct socket *sock, unsigned int len,
 	spin_unlock_irqrestore(&rds_sock_lock, flags);
 }

-static void __exit rds_exit(void)
+static void rds_exit(void)
 {
 	sock_unregister(rds_family_ops.family);
 	proto_unregister(&rds_proto);
@@ -529,7 +547,7 @@ static void __exit rds_exit(void)
 }
 module_exit(rds_exit);

-static int __init rds_init(void)
+static int rds_init(void)
 {
 	int ret;
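The rds_str_array() helper added above is a bounds-checked lookup over a string table; later in this merge, ib_cm.c pairs it with a table built from designated initializers. A standalone userspace sketch of the same idiom follows; the event names and macro names here are made up for illustration, not taken from the kernel.

    /* Illustration of the rds_str_array() idiom: a string table built with
     * designated initializers plus a bounds-checked accessor. */
    #include <stdio.h>
    #include <stddef.h>

    #define STRINGIFY(x) #x
    #define EVENT_STRING(foo) [EV_##foo] = STRINGIFY(EV_##foo)

    enum example_event { EV_CONNECTED, EV_DISCONNECTED, EV_ERROR, EV_MAX };

    static char *event_strings[] = {
        EVENT_STRING(CONNECTED),
        EVENT_STRING(DISCONNECTED),
        EVENT_STRING(ERROR),
    };

    /* mirrors rds_str_array(): out-of-range or unset slots map to "unknown" */
    static char *str_array(char **array, size_t elements, size_t index)
    {
        if ((index < elements) && array[index])
            return array[index];
        else
            return "unknown";
    }

    int main(void)
    {
        size_t n = sizeof(event_strings) / sizeof(event_strings[0]);

        printf("%s\n", str_array(event_strings, n, EV_ERROR)); /* EV_ERROR */
        printf("%s\n", str_array(event_strings, n, 42));       /* unknown */
        return 0;
    }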
net/rds/bind.c

@@ -34,45 +34,52 @@
 #include <net/sock.h>
 #include <linux/in.h>
 #include <linux/if_arp.h>
+#include <linux/jhash.h>
 #include "rds.h"

 /*
  * XXX this probably still needs more work.. no INADDR_ANY, and rbtrees aren't
  * particularly zippy.
  *
  * This is now called for every incoming frame so we arguably care much more
  * about it than we used to.
  */
+
+#define BIND_HASH_SIZE 1024
+static struct hlist_head bind_hash_table[BIND_HASH_SIZE];
 static DEFINE_SPINLOCK(rds_bind_lock);
-static struct rb_root rds_bind_tree = RB_ROOT;

-static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port,
-					   struct rds_sock *insert)
+static struct hlist_head *hash_to_bucket(__be32 addr, __be16 port)
+{
+	return bind_hash_table + (jhash_2words((u32)addr, (u32)port, 0) &
+				  (BIND_HASH_SIZE - 1));
+}
+
+static struct rds_sock *rds_bind_lookup(__be32 addr, __be16 port,
+					struct rds_sock *insert)
 {
-	struct rb_node **p = &rds_bind_tree.rb_node;
-	struct rb_node *parent = NULL;
 	struct rds_sock *rs;
+	struct hlist_node *node;
+	struct hlist_head *head = hash_to_bucket(addr, port);
 	u64 cmp;
 	u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port);

-	while (*p) {
-		parent = *p;
-		rs = rb_entry(parent, struct rds_sock, rs_bound_node);
-
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(rs, node, head, rs_bound_node) {
 		cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) |
 		      be16_to_cpu(rs->rs_bound_port);

-		if (needle < cmp)
-			p = &(*p)->rb_left;
-		else if (needle > cmp)
-			p = &(*p)->rb_right;
-		else
+		if (cmp == needle) {
+			rcu_read_unlock();
 			return rs;
+		}
 	}
+	rcu_read_unlock();

 	if (insert) {
-		rb_link_node(&insert->rs_bound_node, parent, p);
-		rb_insert_color(&insert->rs_bound_node, &rds_bind_tree);
+		/*
+		 * make sure our addr and port are set before
+		 * we are added to the list, other people
+		 * in rcu will find us as soon as the
+		 * hlist_add_head_rcu is done
+		 */
+		insert->rs_bound_addr = addr;
+		insert->rs_bound_port = port;
+		rds_sock_addref(insert);
+
+		hlist_add_head_rcu(&insert->rs_bound_node, head);
 	}
 	return NULL;
 }
@@ -86,15 +93,13 @@ static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port,
 struct rds_sock *rds_find_bound(__be32 addr, __be16 port)
 {
 	struct rds_sock *rs;
-	unsigned long flags;

-	spin_lock_irqsave(&rds_bind_lock, flags);
-	rs = rds_bind_tree_walk(addr, port, NULL);
+	rs = rds_bind_lookup(addr, port, NULL);
+
 	if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
 		rds_sock_addref(rs);
 	else
 		rs = NULL;
-	spin_unlock_irqrestore(&rds_bind_lock, flags);

 	rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr,
 		ntohs(port));
@@ -121,22 +126,15 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
 	do {
 		if (rover == 0)
 			rover++;
-		if (rds_bind_tree_walk(addr, cpu_to_be16(rover), rs) == NULL) {
-			*port = cpu_to_be16(rover);
+		if (!rds_bind_lookup(addr, cpu_to_be16(rover), rs)) {
+			*port = rs->rs_bound_port;
 			ret = 0;
+			rdsdebug("rs %p binding to %pI4:%d\n",
+			  rs, &addr, (int)ntohs(*port));
 			break;
 		}
 	} while (rover++ != last);

-	if (ret == 0) {
-		rs->rs_bound_addr = addr;
-		rs->rs_bound_port = *port;
-		rds_sock_addref(rs);
-
-		rdsdebug("rs %p binding to %pI4:%d\n",
-		  rs, &addr, (int)ntohs(*port));
-	}
-
 	spin_unlock_irqrestore(&rds_bind_lock, flags);

 	return ret;
@@ -153,7 +151,7 @@ void rds_remove_bound(struct rds_sock *rs)
 		  rs, &rs->rs_bound_addr,
 		  ntohs(rs->rs_bound_port));

-		rb_erase(&rs->rs_bound_node, &rds_bind_tree);
+		hlist_del_init_rcu(&rs->rs_bound_node);
 		rds_sock_put(rs);
 		rs->rs_bound_addr = 0;
 	}
@@ -184,7 +182,7 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 		goto out;

 	trans = rds_trans_get_preferred(sin->sin_addr.s_addr);
-	if (trans == NULL) {
+	if (!trans) {
 		ret = -EADDRNOTAVAIL;
 		rds_remove_bound(rs);
 		if (printk_ratelimit())
@@ -198,5 +196,9 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 out:
 	release_sock(sk);

+	/* we might have called rds_remove_bound on error */
+	if (ret)
+		synchronize_rcu();
+
 	return ret;
 }
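The rewritten bind path above buckets each (address, port) pair with jhash_2words() and a mask, which only works because BIND_HASH_SIZE is a power of two. A tiny userspace sketch of that bucket selection follows; the mixing function is a stand-in for the kernel's jhash_2words(), not a copy of it.

    /* Sketch of the power-of-two bucket selection used by the new bind code.
     * mix2() is a stand-in mixer (splitmix64-style); the point illustrated is
     * the masking step, hash & (SIZE - 1). */
    #include <stdint.h>
    #include <stdio.h>

    #define BIND_HASH_SIZE 1024   /* must stay a power of two for the mask */

    static uint32_t mix2(uint32_t a, uint32_t b)
    {
        uint64_t x = ((uint64_t)a << 32) | b;

        x ^= x >> 30; x *= 0xbf58476d1ce4e5b9ULL;
        x ^= x >> 27; x *= 0x94d049bb133111ebULL;
        x ^= x >> 31;
        return (uint32_t)x;
    }

    static unsigned int hash_to_bucket(uint32_t addr, uint16_t port)
    {
        return mix2(addr, port) & (BIND_HASH_SIZE - 1);
    }

    int main(void)
    {
        /* illustrative address/port values */
        printf("bucket %u\n", hash_to_bucket(0xc0a80001u, 4000));
        return 0;
    }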
net/rds/cong.c

@@ -141,7 +141,7 @@ static struct rds_cong_map *rds_cong_from_addr(__be32 addr)
 	unsigned long flags;

 	map = kzalloc(sizeof(struct rds_cong_map), GFP_KERNEL);
-	if (map == NULL)
+	if (!map)
 		return NULL;

 	map->m_addr = addr;
@@ -159,7 +159,7 @@ static struct rds_cong_map *rds_cong_from_addr(__be32 addr)
 	ret = rds_cong_tree_walk(addr, map);
 	spin_unlock_irqrestore(&rds_cong_lock, flags);

-	if (ret == NULL) {
+	if (!ret) {
 		ret = map;
 		map = NULL;
 	}
@@ -205,7 +205,7 @@ int rds_cong_get_maps(struct rds_connection *conn)
 	conn->c_lcong = rds_cong_from_addr(conn->c_laddr);
 	conn->c_fcong = rds_cong_from_addr(conn->c_faddr);

-	if (conn->c_lcong == NULL || conn->c_fcong == NULL)
+	if (!(conn->c_lcong && conn->c_fcong))
 		return -ENOMEM;

 	return 0;
@@ -221,7 +221,7 @@ void rds_cong_queue_updates(struct rds_cong_map *map)
 	list_for_each_entry(conn, &map->m_conn_list, c_map_item) {
 		if (!test_and_set_bit(0, &conn->c_map_queued)) {
 			rds_stats_inc(s_cong_update_queued);
-			queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+			rds_send_xmit(conn);
 		}
 	}
net/rds/connection.c

@@ -37,7 +37,6 @@
 #include "rds.h"
 #include "loop.h"
-#include "rdma.h"

 #define RDS_CONNECTION_HASH_BITS 12
 #define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS)
@@ -63,18 +62,7 @@ static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr)
 		var |= RDS_INFO_CONNECTION_FLAG_##suffix;	\
 } while (0)

-static inline int rds_conn_is_sending(struct rds_connection *conn)
-{
-	int ret = 0;
-
-	if (!mutex_trylock(&conn->c_send_lock))
-		ret = 1;
-	else
-		mutex_unlock(&conn->c_send_lock);
-
-	return ret;
-}
-
+/* rcu read lock must be held or the connection spinlock */
 static struct rds_connection *rds_conn_lookup(struct hlist_head *head,
 					      __be32 laddr, __be32 faddr,
 					      struct rds_transport *trans)
@@ -82,7 +70,7 @@ static struct rds_connection *rds_conn_lookup(struct hlist_head *head,
 	struct rds_connection *conn, *ret = NULL;
 	struct hlist_node *pos;

-	hlist_for_each_entry(conn, pos, head, c_hash_node) {
+	hlist_for_each_entry_rcu(conn, pos, head, c_hash_node) {
 		if (conn->c_faddr == faddr && conn->c_laddr == laddr &&
 				conn->c_trans == trans) {
 			ret = conn;
@@ -129,10 +117,11 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
 {
 	struct rds_connection *conn, *parent = NULL;
 	struct hlist_head *head = rds_conn_bucket(laddr, faddr);
+	struct rds_transport *loop_trans;
 	unsigned long flags;
 	int ret;

-	spin_lock_irqsave(&rds_conn_lock, flags);
+	rcu_read_lock();
 	conn = rds_conn_lookup(head, laddr, faddr, trans);
 	if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport &&
 	    !is_outgoing) {
@@ -143,12 +132,12 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
 		parent = conn;
 		conn = parent->c_passive;
 	}
-	spin_unlock_irqrestore(&rds_conn_lock, flags);
+	rcu_read_unlock();
 	if (conn)
 		goto out;

 	conn = kmem_cache_zalloc(rds_conn_slab, gfp);
-	if (conn == NULL) {
+	if (!conn) {
 		conn = ERR_PTR(-ENOMEM);
 		goto out;
 	}
@@ -159,7 +148,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
 	spin_lock_init(&conn->c_lock);
 	conn->c_next_tx_seq = 1;

-	mutex_init(&conn->c_send_lock);
+	init_waitqueue_head(&conn->c_waitq);
 	INIT_LIST_HEAD(&conn->c_send_queue);
 	INIT_LIST_HEAD(&conn->c_retrans);
@@ -175,7 +164,9 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
 	 * can bind to the destination address then we'd rather the messages
 	 * flow through loopback rather than either transport.
 	 */
-	if (rds_trans_get_preferred(faddr)) {
+	loop_trans = rds_trans_get_preferred(faddr);
+	if (loop_trans) {
+		rds_trans_put(loop_trans);
 		conn->c_loopback = 1;
 		if (is_outgoing && trans->t_prefer_loopback) {
 			/* "outgoing" connection - and the transport
@@ -238,7 +229,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
 		kmem_cache_free(rds_conn_slab, conn);
 		conn = found;
 	} else {
-		hlist_add_head(&conn->c_hash_node, head);
+		hlist_add_head_rcu(&conn->c_hash_node, head);
 		rds_cong_add_conn(conn);
 		rds_conn_count++;
 	}
@@ -263,21 +254,91 @@ struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
 }
 EXPORT_SYMBOL_GPL(rds_conn_create_outgoing);

+void rds_conn_shutdown(struct rds_connection *conn)
+{
+	/* shut it down unless it's down already */
+	if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) {
+		/*
+		 * Quiesce the connection mgmt handlers before we start tearing
+		 * things down. We don't hold the mutex for the entire
+		 * duration of the shutdown operation, else we may be
+		 * deadlocking with the CM handler. Instead, the CM event
+		 * handler is supposed to check for state DISCONNECTING
+		 */
+		mutex_lock(&conn->c_cm_lock);
+		if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING)
+		 && !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) {
+			rds_conn_error(conn, "shutdown called in state %d\n",
+					atomic_read(&conn->c_state));
+			mutex_unlock(&conn->c_cm_lock);
+			return;
+		}
+		mutex_unlock(&conn->c_cm_lock);
+
+		wait_event(conn->c_waitq,
+			   !test_bit(RDS_IN_XMIT, &conn->c_flags));
+
+		conn->c_trans->conn_shutdown(conn);
+		rds_conn_reset(conn);
+
+		if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) {
+			/* This can happen - eg when we're in the middle of tearing
+			 * down the connection, and someone unloads the rds module.
+			 * Quite reproduceable with loopback connections.
+			 * Mostly harmless.
+			 */
+			rds_conn_error(conn,
+				"%s: failed to transition to state DOWN, "
+				"current state is %d\n",
+				__func__,
+				atomic_read(&conn->c_state));
+			return;
+		}
+	}
+
+	/* Then reconnect if it's still live.
+	 * The passive side of an IB loopback connection is never added
+	 * to the conn hash, so we never trigger a reconnect on this
+	 * conn - the reconnect is always triggered by the active peer. */
+	cancel_delayed_work_sync(&conn->c_conn_w);
+	rcu_read_lock();
+	if (!hlist_unhashed(&conn->c_hash_node)) {
+		rcu_read_unlock();
+		rds_queue_reconnect(conn);
+	} else {
+		rcu_read_unlock();
+	}
+}
+
 /*
  * Stop and free a connection.
+ *
+ * This can only be used in very limited circumstances. It assumes that once
+ * the conn has been shutdown that no one else is referencing the connection.
+ * We can only ensure this in the rmmod path in the current code.
  */
 void rds_conn_destroy(struct rds_connection *conn)
 {
 	struct rds_message *rm, *rtmp;
+	unsigned long flags;

 	rdsdebug("freeing conn %p for %pI4 -> "
 		 "%pI4\n", conn, &conn->c_laddr,
 		 &conn->c_faddr);

-	hlist_del_init(&conn->c_hash_node);
+	/* Ensure conn will not be scheduled for reconnect */
+	spin_lock_irq(&rds_conn_lock);
+	hlist_del_init_rcu(&conn->c_hash_node);
+	spin_unlock_irq(&rds_conn_lock);
+	synchronize_rcu();

-	/* wait for the rds thread to shut it down */
-	atomic_set(&conn->c_state, RDS_CONN_ERROR);
-	cancel_delayed_work(&conn->c_conn_w);
-	queue_work(rds_wq, &conn->c_down_w);
-	flush_workqueue(rds_wq);
+	/* shut the connection down */
+	rds_conn_drop(conn);
+	flush_work(&conn->c_down_w);
+
+	/* make sure lingering queued work won't try to ref the conn */
+	cancel_delayed_work_sync(&conn->c_send_w);
+	cancel_delayed_work_sync(&conn->c_recv_w);

 	/* tear down queued messages */
 	list_for_each_entry_safe(rm, rtmp,
@@ -302,7 +363,9 @@ void rds_conn_destroy(struct rds_connection *conn)
 	BUG_ON(!list_empty(&conn->c_retrans));
 	kmem_cache_free(rds_conn_slab, conn);

+	spin_lock_irqsave(&rds_conn_lock, flags);
 	rds_conn_count--;
+	spin_unlock_irqrestore(&rds_conn_lock, flags);
 }
 EXPORT_SYMBOL_GPL(rds_conn_destroy);
@@ -316,23 +379,23 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
 	struct list_head *list;
 	struct rds_connection *conn;
 	struct rds_message *rm;
-	unsigned long flags;
 	unsigned int total = 0;
+	unsigned long flags;
 	size_t i;

 	len /= sizeof(struct rds_info_message);

-	spin_lock_irqsave(&rds_conn_lock, flags);
+	rcu_read_lock();

 	for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
 	     i++, head++) {
-		hlist_for_each_entry(conn, pos, head, c_hash_node) {
+		hlist_for_each_entry_rcu(conn, pos, head, c_hash_node) {
 			if (want_send)
 				list = &conn->c_send_queue;
 			else
 				list = &conn->c_retrans;

-			spin_lock(&conn->c_lock);
+			spin_lock_irqsave(&conn->c_lock, flags);

 			/* XXX too lazy to maintain counts.. */
 			list_for_each_entry(rm, list, m_conn_item) {
@@ -343,11 +406,10 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
 						  conn->c_faddr, 0);
 			}

-			spin_unlock(&conn->c_lock);
+			spin_unlock_irqrestore(&conn->c_lock, flags);
 		}
 	}

-	spin_unlock_irqrestore(&rds_conn_lock, flags);
+	rcu_read_unlock();

 	lens->nr = total;
 	lens->each = sizeof(struct rds_info_message);
@@ -377,19 +439,17 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len,
 	uint64_t buffer[(item_len + 7) / 8];
 	struct hlist_head *head;
 	struct hlist_node *pos;
-	struct hlist_node *tmp;
 	struct rds_connection *conn;
-	unsigned long flags;
 	size_t i;

-	spin_lock_irqsave(&rds_conn_lock, flags);
+	rcu_read_lock();

 	lens->nr = 0;
 	lens->each = item_len;

 	for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
 	     i++, head++) {
-		hlist_for_each_entry_safe(conn, pos, tmp, head, c_hash_node) {
+		hlist_for_each_entry_rcu(conn, pos, head, c_hash_node) {

 			/* XXX no c_lock usage.. */
 			if (!visitor(conn, buffer))
@@ -405,8 +465,7 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len,
 			lens->nr++;
 		}
 	}
-
-	spin_unlock_irqrestore(&rds_conn_lock, flags);
+	rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(rds_for_each_conn_info);
@@ -423,8 +482,8 @@ static int rds_conn_info_visitor(struct rds_connection *conn,
 		sizeof(cinfo->transport));
 	cinfo->flags = 0;

-	rds_conn_info_set(cinfo->flags, rds_conn_is_sending(conn),
-			  SENDING);
+	rds_conn_info_set(cinfo->flags, test_bit(RDS_IN_XMIT, &conn->c_flags),
+			  SENDING);
 	/* XXX Future: return the state rather than these funky bits */
 	rds_conn_info_set(cinfo->flags,
 			  atomic_read(&conn->c_state) == RDS_CONN_CONNECTING,
@@ -444,12 +503,12 @@ static void rds_conn_info(struct socket *sock, unsigned int len,
 				sizeof(struct rds_info_connection));
 }

-int __init rds_conn_init(void)
+int rds_conn_init(void)
 {
 	rds_conn_slab = kmem_cache_create("rds_connection",
 					  sizeof(struct rds_connection),
 					  0, 0, NULL);
-	if (rds_conn_slab == NULL)
+	if (!rds_conn_slab)
 		return -ENOMEM;

 	rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info);
@@ -486,6 +545,18 @@ void rds_conn_drop(struct rds_connection *conn)
 }
 EXPORT_SYMBOL_GPL(rds_conn_drop);

+/*
+ * If the connection is down, trigger a connect. We may have scheduled a
+ * delayed reconnect however - in this case we should not interfere.
+ */
+void rds_conn_connect_if_down(struct rds_connection *conn)
+{
+	if (rds_conn_state(conn) == RDS_CONN_DOWN &&
+	    !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
+		queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
+}
+EXPORT_SYMBOL_GPL(rds_conn_connect_if_down);
+
 /*
  * An error occurred on the connection
  */
net/rds/ib.c

@@ -53,12 +53,71 @@ MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer");
 module_param(rds_ib_retry_count, int, 0444);
 MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error");

+/*
+ * we have a clumsy combination of RCU and a rwsem protecting this list
+ * because it is used both in the get_mr fast path and while blocking in
+ * the FMR flushing path.
+ */
+DECLARE_RWSEM(rds_ib_devices_lock);
 struct list_head rds_ib_devices;

 /* NOTE: if also grabbing ibdev lock, grab this first */
 DEFINE_SPINLOCK(ib_nodev_conns_lock);
 LIST_HEAD(ib_nodev_conns);

+void rds_ib_nodev_connect(void)
+{
+	struct rds_ib_connection *ic;
+
+	spin_lock(&ib_nodev_conns_lock);
+	list_for_each_entry(ic, &ib_nodev_conns, ib_node)
+		rds_conn_connect_if_down(ic->conn);
+	spin_unlock(&ib_nodev_conns_lock);
+}
+
+void rds_ib_dev_shutdown(struct rds_ib_device *rds_ibdev)
+{
+	struct rds_ib_connection *ic;
+	unsigned long flags;
+
+	spin_lock_irqsave(&rds_ibdev->spinlock, flags);
+	list_for_each_entry(ic, &rds_ibdev->conn_list, ib_node)
+		rds_conn_drop(ic->conn);
+	spin_unlock_irqrestore(&rds_ibdev->spinlock, flags);
+}
+
+/*
+ * rds_ib_destroy_mr_pool() blocks on a few things and mrs drop references
+ * from interrupt context so we push freing off into a work struct in krdsd.
+ */
+static void rds_ib_dev_free(struct work_struct *work)
+{
+	struct rds_ib_ipaddr *i_ipaddr, *i_next;
+	struct rds_ib_device *rds_ibdev = container_of(work,
+					struct rds_ib_device, free_work);
+
+	if (rds_ibdev->mr_pool)
+		rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
+	if (rds_ibdev->mr)
+		ib_dereg_mr(rds_ibdev->mr);
+	if (rds_ibdev->pd)
+		ib_dealloc_pd(rds_ibdev->pd);
+
+	list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) {
+		list_del(&i_ipaddr->list);
+		kfree(i_ipaddr);
+	}
+
+	kfree(rds_ibdev);
+}
+
+void rds_ib_dev_put(struct rds_ib_device *rds_ibdev)
+{
+	BUG_ON(atomic_read(&rds_ibdev->refcount) <= 0);
+	if (atomic_dec_and_test(&rds_ibdev->refcount))
+		queue_work(rds_wq, &rds_ibdev->free_work);
+}
+
 void rds_ib_add_one(struct ib_device *device)
 {
 	struct rds_ib_device *rds_ibdev;
@@ -77,11 +136,14 @@ void rds_ib_add_one(struct ib_device *device)
 		goto free_attr;
 	}

-	rds_ibdev = kmalloc(sizeof *rds_ibdev, GFP_KERNEL);
+	rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL,
+				 ibdev_to_node(device));
 	if (!rds_ibdev)
 		goto free_attr;

 	spin_lock_init(&rds_ibdev->spinlock);
+	atomic_set(&rds_ibdev->refcount, 1);
+	INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free);

 	rds_ibdev->max_wrs = dev_attr->max_qp_wr;
 	rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE);
@@ -91,68 +153,107 @@ void rds_ib_add_one(struct ib_device *device)
 			min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) :
 			fmr_pool_size;

+	rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom;
+	rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom;
+
 	rds_ibdev->dev = device;
 	rds_ibdev->pd = ib_alloc_pd(device);
-	if (IS_ERR(rds_ibdev->pd))
-		goto free_dev;
+	if (IS_ERR(rds_ibdev->pd)) {
+		rds_ibdev->pd = NULL;
+		goto put_dev;
+	}

-	rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, IB_ACCESS_LOCAL_WRITE);
-	if (IS_ERR(rds_ibdev->mr))
-		goto err_pd;
+	rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, IB_ACCESS_LOCAL_WRITE);
+	if (IS_ERR(rds_ibdev->mr)) {
+		rds_ibdev->mr = NULL;
+		goto put_dev;
+	}

 	rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev);
 	if (IS_ERR(rds_ibdev->mr_pool)) {
 		rds_ibdev->mr_pool = NULL;
-		goto err_mr;
+		goto put_dev;
 	}

 	INIT_LIST_HEAD(&rds_ibdev->ipaddr_list);
 	INIT_LIST_HEAD(&rds_ibdev->conn_list);
-	list_add_tail(&rds_ibdev->list, &rds_ib_devices);
+
+	down_write(&rds_ib_devices_lock);
+	list_add_tail_rcu(&rds_ibdev->list, &rds_ib_devices);
+	up_write(&rds_ib_devices_lock);
+	atomic_inc(&rds_ibdev->refcount);

 	ib_set_client_data(device, &rds_ib_client, rds_ibdev);
+	atomic_inc(&rds_ibdev->refcount);

-	goto free_attr;
+	rds_ib_nodev_connect();

-err_mr:
-	ib_dereg_mr(rds_ibdev->mr);
-err_pd:
-	ib_dealloc_pd(rds_ibdev->pd);
-free_dev:
-	kfree(rds_ibdev);
+put_dev:
+	rds_ib_dev_put(rds_ibdev);
 free_attr:
 	kfree(dev_attr);
 }

+/*
+ * New connections use this to find the device to associate with the
+ * connection. It's not in the fast path so we're not concerned about the
+ * performance of the IB call. (As of this writing, it uses an interrupt
+ * blocking spinlock to serialize walking a per-device list of all registered
+ * clients.)
+ *
+ * RCU is used to handle incoming connections racing with device teardown.
+ * Rather than use a lock to serialize removal from the client_data and
+ * getting a new reference, we use an RCU grace period. The destruction
+ * path removes the device from client_data and then waits for all RCU
+ * readers to finish.
+ *
+ * A new connection can get NULL from this if its arriving on a
+ * device that is in the process of being removed.
+ */
+struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device)
+{
+	struct rds_ib_device *rds_ibdev;
+
+	rcu_read_lock();
+	rds_ibdev = ib_get_client_data(device, &rds_ib_client);
+	if (rds_ibdev)
+		atomic_inc(&rds_ibdev->refcount);
+	rcu_read_unlock();
+	return rds_ibdev;
+}
+
+/*
+ * The IB stack is letting us know that a device is going away. This can
+ * happen if the underlying HCA driver is removed or if PCI hotplug is removing
+ * the pci function, for example.
+ *
+ * This can be called at any time and can be racing with any other RDS path.
+ */
 void rds_ib_remove_one(struct ib_device *device)
 {
 	struct rds_ib_device *rds_ibdev;
-	struct rds_ib_ipaddr *i_ipaddr, *i_next;

 	rds_ibdev = ib_get_client_data(device, &rds_ib_client);
 	if (!rds_ibdev)
 		return;

-	list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) {
-		list_del(&i_ipaddr->list);
-		kfree(i_ipaddr);
-	}
+	rds_ib_dev_shutdown(rds_ibdev);

-	rds_ib_destroy_conns(rds_ibdev);
+	/* stop connection attempts from getting a reference to this device. */
+	ib_set_client_data(device, &rds_ib_client, NULL);

-	if (rds_ibdev->mr_pool)
-		rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
-
-	ib_dereg_mr(rds_ibdev->mr);
-
-	while (ib_dealloc_pd(rds_ibdev->pd)) {
-		rdsdebug("Failed to dealloc pd %p\n", rds_ibdev->pd);
-		msleep(1);
-	}
+	down_write(&rds_ib_devices_lock);
+	list_del_rcu(&rds_ibdev->list);
+	up_write(&rds_ib_devices_lock);

-	list_del(&rds_ibdev->list);
-	kfree(rds_ibdev);
+	/*
+	 * This synchronize rcu is waiting for readers of both the ib
+	 * client data and the devices list to finish before we drop
+	 * both of those references.
+	 */
+	synchronize_rcu();
+	rds_ib_dev_put(rds_ibdev);
+	rds_ib_dev_put(rds_ibdev);
 }

 struct ib_client rds_ib_client = {
@@ -186,7 +287,7 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
 		rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
 		rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);

-		rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
+		rds_ibdev = ic->rds_ibdev;
 		iinfo->max_send_wr = ic->i_send_ring.w_nr;
 		iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
 		iinfo->max_send_sge = rds_ibdev->max_sge;
@@ -248,29 +349,36 @@ static int rds_ib_laddr_check(__be32 addr)
 	return ret;
 }

+static void rds_ib_unregister_client(void)
+{
+	ib_unregister_client(&rds_ib_client);
+	/* wait for rds_ib_dev_free() to complete */
+	flush_workqueue(rds_wq);
+}
+
 void rds_ib_exit(void)
 {
 	rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
+	rds_ib_unregister_client();
 	rds_ib_destroy_nodev_conns();
-	ib_unregister_client(&rds_ib_client);
 	rds_ib_sysctl_exit();
 	rds_ib_recv_exit();
 	rds_trans_unregister(&rds_ib_transport);
+	rds_ib_fmr_exit();
 }

 struct rds_transport rds_ib_transport = {
 	.laddr_check		= rds_ib_laddr_check,
 	.xmit_complete		= rds_ib_xmit_complete,
 	.xmit			= rds_ib_xmit,
-	.xmit_cong_map		= NULL,
 	.xmit_rdma		= rds_ib_xmit_rdma,
+	.xmit_atomic		= rds_ib_xmit_atomic,
 	.recv			= rds_ib_recv,
 	.conn_alloc		= rds_ib_conn_alloc,
 	.conn_free		= rds_ib_conn_free,
 	.conn_connect		= rds_ib_conn_connect,
 	.conn_shutdown		= rds_ib_conn_shutdown,
 	.inc_copy_to_user	= rds_ib_inc_copy_to_user,
-	.inc_purge		= rds_ib_inc_purge,
 	.inc_free		= rds_ib_inc_free,
 	.cm_initiate_connect	= rds_ib_cm_initiate_connect,
 	.cm_handle_connect	= rds_ib_cm_handle_connect,
@@ -286,16 +394,20 @@ struct rds_transport rds_ib_transport = {
 	.t_type			= RDS_TRANS_IB
 };

-int __init rds_ib_init(void)
+int rds_ib_init(void)
 {
 	int ret;

 	INIT_LIST_HEAD(&rds_ib_devices);

-	ret = ib_register_client(&rds_ib_client);
+	ret = rds_ib_fmr_init();
 	if (ret)
 		goto out;

+	ret = ib_register_client(&rds_ib_client);
+	if (ret)
+		goto out_fmr_exit;
+
 	ret = rds_ib_sysctl_init();
 	if (ret)
 		goto out_ibreg;
@@ -317,7 +429,9 @@ int __init rds_ib_init(void)
 out_sysctl:
 	rds_ib_sysctl_exit();
 out_ibreg:
-	ib_unregister_client(&rds_ib_client);
+	rds_ib_unregister_client();
+out_fmr_exit:
+	rds_ib_fmr_exit();
 out:
 	return ret;
 }
net/rds/ib.h

@@ -3,11 +3,13 @@
 #include <rdma/ib_verbs.h>
 #include <rdma/rdma_cm.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
 #include "rds.h"
 #include "rdma_transport.h"

 #define RDS_FMR_SIZE			256
-#define RDS_FMR_POOL_SIZE		4096
+#define RDS_FMR_POOL_SIZE		8192

 #define RDS_IB_MAX_SGE			8
 #define RDS_IB_RECV_SGE			2
@@ -19,6 +21,9 @@
 #define RDS_IB_SUPPORTED_PROTOCOLS	0x00000003	/* minor versions supported */

+#define RDS_IB_RECYCLE_BATCH_COUNT	32
+
+extern struct rw_semaphore rds_ib_devices_lock;
 extern struct list_head rds_ib_devices;

 /*
@@ -26,20 +31,29 @@ extern struct list_head rds_ib_devices;
 * try and minimize the amount of memory tied up both the device and
 * socket receive queues.
 */
-/* page offset of the final full frag that fits in the page */
-#define RDS_PAGE_LAST_OFF (((PAGE_SIZE  / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE)
 struct rds_page_frag {
 	struct list_head	f_item;
-	struct page		*f_page;
-	unsigned long		f_offset;
-	dma_addr_t		f_mapped;
+	struct list_head	f_cache_entry;
+	struct scatterlist	f_sg;
 };

 struct rds_ib_incoming {
 	struct list_head	ii_frags;
+	struct list_head	ii_cache_entry;
 	struct rds_incoming	ii_inc;
 };

+struct rds_ib_cache_head {
+	struct list_head *first;
+	unsigned long count;
+};
+
+struct rds_ib_refill_cache {
+	struct rds_ib_cache_head *percpu;
+	struct list_head	 *xfer;
+	struct list_head	 *ready;
+};
+
 struct rds_ib_connect_private {
 	/* Add new fields at the end, and don't permute existing fields. */
 	__be32			dp_saddr;
@@ -53,8 +67,7 @@ struct rds_ib_connect_private {
 };

 struct rds_ib_send_work {
-	struct rds_message	*s_rm;
-	struct rds_rdma_op	*s_op;
+	void			*s_op;
 	struct ib_send_wr	s_wr;
 	struct ib_sge		s_sge[RDS_IB_MAX_SGE];
 	unsigned long		s_queued;
@@ -92,10 +105,11 @@ struct rds_ib_connection {
 	/* tx */
 	struct rds_ib_work_ring	i_send_ring;
-	struct rds_message	*i_rm;
+	struct rm_data_op	*i_data_op;
 	struct rds_header	*i_send_hdrs;
 	u64			i_send_hdrs_dma;
 	struct rds_ib_send_work *i_sends;
+	atomic_t		i_signaled_sends;

 	/* rx */
 	struct tasklet_struct	i_recv_tasklet;
@@ -106,8 +120,9 @@ struct rds_ib_connection {
 	struct rds_header	*i_recv_hdrs;
 	u64			i_recv_hdrs_dma;
 	struct rds_ib_recv_work *i_recvs;
-	struct rds_page_frag	i_frag;
 	u64			i_ack_recv;	/* last ACK received */
+	struct rds_ib_refill_cache i_cache_incs;
+	struct rds_ib_refill_cache i_cache_frags;

 	/* sending acks */
 	unsigned long		i_ack_flags;
@@ -138,7 +153,6 @@ struct rds_ib_connection {
 	/* Batched completions */
 	unsigned int		i_unsignaled_wrs;
-	long			i_unsignaled_bytes;
 };

 /* This assumes that atomic_t is at least 32 bits */
@@ -164,9 +178,17 @@ struct rds_ib_device {
 	unsigned int		max_fmrs;
 	int			max_sge;
 	unsigned int		max_wrs;
+	unsigned int		max_initiator_depth;
+	unsigned int		max_responder_resources;
 	spinlock_t		spinlock;	/* protect the above */
+	atomic_t		refcount;
+	struct work_struct	free_work;
 };

+#define pcidev_to_node(pcidev) pcibus_to_node(pcidev->bus)
+#define ibdev_to_node(ibdev) pcidev_to_node(to_pci_dev(ibdev->dma_device))
+#define rdsibdev_to_node(rdsibdev) ibdev_to_node(rdsibdev->dev)
+
 /* bits for i_ack_flags */
 #define IB_ACK_IN_FLIGHT	0
 #define IB_ACK_REQUESTED	1
@@ -202,6 +224,8 @@ struct rds_ib_statistics {
 	uint64_t	s_ib_rdma_mr_pool_flush;
 	uint64_t	s_ib_rdma_mr_pool_wait;
 	uint64_t	s_ib_rdma_mr_pool_depleted;
+	uint64_t	s_ib_atomic_cswp;
+	uint64_t	s_ib_atomic_fadd;
 };

 extern struct workqueue_struct *rds_ib_wq;
@@ -243,6 +267,8 @@ static inline void rds_ib_dma_sync_sg_for_device(struct ib_device *dev,
 extern struct rds_transport rds_ib_transport;
 extern void rds_ib_add_one(struct ib_device *device);
 extern void rds_ib_remove_one(struct ib_device *device);
+struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device);
+void rds_ib_dev_put(struct rds_ib_device *rds_ibdev);
 extern struct ib_client rds_ib_client;

 extern unsigned int fmr_pool_size;
@@ -258,7 +284,7 @@ void rds_ib_conn_free(void *arg);
 int rds_ib_conn_connect(struct rds_connection *conn);
 void rds_ib_conn_shutdown(struct rds_connection *conn);
 void rds_ib_state_change(struct sock *sk);
-int __init rds_ib_listen_init(void);
+int rds_ib_listen_init(void);
 void rds_ib_listen_stop(void);
 void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...);
 int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
@@ -275,15 +301,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn,
 int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr);
 void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
 void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
-void __rds_ib_destroy_conns(struct list_head *list, spinlock_t *list_lock);
-static inline void rds_ib_destroy_nodev_conns(void)
-{
-	__rds_ib_destroy_conns(&ib_nodev_conns, &ib_nodev_conns_lock);
-}
-static inline void rds_ib_destroy_conns(struct rds_ib_device *rds_ibdev)
-{
-	__rds_ib_destroy_conns(&rds_ibdev->conn_list, &rds_ibdev->spinlock);
-}
+void rds_ib_destroy_nodev_conns(void);
 struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *);
 void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo);
 void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
@@ -292,14 +310,16 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
 void rds_ib_sync_mr(void *trans_private, int dir);
 void rds_ib_free_mr(void *trans_private, int invalidate);
 void rds_ib_flush_mrs(void);
+int rds_ib_fmr_init(void);
+void rds_ib_fmr_exit(void);

 /* ib_recv.c */
-int __init rds_ib_recv_init(void);
+int rds_ib_recv_init(void);
 void rds_ib_recv_exit(void);
 int rds_ib_recv(struct rds_connection *conn);
-int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
-		       gfp_t page_gfp, int prefill);
-void rds_ib_inc_purge(struct rds_incoming *inc);
+int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic);
+void rds_ib_recv_free_caches(struct rds_ib_connection *ic);
+void rds_ib_recv_refill(struct rds_connection *conn, int prefill);
 void rds_ib_inc_free(struct rds_incoming *inc);
 int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
			     size_t size);
@@ -325,17 +345,19 @@ u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest);
 extern wait_queue_head_t rds_ib_ring_empty_wait;

 /* ib_send.c */
+char *rds_ib_wc_status_str(enum ib_wc_status status);
 void rds_ib_xmit_complete(struct rds_connection *conn);
 int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
		unsigned int hdr_off, unsigned int sg, unsigned int off);
 void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context);
 void rds_ib_send_init_ring(struct rds_ib_connection *ic);
 void rds_ib_send_clear_ring(struct rds_ib_connection *ic);
-int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op);
+int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op);
 void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits);
 void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted);
 int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted,
			     u32 *adv_credits, int need_posted, int max_posted);
+int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op);

 /* ib_stats.c */
 DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats);
@@ -344,7 +366,7 @@ unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
			      unsigned int avail);

 /* ib_sysctl.c */
-int __init rds_ib_sysctl_init(void);
+int rds_ib_sysctl_init(void);
 void rds_ib_sysctl_exit(void);
 extern unsigned long rds_ib_sysctl_max_send_wr;
 extern unsigned long rds_ib_sysctl_max_recv_wr;
@@ -354,28 +376,4 @@ extern unsigned long rds_ib_sysctl_max_recv_allocation;
 extern unsigned int rds_ib_sysctl_flow_control;
 extern ctl_table rds_ib_sysctl_table[];

-/*
- * Helper functions for getting/setting the header and data SGEs in
- * RDS packets (not RDMA)
- *
- * From version 3.1 onwards, header is in front of data in the sge.
- */
-static inline struct ib_sge *
-rds_ib_header_sge(struct rds_ib_connection *ic, struct ib_sge *sge)
-{
-	if (ic->conn->c_version > RDS_PROTOCOL_3_0)
-		return &sge[0];
-	else
-		return &sge[1];
-}
-
-static inline struct ib_sge *
-rds_ib_data_sge(struct rds_ib_connection *ic, struct ib_sge *sge)
-{
-	if (ic->conn->c_version > RDS_PROTOCOL_3_0)
-		return &sge[1];
-	else
-		return &sge[0];
-}
-
 #endif
net/rds/ib_cm.c

@@ -38,6 +38,36 @@
 #include "rds.h"
 #include "ib.h"

+static char *rds_ib_event_type_strings[] = {
+#define RDS_IB_EVENT_STRING(foo) \
+		[IB_EVENT_##foo] = __stringify(IB_EVENT_##foo)
+	RDS_IB_EVENT_STRING(CQ_ERR),
+	RDS_IB_EVENT_STRING(QP_FATAL),
+	RDS_IB_EVENT_STRING(QP_REQ_ERR),
+	RDS_IB_EVENT_STRING(QP_ACCESS_ERR),
+	RDS_IB_EVENT_STRING(COMM_EST),
+	RDS_IB_EVENT_STRING(SQ_DRAINED),
+	RDS_IB_EVENT_STRING(PATH_MIG),
+	RDS_IB_EVENT_STRING(PATH_MIG_ERR),
+	RDS_IB_EVENT_STRING(DEVICE_FATAL),
+	RDS_IB_EVENT_STRING(PORT_ACTIVE),
+	RDS_IB_EVENT_STRING(PORT_ERR),
+	RDS_IB_EVENT_STRING(LID_CHANGE),
+	RDS_IB_EVENT_STRING(PKEY_CHANGE),
+	RDS_IB_EVENT_STRING(SM_CHANGE),
+	RDS_IB_EVENT_STRING(SRQ_ERR),
+	RDS_IB_EVENT_STRING(SRQ_LIMIT_REACHED),
+	RDS_IB_EVENT_STRING(QP_LAST_WQE_REACHED),
+	RDS_IB_EVENT_STRING(CLIENT_REREGISTER),
+#undef RDS_IB_EVENT_STRING
+};
+
+static char *rds_ib_event_str(enum ib_event_type type)
+{
+	return rds_str_array(rds_ib_event_type_strings,
+			     ARRAY_SIZE(rds_ib_event_type_strings), type);
+};
+
 /*
  * Set the selected protocol version
  */
@@ -95,7 +125,6 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event)
 {
 	const struct rds_ib_connect_private *dp = NULL;
 	struct rds_ib_connection *ic = conn->c_transport_data;
-	struct rds_ib_device *rds_ibdev;
 	struct ib_qp_attr qp_attr;
 	int err;
@@ -111,11 +140,21 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event)
 		}
 	}

-	printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n",
-			&conn->c_faddr,
-			RDS_PROTOCOL_MAJOR(conn->c_version),
-			RDS_PROTOCOL_MINOR(conn->c_version),
-			ic->i_flowctl ? ", flow control" : "");
+	if (conn->c_version < RDS_PROTOCOL(3, 1)) {
+		printk(KERN_NOTICE "RDS/IB: Connection to %pI4 version %u.%u failed,"
+		       " no longer supported\n",
+		       &conn->c_faddr,
+		       RDS_PROTOCOL_MAJOR(conn->c_version),
+		       RDS_PROTOCOL_MINOR(conn->c_version));
+		rds_conn_destroy(conn);
+		return;
+	} else {
+		printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n",
+		       &conn->c_faddr,
+		       RDS_PROTOCOL_MAJOR(conn->c_version),
+		       RDS_PROTOCOL_MINOR(conn->c_version),
+		       ic->i_flowctl ? ", flow control" : "");
+	}

 	/*
	 * Init rings and fill recv. this needs to wait until protocol negotiation
@@ -125,7 +164,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event)
 	rds_ib_recv_init_ring(ic);
 	/* Post receive buffers - as a side effect, this will update
	 * the posted credit count. */
-	rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1);
+	rds_ib_recv_refill(conn, 1);

 	/* Tune RNR behavior */
 	rds_ib_tune_rnr(ic, &qp_attr);
@@ -135,12 +174,11 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event)
 	if (err)
 		printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err);

-	/* update ib_device with this local ipaddr & conn */
-	rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
-	err = rds_ib_update_ipaddr(rds_ibdev, conn->c_laddr);
+	/* update ib_device with this local ipaddr */
+	err = rds_ib_update_ipaddr(ic->rds_ibdev, conn->c_laddr);
 	if (err)
-		printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", err);
-	rds_ib_add_conn(rds_ibdev, conn);
+		printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n",
+			err);

 	/* If the peer gave us the last packet it saw, process this as if
	 * we had received a regular ACK. */
@@ -153,18 +191,23 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event)
 static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
			struct rdma_conn_param *conn_param,
			struct rds_ib_connect_private *dp,
-			u32 protocol_version)
+			u32 protocol_version,
+			u32 max_responder_resources,
+			u32 max_initiator_depth)
 {
+	struct rds_ib_connection *ic = conn->c_transport_data;
+	struct rds_ib_device *rds_ibdev = ic->rds_ibdev;
+
 	memset(conn_param, 0, sizeof(struct rdma_conn_param));
-	/* XXX tune these? */
-	conn_param->responder_resources = 1;
-	conn_param->initiator_depth = 1;
+
+	conn_param->responder_resources =
+		min_t(u32, rds_ibdev->max_responder_resources, max_responder_resources);
+	conn_param->initiator_depth =
+		min_t(u32, rds_ibdev->max_initiator_depth, max_initiator_depth);
 	conn_param->retry_count = min_t(unsigned int, rds_ib_retry_count, 7);
 	conn_param->rnr_retry_count = 7;

 	if (dp) {
-		struct rds_ib_connection *ic = conn->c_transport_data;
-
 		memset(dp, 0, sizeof(*dp));
 		dp->dp_saddr = conn->c_laddr;
 		dp->dp_daddr = conn->c_faddr;
@@ -189,7 +232,8 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
 static void rds_ib_cq_event_handler(struct ib_event *event, void *data)
 {
-	rdsdebug("event %u data %p\n", event->event, data);
+	rdsdebug("event %u (%s) data %p\n",
+		 event->event, rds_ib_event_str(event->event), data);
 }

 static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
@@ -197,16 +241,18 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
 	struct rds_connection *conn = data;
 	struct rds_ib_connection *ic = conn->c_transport_data;

-	rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event);
+	rdsdebug("conn %p ic %p event %u (%s)\n", conn, ic, event->event,
+		 rds_ib_event_str(event->event));

 	switch (event->event) {
 	case IB_EVENT_COMM_EST:
 		rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
 		break;
 	default:
-		rdsdebug("Fatal QP Event %u "
+		rdsdebug("Fatal QP Event %u (%s) "
			"- connection %pI4->%pI4, reconnecting\n",
-			event->event, &conn->c_laddr, &conn->c_faddr);
+			event->event, rds_ib_event_str(event->event),
+			&conn->c_laddr, &conn->c_faddr);
 		rds_conn_drop(conn);
 		break;
 	}
@@ -224,18 +270,16 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 	struct rds_ib_device *rds_ibdev;
 	int ret;

-	/* rds_ib_add_one creates a rds_ib_device object per IB device,
-	 * and allocates a protection domain, memory range and FMR pool
-	 * for each.  If that fails for any reason, it will not register
-	 * the rds_ibdev at all.
+	/*
+	 * It's normal to see a null device if an incoming connection races
+	 * with device removal, so we don't print a warning.
	 */
-	rds_ibdev = ib_get_client_data(dev, &rds_ib_client);
-	if (rds_ibdev == NULL) {
-		if (printk_ratelimit())
-			printk(KERN_NOTICE "RDS/IB: No client_data for device %s\n",
-					dev->name);
+	rds_ibdev = rds_ib_get_client_data(dev);
+	if (!rds_ibdev)
 		return -EOPNOTSUPP;
-	}
+
+	/* add the conn now so that connection establishment has the dev */
+	rds_ib_add_conn(rds_ibdev, conn);

 	if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1)
 		rds_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1);
@@ -306,7 +350,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 				ic->i_send_ring.w_nr *
					sizeof(struct rds_header),
 				&ic->i_send_hdrs_dma, GFP_KERNEL);
-	if (ic->i_send_hdrs == NULL) {
+	if (!ic->i_send_hdrs) {
 		ret = -ENOMEM;
 		rdsdebug("ib_dma_alloc_coherent send failed\n");
 		goto out;
@@ -316,7 +360,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 				ic->i_recv_ring.w_nr *
					sizeof(struct rds_header),
 				&ic->i_recv_hdrs_dma, GFP_KERNEL);
-	if (ic->i_recv_hdrs == NULL) {
+	if (!ic->i_recv_hdrs) {
 		ret = -ENOMEM;
 		rdsdebug("ib_dma_alloc_coherent recv failed\n");
 		goto out;
@@ -324,22 +368,24 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 	ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
				       &ic->i_ack_dma, GFP_KERNEL);
-	if (ic->i_ack == NULL) {
+	if (!ic->i_ack) {
 		ret = -ENOMEM;
 		rdsdebug("ib_dma_alloc_coherent ack failed\n");
 		goto out;
 	}

-	ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work));
-	if (ic->i_sends == NULL) {
+	ic->i_sends = vmalloc_node(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work),
+				   ibdev_to_node(dev));
+	if (!ic->i_sends) {
 		ret = -ENOMEM;
 		rdsdebug("send allocation failed\n");
 		goto out;
 	}
 	memset(ic->i_sends, 0, ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work));

-	ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work));
-	if (ic->i_recvs == NULL) {
+	ic->i_recvs = vmalloc_node(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work),
+				   ibdev_to_node(dev));
+	if (!ic->i_recvs) {
 		ret = -ENOMEM;
 		rdsdebug("recv allocation failed\n");
 		goto out;
@@ -352,6 +398,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
		 ic->i_send_cq, ic->i_recv_cq);

 out:
+	rds_ib_dev_put(rds_ibdev);
 	return ret;
 }
@@ -409,7 +456,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
 	struct rds_ib_connection *ic = NULL;
 	struct rdma_conn_param conn_param;
 	u32 version;
-	int err, destroy = 1;
+	int err = 1, destroy = 1;

 	/* Check whether the remote protocol version matches ours. */
 	version = rds_ib_protocol_compatible(event);
@@ -448,7 +495,6 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
			/* Wait and see - our connect may still be succeeding */
			rds_ib_stats_inc(s_ib_connect_raced);
		}
-		mutex_unlock(&conn->c_cm_lock);
		goto out;
	}
@@ -479,20 +525,20 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
		goto out;
	}

-	rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version);
+	rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version,
+		event->param.conn.responder_resources,
+		event->param.conn.initiator_depth);

	/* rdma_accept() calls rdma_reject() internally if it fails */
	err = rdma_accept(cm_id, &conn_param);
-	mutex_unlock(&conn->c_cm_lock);
-	if (err) {
+	if (err)
		rds_ib_conn_error(conn, "rdma_accept failed (%d)\n", err);
-		goto out;
-	}

-	return 0;
 out:
-	rdma_reject(cm_id, NULL, 0);
+	if (conn)
+		mutex_unlock(&conn->c_cm_lock);
+	if (err)
+		rdma_reject(cm_id, NULL, 0);
	return destroy;
 }
@@ -516,8 +562,8 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
		goto out;
	}

-	rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION);
-
+	rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION,
+		UINT_MAX, UINT_MAX);
	ret = rdma_connect(cm_id, &conn_param);
	if (ret)
		rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret);
@@ -601,9 +647,19 @@ void rds_ib_conn_shutdown(struct rds_connection *conn)
				ic->i_cm_id, err);
		}

+		/*
+		 * We want to wait for tx and rx completion to finish
+		 * before we tear down the connection, but we have to be
+		 * careful not to get stuck waiting on a send ring that
+		 * only has unsignaled sends in it.  We've shutdown new
+		 * sends before getting here so by waiting for signaled
+		 * sends to complete we're ensured that there will be no
+		 * more tx processing.
+		 */
		wait_event(rds_ib_ring_empty_wait,
-			rds_ib_ring_empty(&ic->i_send_ring) &&
-			rds_ib_ring_empty(&ic->i_recv_ring));
+			   rds_ib_ring_empty(&ic->i_recv_ring) &&
+			   (atomic_read(&ic->i_signaled_sends) == 0));
+		tasklet_kill(&ic->i_recv_tasklet);

		if (ic->i_send_hdrs)
			ib_dma_free_coherent(dev,
@@ -654,9 +710,12 @@ void rds_ib_conn_shutdown(struct rds_connection *conn)
	BUG_ON(ic->rds_ibdev);

	/* Clear pending transmit */
-	if (ic->i_rm) {
-		rds_message_put(ic->i_rm);
-		ic->i_rm = NULL;
+	if (ic->i_data_op) {
+		struct rds_message *rm;
+
+		rm = container_of(ic->i_data_op, struct rds_message, data);
+		rds_message_put(rm);
+		ic->i_data_op = NULL;
	}

	/* Clear the ACK state */
@@ -690,12 +749,19 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
 {
	struct rds_ib_connection *ic;
	unsigned long flags;
+	int ret;

	/* XXX too lazy? */
	ic = kzalloc(sizeof(struct rds_ib_connection), GFP_KERNEL);
-	if (ic == NULL)
+	if (!ic)
		return -ENOMEM;

+	ret = rds_ib_recv_alloc_caches(ic);
+	if (ret) {
+		kfree(ic);
+		return ret;
+	}
+
	INIT_LIST_HEAD(&ic->ib_node);
	tasklet_init(&ic->i_recv_tasklet, rds_ib_recv_tasklet_fn,
		     (unsigned long) ic);
@@ -703,6 +769,7 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
 #ifndef KERNEL_HAS_ATOMIC64
	spin_lock_init(&ic->i_ack_lock);
 #endif
+	atomic_set(&ic->i_signaled_sends, 0);

	/*
	 * rds_ib_conn_shutdown() waits for these to be emptied so they
@@ -744,6 +811,8 @@ void rds_ib_conn_free(void *arg)
	list_del(&ic->ib_node);
	spin_unlock_irq(lock_ptr);

+	rds_ib_recv_free_caches(ic);
+
	kfree(ic);
 }
net/rds/ib_rdma.c
View file @
cf0ac2b8
...
...
@@ -32,11 +32,16 @@
*/
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/rculist.h>
#include "rds.h"
#include "rdma.h"
#include "ib.h"
#include "xlist.h"
struct
workqueue_struct
*
rds_ib_fmr_wq
;
static
DEFINE_PER_CPU
(
unsigned
long
,
clean_list_grace
);
#define CLEAN_LIST_BUSY_BIT 0
/*
* This is stored as mr->r_trans_private.
...
...
@@ -45,7 +50,11 @@ struct rds_ib_mr {
struct
rds_ib_device
*
device
;
struct
rds_ib_mr_pool
*
pool
;
struct
ib_fmr
*
fmr
;
struct
list_head
list
;
struct
xlist_head
xlist
;
/* unmap_list is for freeing */
struct
list_head
unmap_list
;
unsigned
int
remap_count
;
struct
scatterlist
*
sg
;
...
...
@@ -59,14 +68,16 @@ struct rds_ib_mr {
*/
struct
rds_ib_mr_pool
{
struct
mutex
flush_lock
;
/* serialize fmr invalidate */
struct
work_struct
flush_worker
;
/* flush worker */
struct
delayed_work
flush_worker
;
/* flush worker */
spinlock_t
list_lock
;
/* protect variables below */
atomic_t
item_count
;
/* total # of MRs */
atomic_t
dirty_count
;
/* # dirty of MRs */
struct
list_head
drop_list
;
/* MRs that have reached their max_maps limit */
struct
list_head
free_list
;
/* unused MRs */
struct
list_head
clean_list
;
/* unused & unamapped MRs */
struct
xlist_head
drop_list
;
/* MRs that have reached their max_maps limit */
struct
xlist_head
free_list
;
/* unused MRs */
struct
xlist_head
clean_list
;
/* global unused & unamapped MRs */
wait_queue_head_t
flush_wait
;
atomic_t
free_pinned
;
/* memory pinned by free MRs */
unsigned
long
max_items
;
unsigned
long
max_items_soft
;
...
...
@@ -74,7 +85,7 @@ struct rds_ib_mr_pool {
	struct ib_fmr_attr	fmr_attr;
};

static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all);
static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all, struct rds_ib_mr **);
static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr);
static void rds_ib_mr_pool_flush_worker(struct work_struct *work);
...
...
@@ -83,16 +94,17 @@ static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr)
	struct rds_ib_device *rds_ibdev;
	struct rds_ib_ipaddr *i_ipaddr;

	list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
		spin_lock_irq(&rds_ibdev->spinlock);
		list_for_each_entry(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
	rcu_read_lock();
	list_for_each_entry_rcu(rds_ibdev, &rds_ib_devices, list) {
		list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
			if (i_ipaddr->ipaddr == ipaddr) {
				spin_unlock_irq(&rds_ibdev->spinlock);
				atomic_inc(&rds_ibdev->refcount);
				rcu_read_unlock();
				return rds_ibdev;
			}
		}
		spin_unlock_irq(&rds_ibdev->spinlock);
	}
	rcu_read_unlock();

	return NULL;
}
...
...
@@ -108,7 +120,7 @@ static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
	i_ipaddr->ipaddr = ipaddr;

	spin_lock_irq(&rds_ibdev->spinlock);
	list_add_tail(&i_ipaddr->list, &rds_ibdev->ipaddr_list);
	list_add_tail_rcu(&i_ipaddr->list, &rds_ibdev->ipaddr_list);
	spin_unlock_irq(&rds_ibdev->spinlock);

	return 0;
...
...
@@ -116,17 +128,24 @@ static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
{
	struct rds_ib_ipaddr *i_ipaddr, *next;
	struct rds_ib_ipaddr *i_ipaddr;
	struct rds_ib_ipaddr *to_free = NULL;


	spin_lock_irq(&rds_ibdev->spinlock);
	list_for_each_entry_safe(i_ipaddr, next, &rds_ibdev->ipaddr_list, list) {
	list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
		if (i_ipaddr->ipaddr == ipaddr) {
			list_del(&i_ipaddr->list);
			kfree(i_ipaddr);
			list_del_rcu(&i_ipaddr->list);
			to_free = i_ipaddr;
			break;
		}
	}
	spin_unlock_irq(&rds_ibdev->spinlock);

	if (to_free) {
		synchronize_rcu();
		kfree(to_free);
	}
}

int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
...
...
@@ -134,8 +153,10 @@ int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
	struct rds_ib_device *rds_ibdev_old;

	rds_ibdev_old = rds_ib_get_device(ipaddr);
	if (rds_ibdev_old)
	if (rds_ibdev_old) {
		rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr);
		rds_ib_dev_put(rds_ibdev_old);
	}

	return rds_ib_add_ipaddr(rds_ibdev, ipaddr);
}
...
...
@@ -156,6 +177,7 @@ void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *con
	spin_unlock_irq(&ib_nodev_conns_lock);

	ic->rds_ibdev = rds_ibdev;
	atomic_inc(&rds_ibdev->refcount);
}

void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn)
...
...
@@ -175,18 +197,18 @@ void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *
	spin_unlock(&ib_nodev_conns_lock);

	ic->rds_ibdev = NULL;
	rds_ib_dev_put(rds_ibdev);
}

void __rds_ib_destroy_conns(struct list_head *list, spinlock_t *list_lock)
void rds_ib_destroy_nodev_conns(void)
{
	struct rds_ib_connection *ic, *_ic;
	LIST_HEAD(tmp_list);

	/* avoid calling conn_destroy with irqs off */
	spin_lock_irq(list_lock);
	list_splice(list, &tmp_list);
	INIT_LIST_HEAD(list);
	spin_unlock_irq(list_lock);
	spin_lock_irq(&ib_nodev_conns_lock);
	list_splice(&ib_nodev_conns, &tmp_list);
	spin_unlock_irq(&ib_nodev_conns_lock);

	list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node)
		rds_conn_destroy(ic->conn);
...
...
@@ -200,12 +222,12 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev)
	if (!pool)
		return ERR_PTR(-ENOMEM);

	INIT_LIST_HEAD(&pool->free_list);
	INIT_LIST_HEAD(&pool->drop_list);
	INIT_LIST_HEAD(&pool->clean_list);
	INIT_XLIST_HEAD(&pool->free_list);
	INIT_XLIST_HEAD(&pool->drop_list);
	INIT_XLIST_HEAD(&pool->clean_list);
	mutex_init(&pool->flush_lock);
	spin_lock_init(&pool->list_lock);
	INIT_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
	init_waitqueue_head(&pool->flush_wait);
	INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);

	pool->fmr_attr.max_pages = fmr_message_size;
	pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps;
...
@@ -233,34 +255,60 @@ void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_co
void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool)
{
	flush_workqueue(rds_wq);
	rds_ib_flush_mr_pool(pool, 1);
	cancel_delayed_work_sync(&pool->flush_worker);
	rds_ib_flush_mr_pool(pool, 1, NULL);
	WARN_ON(atomic_read(&pool->item_count));
	WARN_ON(atomic_read(&pool->free_pinned));
	kfree(pool);
}

static void refill_local(struct rds_ib_mr_pool *pool, struct xlist_head *xl,
			 struct rds_ib_mr **ibmr_ret)
{
	struct xlist_head *ibmr_xl;
	ibmr_xl = xlist_del_head_fast(xl);
	*ibmr_ret = list_entry(ibmr_xl, struct rds_ib_mr, xlist);
}

static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool)
{
	struct rds_ib_mr *ibmr = NULL;
	unsigned long flags;
	struct xlist_head *ret;
	unsigned long *flag;

	spin_lock_irqsave(&pool->list_lock, flags);
	if (!list_empty(&pool->clean_list)) {
		ibmr = list_entry(pool->clean_list.next, struct rds_ib_mr, list);
		list_del_init(&ibmr->list);
	}
	spin_unlock_irqrestore(&pool->list_lock, flags);
	preempt_disable();
	flag = &__get_cpu_var(clean_list_grace);
	set_bit(CLEAN_LIST_BUSY_BIT, flag);
	ret = xlist_del_head(&pool->clean_list);
	if (ret)
		ibmr = list_entry(ret, struct rds_ib_mr, xlist);

	clear_bit(CLEAN_LIST_BUSY_BIT, flag);
	preempt_enable();
	return ibmr;
}

static inline void wait_clean_list_grace(void)
{
	int cpu;
	unsigned long *flag;

	for_each_online_cpu(cpu) {
		flag = &per_cpu(clean_list_grace, cpu);
		while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
			cpu_relax();
	}
}

static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
{
	struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
	struct rds_ib_mr *ibmr = NULL;
	int err = 0, iter = 0;

	if (atomic_read(&pool->dirty_count) >= pool->max_items / 10)
		queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10);

	while (1) {
		ibmr = rds_ib_reuse_fmr(pool);
		if (ibmr)
...
...
@@ -287,19 +335,24 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
/* We do have some empty MRs. Flush them out. */
		rds_ib_stats_inc(s_ib_rdma_mr_pool_wait);
		rds_ib_flush_mr_pool(pool, 0);
		rds_ib_flush_mr_pool(pool, 0, &ibmr);
		if (ibmr)
			return ibmr;
	}

	ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL);
	ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL, rdsibdev_to_node(rds_ibdev));
	if (!ibmr) {
		err = -ENOMEM;
		goto out_no_cigar;
	}

	memset(ibmr, 0, sizeof(*ibmr));

	ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd,
			(IB_ACCESS_LOCAL_WRITE |
			 IB_ACCESS_REMOTE_READ |
			 IB_ACCESS_REMOTE_WRITE),
			 IB_ACCESS_REMOTE_WRITE|
			 IB_ACCESS_REMOTE_ATOMIC),
			&pool->fmr_attr);
	if (IS_ERR(ibmr->fmr)) {
		err = PTR_ERR(ibmr->fmr);
...
...
@@ -367,7 +420,8 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibm
	if (page_cnt > fmr_message_size)
		return -EINVAL;

	dma_pages = kmalloc(sizeof(u64) * page_cnt, GFP_ATOMIC);
	dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC,
				 rdsibdev_to_node(rds_ibdev));
	if (!dma_pages)
		return -ENOMEM;
...
...
@@ -441,7 +495,7 @@ static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
/* FIXME we need a way to tell a r/w MR
* from a r/o MR */
		BUG_ON(in_interrupt());
		BUG_ON(irqs_disabled());
		set_page_dirty(page);
		put_page(page);
	}
...
...
@@ -476,34 +530,110 @@ static inline unsigned int rds_ib_flush_goal(struct rds_ib_mr_pool *pool, int fr
	return 0;
}

/*
 * given an xlist of mrs, put them all into the list_head for more processing
 */
static void xlist_append_to_list(struct xlist_head *xlist, struct list_head *list)
{
	struct rds_ib_mr *ibmr;
	struct xlist_head splice;
	struct xlist_head *cur;
	struct xlist_head *next;

	splice.next = NULL;
	xlist_splice(xlist, &splice);
	cur = splice.next;
	while (cur) {
		next = cur->next;
		ibmr = list_entry(cur, struct rds_ib_mr, xlist);
		list_add_tail(&ibmr->unmap_list, list);
		cur = next;
	}
}

/*
 * this takes a list head of mrs and turns it into an xlist of clusters.
 * each cluster has an xlist of MR_CLUSTER_SIZE mrs that are ready for
 * reuse.
 */
static void list_append_to_xlist(struct rds_ib_mr_pool *pool,
				 struct list_head *list,
				 struct xlist_head *xlist,
				 struct xlist_head **tail_ret)
{
	struct rds_ib_mr *ibmr;
	struct xlist_head *cur_mr = xlist;
	struct xlist_head *tail_mr = NULL;

	list_for_each_entry(ibmr, list, unmap_list) {
		tail_mr = &ibmr->xlist;
		tail_mr->next = NULL;
		cur_mr->next = tail_mr;
		cur_mr = tail_mr;
	}
	*tail_ret = tail_mr;
}

/*
 * Flush our pool of MRs.
 * At a minimum, all currently unused MRs are unmapped.
 * If the number of MRs allocated exceeds the limit, we also try
 * to free as many MRs as needed to get back to this limit.
 */
static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all)
static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
				int free_all, struct rds_ib_mr **ibmr_ret)
{
	struct rds_ib_mr *ibmr, *next;
	struct xlist_head clean_xlist;
	struct xlist_head *clean_tail;
	LIST_HEAD(unmap_list);
	LIST_HEAD(fmr_list);
	unsigned long unpinned = 0;
	unsigned long flags;
	unsigned int nfreed = 0, ncleaned = 0, free_goal;
	int ret = 0;

	rds_ib_stats_inc(s_ib_rdma_mr_pool_flush);

	mutex_lock(&pool->flush_lock);
	if (ibmr_ret) {
		DEFINE_WAIT(wait);
		while (!mutex_trylock(&pool->flush_lock)) {
			ibmr = rds_ib_reuse_fmr(pool);
			if (ibmr) {
				*ibmr_ret = ibmr;
				finish_wait(&pool->flush_wait, &wait);
				goto out_nolock;
			}

			prepare_to_wait(&pool->flush_wait, &wait,
					TASK_UNINTERRUPTIBLE);
			if (xlist_empty(&pool->clean_list))
				schedule();

			ibmr = rds_ib_reuse_fmr(pool);
			if (ibmr) {
				*ibmr_ret = ibmr;
				finish_wait(&pool->flush_wait, &wait);
				goto out_nolock;
			}
		}
		finish_wait(&pool->flush_wait, &wait);
	} else
		mutex_lock(&pool->flush_lock);

	if (ibmr_ret) {
		ibmr = rds_ib_reuse_fmr(pool);
		if (ibmr) {
			*ibmr_ret = ibmr;
			goto out;
		}
	}

	spin_lock_irqsave(&pool->list_lock, flags);
	/* Get the list of all MRs to be dropped. Ordering matters -
	 * we want to put drop_list ahead of free_list. */
	list_splice_init(&pool->free_list, &unmap_list);
	list_splice_init(&pool->drop_list, &unmap_list);
	 * we want to put drop_list ahead of free_list.
	 */
	xlist_append_to_list(&pool->drop_list, &unmap_list);
	xlist_append_to_list(&pool->free_list, &unmap_list);
	if (free_all)
		list_splice_init(&pool->clean_list, &unmap_list);
	spin_unlock_irqrestore(&pool->list_lock, flags);
		xlist_append_to_list(&pool->clean_list, &unmap_list);

	free_goal = rds_ib_flush_goal(pool, free_all);
...
...
@@ -511,19 +641,20 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all)
		goto out;

	/* String all ib_mr's onto one list and hand them to ib_unmap_fmr */
	list_for_each_entry(ibmr, &unmap_list, list)
	list_for_each_entry(ibmr, &unmap_list, unmap_list)
		list_add(&ibmr->fmr->list, &fmr_list);

	ret = ib_unmap_fmr(&fmr_list);
	if (ret)
		printk(KERN_WARNING "RDS/IB: ib_unmap_fmr failed (err=%d)\n", ret);

	/* Now we can destroy the DMA mapping and unpin any pages */
	list_for_each_entry_safe(ibmr, next, &unmap_list, list) {
	list_for_each_entry_safe(ibmr, next, &unmap_list, unmap_list) {
		unpinned += ibmr->sg_len;
		__rds_ib_teardown_mr(ibmr);
		if (nfreed < free_goal || ibmr->remap_count >= pool->fmr_attr.max_maps) {
			rds_ib_stats_inc(s_ib_rdma_mr_free);
			list_del(&ibmr->list);
			list_del(&ibmr->unmap_list);
			ib_dealloc_fmr(ibmr->fmr);
			kfree(ibmr);
			nfreed++;
...
...
@@ -531,9 +662,27 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all)
		ncleaned++;
	}

	spin_lock_irqsave(&pool->list_lock, flags);
	list_splice(&unmap_list, &pool->clean_list);
	spin_unlock_irqrestore(&pool->list_lock, flags);
	if (!list_empty(&unmap_list)) {
		/* we have to make sure that none of the things we're about
		 * to put on the clean list would race with other cpus trying
		 * to pull items off.  The xlist would explode if we managed to
		 * remove something from the clean list and then add it back again
		 * while another CPU was spinning on that same item in xlist_del_head.
		 *
		 * This is pretty unlikely, but just in case  wait for an xlist grace period
		 * here before adding anything back into the clean list.
		 */
		wait_clean_list_grace();

		list_append_to_xlist(pool, &unmap_list, &clean_xlist, &clean_tail);
		if (ibmr_ret)
			refill_local(pool, &clean_xlist, ibmr_ret);

		/* refill_local may have emptied our list */
		if (!xlist_empty(&clean_xlist))
			xlist_add(clean_xlist.next, clean_tail, &pool->clean_list);

	}

	atomic_sub(unpinned, &pool->free_pinned);
	atomic_sub(ncleaned, &pool->dirty_count);
...
...
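The race note in that hunk is the heart of the change: before recycled MRs go back on the lock-free clean list, the flusher waits out any CPU that might still be spinning inside xlist_del_head(). A minimal user-space distillation of that per-CPU busy-flag grace period is sketched below; the function names and the thread-indexed flag array are illustrative assumptions, only the pattern itself is taken from the hunk.

/*
 * Illustrative user-space sketch of the clean_list_grace idea: one busy
 * flag per CPU/thread, set while a reader is inside the lock-free pop,
 * and a writer that spins until every flag is clear before reusing nodes.
 * Not RDS code.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <sched.h>

#define NR_CPUS 4

static atomic_bool clean_list_busy[NR_CPUS];

/* Reader side: mark this CPU busy for the duration of the lockless pop. */
static void *pop_clean_item(int cpu, void *(*do_pop)(void))
{
	void *item;

	atomic_store(&clean_list_busy[cpu], true);
	item = do_pop();	/* stand-in for xlist_del_head() */
	atomic_store(&clean_list_busy[cpu], false);
	return item;
}

/* Writer side: wait until no CPU is inside the pop before recycling nodes. */
static void wait_clean_list_grace(void)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		while (atomic_load(&clean_list_busy[cpu]))
			sched_yield();	/* cpu_relax() in the kernel version */
}

The kernel variant gets the same effect with preempt_disable(), a per-CPU bit word and cpu_relax() instead of thread indices and sched_yield().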
@@ -541,14 +690,35 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all)
out:
	mutex_unlock(&pool->flush_lock);
	if (waitqueue_active(&pool->flush_wait))
		wake_up(&pool->flush_wait);
out_nolock:
	return ret;
}

int rds_ib_fmr_init(void)
{
	rds_ib_fmr_wq = create_workqueue("rds_fmr_flushd");
	if (!rds_ib_fmr_wq)
		return -ENOMEM;
	return 0;
}

/*
 * By the time this is called all the IB devices should have been torn down and
 * had their pools freed. As each pool is freed its work struct is waited on,
 * so the pool flushing work queue should be idle by the time we get here.
 */
void rds_ib_fmr_exit(void)
{
	destroy_workqueue(rds_ib_fmr_wq);
}

static void rds_ib_mr_pool_flush_worker(struct work_struct *work)
{
	struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker);
	struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker.work);

	rds_ib_flush_mr_pool(pool, 0);
	rds_ib_flush_mr_pool(pool, 0, NULL);
}

void rds_ib_free_mr(void *trans_private, int invalidate)
...
...
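rds_ib_fmr_init()/rds_ib_fmr_exit() above give the FMR flusher its own workqueue instead of piggybacking on rds_wq. A stand-alone sketch of that create/queue/cancel/destroy lifecycle follows; demo_wq, demo_work and demo_fn are made-up names used only for illustration, not RDS symbols.

#include <linux/workqueue.h>

static struct workqueue_struct *demo_wq;
static struct delayed_work demo_work;

static void demo_fn(struct work_struct *work)
{
	/* the flush work would run here */
}

static int demo_init(void)
{
	demo_wq = create_workqueue("demo_flushd");
	if (!demo_wq)
		return -ENOMEM;
	INIT_DELAYED_WORK(&demo_work, demo_fn);
	/* callers kick the worker with a small delay so requests batch up */
	queue_delayed_work(demo_wq, &demo_work, 10);
	return 0;
}

static void demo_exit(void)
{
	/* make sure nothing is queued or running before the queue dies */
	cancel_delayed_work_sync(&demo_work);
	destroy_workqueue(demo_wq);
}

The cancel_delayed_work_sync() in the pool teardown path (see rds_ib_destroy_mr_pool earlier in this file) is what makes the "queue should be idle by the time we get here" assumption in the comment hold.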
@@ -556,47 +726,49 @@ void rds_ib_free_mr(void *trans_private, int invalidate)
	struct rds_ib_mr *ibmr = trans_private;
	struct rds_ib_device *rds_ibdev = ibmr->device;
	struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
	unsigned long flags;

	rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len);

	/* Return it to the pool's free list */
	spin_lock_irqsave(&pool->list_lock, flags);
	if (ibmr->remap_count >= pool->fmr_attr.max_maps)
		list_add(&ibmr->list, &pool->drop_list);
		xlist_add(&ibmr->xlist, &ibmr->xlist, &pool->drop_list);
	else
		list_add(&ibmr->list, &pool->free_list);
		xlist_add(&ibmr->xlist, &ibmr->xlist, &pool->free_list);

	atomic_add(ibmr->sg_len, &pool->free_pinned);
	atomic_inc(&pool->dirty_count);
	spin_unlock_irqrestore(&pool->list_lock, flags);

	/* If we've pinned too many pages, request a flush */
	if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned ||
	    atomic_read(&pool->dirty_count) >= pool->max_items / 10)
		queue_work(rds_wq, &pool->flush_worker);
		queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10);

	if (invalidate) {
		if (likely(!in_interrupt())) {
			rds_ib_flush_mr_pool(pool, 0);
			rds_ib_flush_mr_pool(pool, 0, NULL);
		} else {
			/* We get here if the user created a MR marked
			 * as use_once and invalidate at the same time. */
			queue_work(rds_wq, &pool->flush_worker);
			queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10);
		}
	}

	rds_ib_dev_put(rds_ibdev);
}

void rds_ib_flush_mrs(void)
{
	struct rds_ib_device *rds_ibdev;

	down_read(&rds_ib_devices_lock);
	list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
		struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;

		if (pool)
			rds_ib_flush_mr_pool(pool, 0);
			rds_ib_flush_mr_pool(pool, 0, NULL);
	}
	up_read(&rds_ib_devices_lock);
}

void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
...
...
@@ -628,6 +800,7 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
		printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret);

	ibmr->device = rds_ibdev;
	rds_ibdev = NULL;

out:
	if (ret) {
...
...
@@ -635,5 +808,8 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
		rds_ib_free_mr(ibmr, 0);
		ibmr = ERR_PTR(ret);
	}
	if (rds_ibdev)
		rds_ib_dev_put(rds_ibdev);
	return ibmr;
}
net/rds/ib_recv.c
View file @
cf0ac2b8
...
...
@@ -43,42 +43,6 @@ static struct kmem_cache *rds_ib_incoming_slab;
static struct kmem_cache *rds_ib_frag_slab;
static atomic_t	rds_ib_allocation = ATOMIC_INIT(0);

static void rds_ib_frag_drop_page(struct rds_page_frag *frag)
{
	rdsdebug("frag %p page %p\n", frag, frag->f_page);
	__free_page(frag->f_page);
	frag->f_page = NULL;
}

static void rds_ib_frag_free(struct rds_page_frag *frag)
{
	rdsdebug("frag %p page %p\n", frag, frag->f_page);
	BUG_ON(frag->f_page != NULL);
	kmem_cache_free(rds_ib_frag_slab, frag);
}

/*
 * We map a page at a time.  Its fragments are posted in order.  This
 * is called in fragment order as the fragments get send completion events.
 * Only the last frag in the page performs the unmapping.
 *
 * It's OK for ring cleanup to call this in whatever order it likes because
 * DMA is not in flight and so we can unmap while other ring entries still
 * hold page references in their frags.
 */
static void rds_ib_recv_unmap_page(struct rds_ib_connection *ic,
				   struct rds_ib_recv_work *recv)
{
	struct rds_page_frag *frag = recv->r_frag;

	rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page);
	if (frag->f_mapped)
		ib_dma_unmap_page(ic->i_cm_id->device,
				  frag->f_mapped,
				  RDS_FRAG_SIZE, DMA_FROM_DEVICE);
	frag->f_mapped = 0;
}

void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
{
	struct rds_ib_recv_work *recv;
...
...
@@ -95,16 +59,161 @@ void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
		recv->r_wr.sg_list = recv->r_sge;
		recv->r_wr.num_sge = RDS_IB_RECV_SGE;

		sge = rds_ib_data_sge(ic, recv->r_sge);
		sge = &recv->r_sge[0];
		sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
		sge->length = sizeof(struct rds_header);
		sge->lkey = ic->i_mr->lkey;

		sge = &recv->r_sge[1];
		sge->addr = 0;
		sge->length = RDS_FRAG_SIZE;
		sge->lkey = ic->i_mr->lkey;
	}
}

		sge = rds_ib_header_sge(ic, recv->r_sge);
		sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
		sge->length = sizeof(struct rds_header);
		sge->lkey = ic->i_mr->lkey;

/*
 * The entire 'from' list, including the from element itself, is put on
 * to the tail of the 'to' list.
 */
static void list_splice_entire_tail(struct list_head *from,
				    struct list_head *to)
{
	struct list_head *from_last = from->prev;

	list_splice_tail(from_last, to);
	list_add_tail(from_last, to);
}

static void rds_ib_cache_xfer_to_ready(struct rds_ib_refill_cache *cache)
{
	struct list_head *tmp;

	tmp = xchg(&cache->xfer, NULL);
	if (tmp) {
		if (cache->ready)
			list_splice_entire_tail(tmp, cache->ready);
		else
			cache->ready = tmp;
	}
}

static int rds_ib_recv_alloc_cache(struct rds_ib_refill_cache *cache)
{
	struct rds_ib_cache_head *head;
	int cpu;

	cache->percpu = alloc_percpu(struct rds_ib_cache_head);
	if (!cache->percpu)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		head = per_cpu_ptr(cache->percpu, cpu);
		head->first = NULL;
		head->count = 0;
	}
	cache->xfer = NULL;
	cache->ready = NULL;

	return 0;
}

int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic)
{
	int ret;

	ret = rds_ib_recv_alloc_cache(&ic->i_cache_incs);
	if (!ret) {
		ret = rds_ib_recv_alloc_cache(&ic->i_cache_frags);
		if (ret)
			free_percpu(ic->i_cache_incs.percpu);
	}

	return ret;
}

static void rds_ib_cache_splice_all_lists(struct rds_ib_refill_cache *cache,
					  struct list_head *caller_list)
{
	struct rds_ib_cache_head *head;
	int cpu;

	for_each_possible_cpu(cpu) {
		head = per_cpu_ptr(cache->percpu, cpu);
		if (head->first) {
			list_splice_entire_tail(head->first, caller_list);
			head->first = NULL;
		}
	}

	if (cache->ready) {
		list_splice_entire_tail(cache->ready, caller_list);
		cache->ready = NULL;
	}
}

void rds_ib_recv_free_caches(struct rds_ib_connection *ic)
{
	struct rds_ib_incoming *inc;
	struct rds_ib_incoming *inc_tmp;
	struct rds_page_frag *frag;
	struct rds_page_frag *frag_tmp;
	LIST_HEAD(list);

	rds_ib_cache_xfer_to_ready(&ic->i_cache_incs);
	rds_ib_cache_splice_all_lists(&ic->i_cache_incs, &list);
	free_percpu(ic->i_cache_incs.percpu);

	list_for_each_entry_safe(inc, inc_tmp, &list, ii_cache_entry) {
		list_del(&inc->ii_cache_entry);
		WARN_ON(!list_empty(&inc->ii_frags));
		kmem_cache_free(rds_ib_incoming_slab, inc);
	}

	rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);
	rds_ib_cache_splice_all_lists(&ic->i_cache_frags, &list);
	free_percpu(ic->i_cache_frags.percpu);

	list_for_each_entry_safe(frag, frag_tmp, &list, f_cache_entry) {
		list_del(&frag->f_cache_entry);
		WARN_ON(!list_empty(&frag->f_item));
		kmem_cache_free(rds_ib_frag_slab, frag);
	}
}

/* fwd decl */
static void rds_ib_recv_cache_put(struct list_head *new_item,
				  struct rds_ib_refill_cache *cache);
static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache);

/* Recycle frag and attached recv buffer f_sg */
static void rds_ib_frag_free(struct rds_ib_connection *ic,
			     struct rds_page_frag *frag)
{
	rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg));

	rds_ib_recv_cache_put(&frag->f_cache_entry, &ic->i_cache_frags);
}

/* Recycle inc after freeing attached frags */
void rds_ib_inc_free(struct rds_incoming *inc)
{
	struct rds_ib_incoming *ibinc;
	struct rds_page_frag *frag;
	struct rds_page_frag *pos;
	struct rds_ib_connection *ic = inc->i_conn->c_transport_data;

	ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);

	/* Free attached frags */
	list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) {
		list_del_init(&frag->f_item);
		rds_ib_frag_free(ic, frag);
	}
	BUG_ON(!list_empty(&ibinc->ii_frags));

	rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc);
	rds_ib_recv_cache_put(&ibinc->ii_cache_entry, &ic->i_cache_incs);
}

static void rds_ib_recv_clear_one(struct rds_ib_connection *ic,
...
...
@@ -115,10 +224,8 @@ static void rds_ib_recv_clear_one(struct rds_ib_connection *ic,
		recv->r_ibinc = NULL;
	}
	if (recv->r_frag) {
		rds_ib_recv_unmap_page(ic, recv);
		if (recv->r_frag->f_page)
			rds_ib_frag_drop_page(recv->r_frag);
		rds_ib_frag_free(recv->r_frag);
		ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE);
		rds_ib_frag_free(ic, recv->r_frag);
		recv->r_frag = NULL;
	}
}
...
...
@@ -129,84 +236,111 @@ void rds_ib_recv_clear_ring(struct rds_ib_connection *ic)
	for (i = 0; i < ic->i_recv_ring.w_nr; i++)
		rds_ib_recv_clear_one(ic, &ic->i_recvs[i]);

	if (ic->i_frag.f_page)
		rds_ib_frag_drop_page(&ic->i_frag);
}

static int rds_ib_recv_refill_one(struct rds_connection *conn,
				  struct rds_ib_recv_work *recv,
				  gfp_t kptr_gfp, gfp_t page_gfp)
static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *ic,
						     gfp_t slab_mask)
{
	struct rds_ib_connection *ic = conn->c_transport_data;
	dma_addr_t dma_addr;
	struct ib_sge *sge;
	int ret = -ENOMEM;
	struct rds_ib_incoming *ibinc;
	struct list_head *cache_item;
	int avail_allocs;

	if (recv->r_ibinc == NULL) {
		if (!atomic_add_unless(&rds_ib_allocation, 1, rds_ib_sysctl_max_recv_allocation)) {
	cache_item = rds_ib_recv_cache_get(&ic->i_cache_incs);
	if (cache_item) {
		ibinc = container_of(cache_item, struct rds_ib_incoming, ii_cache_entry);
	} else {
		avail_allocs = atomic_add_unless(&rds_ib_allocation,
						 1, rds_ib_sysctl_max_recv_allocation);
		if (!avail_allocs) {
			rds_ib_stats_inc(s_ib_rx_alloc_limit);
			goto out;
			return NULL;
		}
		recv->r_ibinc = kmem_cache_alloc(rds_ib_incoming_slab, kptr_gfp);
		if (recv->r_ibinc == NULL) {
		ibinc = kmem_cache_alloc(rds_ib_incoming_slab, slab_mask);
		if (!ibinc) {
			atomic_dec(&rds_ib_allocation);
			goto out;
			return NULL;
		}
		INIT_LIST_HEAD(&recv->r_ibinc->ii_frags);
		rds_inc_init(&recv->r_ibinc->ii_inc, conn, conn->c_faddr);
	}
	INIT_LIST_HEAD(&ibinc->ii_frags);
	rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr);

	if (recv->r_frag == NULL) {
		recv->r_frag = kmem_cache_alloc(rds_ib_frag_slab, kptr_gfp);
		if (recv->r_frag == NULL)
			goto out;
		INIT_LIST_HEAD(&recv->r_frag->f_item);
		recv->r_frag->f_page = NULL;
	return ibinc;
}

static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic,
						    gfp_t slab_mask, gfp_t page_mask)
{
	struct rds_page_frag *frag;
	struct list_head *cache_item;
	int ret;

	cache_item = rds_ib_recv_cache_get(&ic->i_cache_frags);
	if (cache_item) {
		frag = container_of(cache_item, struct rds_page_frag, f_cache_entry);
	} else {
		frag = kmem_cache_alloc(rds_ib_frag_slab, slab_mask);
		if (!frag)
			return NULL;

		sg_init_table(&frag->f_sg, 1);
		ret = rds_page_remainder_alloc(&frag->f_sg,
					       RDS_FRAG_SIZE, page_mask);
		if (ret) {
			kmem_cache_free(rds_ib_frag_slab, frag);
			return NULL;
		}
	}

	if (ic->i_frag.f_page == NULL) {
		ic->i_frag.f_page = alloc_page(page_gfp);
		if (ic->i_frag.f_page == NULL)
			goto out;
		ic->i_frag.f_offset = 0;
	INIT_LIST_HEAD(&frag->f_item);

	return frag;
}

static int rds_ib_recv_refill_one(struct rds_connection *conn,
				  struct rds_ib_recv_work *recv, int prefill)
{
	struct rds_ib_connection *ic = conn->c_transport_data;
	struct ib_sge *sge;
	int ret = -ENOMEM;
	gfp_t slab_mask = GFP_NOWAIT;
	gfp_t page_mask = GFP_NOWAIT;

	if (prefill) {
		slab_mask = GFP_KERNEL;
		page_mask = GFP_HIGHUSER;
	}

	dma_addr = ib_dma_map_page(ic->i_cm_id->device,
				   ic->i_frag.f_page,
				   ic->i_frag.f_offset,
				   RDS_FRAG_SIZE,
				   DMA_FROM_DEVICE);
	if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr))
		goto out;

	if (!ic->i_cache_incs.ready)
		rds_ib_cache_xfer_to_ready(&ic->i_cache_incs);
	if (!ic->i_cache_frags.ready)
		rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);

	/*
	 * Once we get the RDS_PAGE_LAST_OFF frag then rds_ib_frag_unmap()
	 * must be called on this recv. This happens as completions hit
	 * in order or on connection shutdown.
	 * ibinc was taken from recv if recv contained the start of a message.
	 * recvs that were continuations will still have this allocated.
	 */
	recv->r_frag->f_page = ic->i_frag.f_page;
	recv->r_frag->f_offset = ic->i_frag.f_offset;
	recv->r_frag->f_mapped = dma_addr;
	if (!recv->r_ibinc) {
		recv->r_ibinc = rds_ib_refill_one_inc(ic, slab_mask);
		if (!recv->r_ibinc)
			goto out;
	}

	sge = rds_ib_data_sge(ic, recv->r_sge);
	sge->addr = dma_addr;
	sge->length = RDS_FRAG_SIZE;
	WARN_ON(recv->r_frag); /* leak! */
	recv->r_frag = rds_ib_refill_one_frag(ic, slab_mask, page_mask);
	if (!recv->r_frag)
		goto out;

	ret = ib_dma_map_sg(ic->i_cm_id->device, &recv->r_frag->f_sg,
			    1, DMA_FROM_DEVICE);
	WARN_ON(ret != 1);

	sge = rds_ib_header_sge(ic, recv->r_sge);
	sge = &recv->r_sge[0];
	sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header);
	sge->length = sizeof(struct rds_header);

	get_page(recv->r_frag->f_page);

	if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) {
		ic->i_frag.f_offset += RDS_FRAG_SIZE;
	} else {
		put_page(ic->i_frag.f_page);
		ic->i_frag.f_page = NULL;
		ic->i_frag.f_offset = 0;
	}

	sge = &recv->r_sge[1];
	sge->addr = sg_dma_address(&recv->r_frag->f_sg);
	sge->length = sg_dma_len(&recv->r_frag->f_sg);

	ret = 0;
out:
...
...
@@ -216,13 +350,11 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn,
/*
* This tries to allocate and post unused work requests after making sure that
* they have all the allocations they need to queue received fragments into
* sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc
* pairs don't go unmatched.
* sockets.
*
* -1 is returned if posting fails due to temporary resource exhaustion.
*/
int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
		       gfp_t page_gfp, int prefill)
void rds_ib_recv_refill(struct rds_connection *conn, int prefill)
{
	struct rds_ib_connection *ic = conn->c_transport_data;
	struct rds_ib_recv_work *recv;
...
...
@@ -236,28 +368,25 @@ int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
		if (pos >= ic->i_recv_ring.w_nr) {
			printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n",
			       pos);
			ret = -EINVAL;
			break;
		}

		recv = &ic->i_recvs[pos];
		ret = rds_ib_recv_refill_one(conn, recv, kptr_gfp, page_gfp);
		ret = rds_ib_recv_refill_one(conn, recv, prefill);
		if (ret) {
			ret = -1;
			break;
		}

		/* XXX when can this fail? */
		ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr);
		rdsdebug("recv %p ibinc %p page %p addr %lu ret %d\n", recv,
			 recv->r_ibinc, recv->r_frag->f_page,
			 (long) recv->r_frag->f_mapped, ret);
			 recv->r_ibinc, sg_page(&recv->r_frag->f_sg),
			 (long) sg_dma_address(&recv->r_frag->f_sg), ret);
		if (ret) {
			rds_ib_conn_error(conn, "recv post on "
			       "%pI4 returned %d, disconnecting and "
			       "reconnecting\n", &conn->c_faddr,
			       ret);
			ret = -1;
			break;
		}
...
...
@@ -270,37 +399,73 @@ int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
	if (ret)
		rds_ib_ring_unalloc(&ic->i_recv_ring, 1);
	return ret;
}

void rds_ib_inc_purge(struct rds_incoming *inc)
/*
 * We want to recycle several types of recv allocations, like incs and frags.
 * To use this, the *_free() function passes in the ptr to a list_head within
 * the recyclee, as well as the cache to put it on.
 *
 * First, we put the memory on a percpu list. When this reaches a certain size,
 * We move it to an intermediate non-percpu list in a lockless manner, with some
 * xchg/compxchg wizardry.
 *
 * N.B. Instead of a list_head as the anchor, we use a single pointer, which can
 * be NULL and xchg'd. The list is actually empty when the pointer is NULL, and
 * list_empty() will return true with one element is actually present.
 */
static void rds_ib_recv_cache_put(struct list_head *new_item,
				  struct rds_ib_refill_cache *cache)
{
	struct rds_ib_incoming *ibinc;
	struct rds_page_frag *frag;
	struct rds_page_frag *pos;
	unsigned long flags;
	struct rds_ib_cache_head *chp;
	struct list_head *old;

	ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);

	rdsdebug("purging ibinc %p inc %p\n", ibinc, inc);
	local_irq_save(flags);

	list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) {
		list_del_init(&frag->f_item);
		rds_ib_frag_drop_page(frag);
		rds_ib_frag_free(frag);
	}
	chp = per_cpu_ptr(cache->percpu, smp_processor_id());
	if (!chp->first)
		INIT_LIST_HEAD(new_item);
	else /* put on front */
		list_add_tail(new_item, chp->first);
	chp->first = new_item;
	chp->count++;

	if (chp->count < RDS_IB_RECYCLE_BATCH_COUNT)
		goto end;

	/*
	 * Return our per-cpu first list to the cache's xfer by atomically
	 * grabbing the current xfer list, appending it to our per-cpu list,
	 * and then atomically returning that entire list back to the
	 * cache's xfer list as long as it's still empty.
	 */
	do {
		old = xchg(&cache->xfer, NULL);
		if (old)
			list_splice_entire_tail(old, chp->first);
		old = cmpxchg(&cache->xfer, NULL, chp->first);
	} while (old);

	chp->first = NULL;
	chp->count = 0;
end:
	local_irq_restore(flags);
}

void rds_ib_inc_free(struct rds_incoming *inc)
static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache)
{
	struct rds_ib_incoming *ibinc;

	ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
	struct list_head *head = cache->ready;

	if (head) {
		if (!list_empty(head)) {
			cache->ready = head->next;
			list_del_init(head);
		} else
			cache->ready = NULL;
	}

	rds_ib_inc_purge(inc);
	rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc);
	BUG_ON(!list_empty(&ibinc->ii_frags));
	kmem_cache_free(rds_ib_incoming_slab, ibinc);
	atomic_dec(&rds_ib_allocation);
	BUG_ON(atomic_read(&rds_ib_allocation) < 0);
	return head;
}
int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
...
...
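The cache_put/cache_get pair above hinges on the xchg/cmpxchg hand-off described in its comment: drain whatever is already published, merge it into the local per-CPU batch, then try to publish the merged batch while the shared slot is still empty. A small user-space C sketch of just that loop follows; struct node, splice_tail and publish_batch are illustrative stand-ins, not RDS symbols.

/* Minimal user-space sketch of the xchg/cmpxchg publish loop, assuming a
 * singly linked node type; this mirrors the pattern, not the RDS code. */
#include <stdatomic.h>
#include <stddef.h>

struct node {
	struct node *next;
};

/* Append list 'extra' (NULL-terminated) onto the end of 'batch'. */
static void splice_tail(struct node *batch, struct node *extra)
{
	while (batch->next)
		batch = batch->next;
	batch->next = extra;
}

/* Publish a locally built batch to the shared 'xfer' slot. */
static void publish_batch(_Atomic(struct node *) *xfer, struct node *batch)
{
	struct node *old;
	struct node *expected;

	do {
		/* Grab whatever is currently published (may be NULL) and
		 * fold it into our local batch. */
		old = atomic_exchange(xfer, NULL);
		if (old)
			splice_tail(batch, old);

		/* Try to install the merged batch; this only succeeds while
		 * the slot is still empty, otherwise loop and merge again. */
		expected = NULL;
	} while (!atomic_compare_exchange_strong(xfer, &expected, batch));
}

Retrying the compare-and-swap after re-merging is what keeps the shared slot a single pointer yet lossless when two producers collide.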
@@ -336,13 +501,13 @@ int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
		to_copy = min_t(unsigned long, to_copy, len - copied);

		rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag "
			 "[%p, %lu] + %lu\n",
			 "[%p, %u] + %lu\n",
			 to_copy, iov->iov_base, iov->iov_len, iov_off,
			 frag->f_page, frag->f_offset, frag_off);
			 sg_page(&frag->f_sg), frag->f_sg.offset, frag_off);

		/* XXX needs + offset for multiple recvs per page */
		ret = rds_page_copy_to_user(frag->f_page,
					    frag->f_offset + frag_off,
		ret = rds_page_copy_to_user(sg_page(&frag->f_sg),
					    frag->f_sg.offset + frag_off,
					    iov->iov_base + iov_off,
					    to_copy);
		if (ret) {
...
...
@@ -557,47 +722,6 @@ u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic)
	return rds_ib_get_ack(ic);
}

static struct rds_header *rds_ib_get_header(struct rds_connection *conn,
					    struct rds_ib_recv_work *recv,
					    u32 data_len)
{
	struct rds_ib_connection *ic = conn->c_transport_data;
	void *hdr_buff = &ic->i_recv_hdrs[recv - ic->i_recvs];
	void *addr;
	u32 misplaced_hdr_bytes;

	/*
	 * Support header at the front (RDS 3.1+) as well as header-at-end.
	 *
	 * Cases:
	 * 1) header all in header buff (great!)
	 * 2) header all in data page (copy all to header buff)
	 * 3) header split across hdr buf + data page
	 *    (move bit in hdr buff to end before copying other bit from data page)
	 */
	if (conn->c_version > RDS_PROTOCOL_3_0 || data_len == RDS_FRAG_SIZE)
		return hdr_buff;

	if (data_len <= (RDS_FRAG_SIZE - sizeof(struct rds_header))) {
		addr = kmap_atomic(recv->r_frag->f_page, KM_SOFTIRQ0);
		memcpy(hdr_buff,
		       addr + recv->r_frag->f_offset + data_len,
		       sizeof(struct rds_header));
		kunmap_atomic(addr, KM_SOFTIRQ0);
		return hdr_buff;
	}

	misplaced_hdr_bytes = (sizeof(struct rds_header) - (RDS_FRAG_SIZE - data_len));

	memmove(hdr_buff + misplaced_hdr_bytes, hdr_buff, misplaced_hdr_bytes);

	addr = kmap_atomic(recv->r_frag->f_page, KM_SOFTIRQ0);
	memcpy(hdr_buff, addr + recv->r_frag->f_offset + data_len,
	       sizeof(struct rds_header) - misplaced_hdr_bytes);
	kunmap_atomic(addr, KM_SOFTIRQ0);
	return hdr_buff;
}
/*
* It's kind of lame that we're copying from the posted receive pages into
* long-lived bitmaps. We could have posted the bitmaps and rdma written into
...
...
@@ -639,7 +763,7 @@ static void rds_ib_cong_recv(struct rds_connection *conn,
		to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
		BUG_ON(to_copy & 7); /* Must be 64bit aligned. */

		addr = kmap_atomic(frag->f_page, KM_SOFTIRQ0);
		addr = kmap_atomic(sg_page(&frag->f_sg), KM_SOFTIRQ0);

		src = addr + frag_off;
		dst = (void *)map->m_page_addrs[map_page] + map_off;
...
...
@@ -710,7 +834,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
	}
	data_len -= sizeof(struct rds_header);

	ihdr = rds_ib_get_header(conn, recv, data_len);
	ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];

	/* Validate the checksum. */
	if (!rds_message_verify_checksum(ihdr)) {
...
...
@@ -742,12 +866,12 @@ static void rds_ib_process_recv(struct rds_connection *conn,
* the inc is freed. We don't go that route, so we have to drop the
* page ref ourselves. We can't just leave the page on the recv
* because that confuses the dma mapping of pages and each recv's use
* of a partial page. We can leave the frag, though, it will be
* reused.
* of a partial page.
*
* FIXME: Fold this into the code path below.
*/
		rds_ib_frag_drop_page(recv->r_frag);
		rds_ib_frag_free(ic, recv->r_frag);
		recv->r_frag = NULL;
		return;
	}
...
...
@@ -757,7 +881,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
* into the inc and save the inc so we can hang upcoming fragments
* off its list.
*/
	if (ibinc == NULL) {
	if (!ibinc) {
		ibinc = recv->r_ibinc;
		recv->r_ibinc = NULL;
		ic->i_ibinc = ibinc;
...
...
@@ -842,32 +966,38 @@ static inline void rds_poll_cq(struct rds_ib_connection *ic,
	struct rds_ib_recv_work *recv;

	while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) {
		rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
			 (unsigned long long) wc.wr_id, wc.status, wc.byte_len,
		rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
			 (unsigned long long) wc.wr_id, wc.status,
			 rds_ib_wc_status_str(wc.status), wc.byte_len,
			 be32_to_cpu(wc.ex.imm_data));
		rds_ib_stats_inc(s_ib_rx_cq_event);

		recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)];

		rds_ib_recv_unmap_page(ic, recv);
		ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE);

		/*
		 * Also process recvs in connecting state because it is possible
		 * to get a recv completion _before_ the rdmacm ESTABLISHED
		 * event is processed.
		 */
		if (rds_conn_up(conn) || rds_conn_connecting(conn)) {
			if (wc.status == IB_WC_SUCCESS) {
				rds_ib_process_recv(conn, recv, wc.byte_len, state);
			} else {
		/* We expect errors as the qp is drained during shutdown */
		if (wc.status == IB_WC_SUCCESS) {
			rds_ib_process_recv(conn, recv, wc.byte_len, state);
		} else {
				rds_ib_conn_error(conn, "recv completion on "
				       "%pI4 had status %u, disconnecting and "
				       "reconnecting\n", &conn->c_faddr,
				       wc.status);
			}
			if (rds_conn_up(conn) || rds_conn_connecting(conn))
				rds_ib_conn_error(conn, "recv completion on %pI4 had "
						  "status %u (%s), disconnecting and "
						  "reconnecting\n", &conn->c_faddr,
						  wc.status,
						  rds_ib_wc_status_str(wc.status));
		}

		/*
		 * It's very important that we only free this ring entry if we've truly
		 * freed the resources allocated to the entry. The refilling path can
		 * leak if we don't.
		 */
		rds_ib_ring_free(&ic->i_recv_ring, 1);
	}
}
...
...
@@ -897,11 +1027,8 @@ void rds_ib_recv_tasklet_fn(unsigned long data)
	if (rds_ib_ring_empty(&ic->i_recv_ring))
		rds_ib_stats_inc(s_ib_rx_ring_empty);

	/*
	 * If the ring is running low, then schedule the thread to refill.
	 */
	if (rds_ib_ring_low(&ic->i_recv_ring))
		queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
		rds_ib_recv_refill(conn, 0);
}

int rds_ib_recv(struct rds_connection *conn)
...
...
@@ -910,25 +1037,13 @@ int rds_ib_recv(struct rds_connection *conn)
	int ret = 0;

	rdsdebug("conn %p\n", conn);

	/*
	 * If we get a temporary posting failure in this context then
	 * we're really low and we want the caller to back off for a bit.
	 */
	mutex_lock(&ic->i_recv_mutex);
	if (rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0))
		ret = -ENOMEM;
	else
		rds_ib_stats_inc(s_ib_rx_refill_from_thread);
	mutex_unlock(&ic->i_recv_mutex);

	if (rds_conn_up(conn))
		rds_ib_attempt_ack(ic);

	return ret;
}

int __init rds_ib_recv_init(void)
int rds_ib_recv_init(void)
{
	struct sysinfo si;
	int ret = -ENOMEM;
...
...
@@ -939,14 +1054,14 @@ int __init rds_ib_recv_init(void)
	rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming",
					sizeof(struct rds_ib_incoming),
					0, 0, NULL);
	if (rds_ib_incoming_slab == NULL)
					0, SLAB_HWCACHE_ALIGN, NULL);
	if (!rds_ib_incoming_slab)
		goto out;

	rds_ib_frag_slab = kmem_cache_create("rds_ib_frag",
					sizeof(struct rds_page_frag),
					0, 0, NULL);
	if (rds_ib_frag_slab == NULL)
					0, SLAB_HWCACHE_ALIGN, NULL);
	if (!rds_ib_frag_slab)
		kmem_cache_destroy(rds_ib_incoming_slab);
	else
		ret = 0;
...
...
net/rds/ib_send.c
View file @
cf0ac2b8
...
...
@@ -36,11 +36,49 @@
#include <linux/dmapool.h>
#include "rds.h"
#include "rdma.h"
#include "ib.h"
static void rds_ib_send_rdma_complete(struct rds_message *rm,
				      int wc_status)
static char *rds_ib_wc_status_strings[] = {
#define RDS_IB_WC_STATUS_STR(foo) \
		[IB_WC_##foo] = __stringify(IB_WC_##foo)
	RDS_IB_WC_STATUS_STR(SUCCESS),
	RDS_IB_WC_STATUS_STR(LOC_LEN_ERR),
	RDS_IB_WC_STATUS_STR(LOC_QP_OP_ERR),
	RDS_IB_WC_STATUS_STR(LOC_EEC_OP_ERR),
	RDS_IB_WC_STATUS_STR(LOC_PROT_ERR),
	RDS_IB_WC_STATUS_STR(WR_FLUSH_ERR),
	RDS_IB_WC_STATUS_STR(MW_BIND_ERR),
	RDS_IB_WC_STATUS_STR(BAD_RESP_ERR),
	RDS_IB_WC_STATUS_STR(LOC_ACCESS_ERR),
	RDS_IB_WC_STATUS_STR(REM_INV_REQ_ERR),
	RDS_IB_WC_STATUS_STR(REM_ACCESS_ERR),
	RDS_IB_WC_STATUS_STR(REM_OP_ERR),
	RDS_IB_WC_STATUS_STR(RETRY_EXC_ERR),
	RDS_IB_WC_STATUS_STR(RNR_RETRY_EXC_ERR),
	RDS_IB_WC_STATUS_STR(LOC_RDD_VIOL_ERR),
	RDS_IB_WC_STATUS_STR(REM_INV_RD_REQ_ERR),
	RDS_IB_WC_STATUS_STR(REM_ABORT_ERR),
	RDS_IB_WC_STATUS_STR(INV_EECN_ERR),
	RDS_IB_WC_STATUS_STR(INV_EEC_STATE_ERR),
	RDS_IB_WC_STATUS_STR(FATAL_ERR),
	RDS_IB_WC_STATUS_STR(RESP_TIMEOUT_ERR),
	RDS_IB_WC_STATUS_STR(GENERAL_ERR),
#undef RDS_IB_WC_STATUS_STR
};

char *rds_ib_wc_status_str(enum ib_wc_status status)
{
	return rds_str_array(rds_ib_wc_status_strings,
			     ARRAY_SIZE(rds_ib_wc_status_strings), status);
}

/*
 * Convert IB-specific error message to RDS error message and call core
 * completion handler.
 */
static void rds_ib_send_complete(struct rds_message *rm,
				 int wc_status,
				 void (*complete)(struct rds_message *rm, int status))
{
	int notify_status;
...
...
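rds_ib_wc_status_str() above feeds the designated-initializer table into rds_str_array(), a helper introduced elsewhere in this series whose body is not shown in this diff. Presumably it is a bounds-checked lookup along these lines (sketched as an assumption, not a quote of the RDS implementation):

#include <stddef.h>

/* Assumed shape of a bounds-checked status-to-string lookup. */
char *rds_str_array(char **array, size_t elements, size_t index)
{
	if (index < elements && array[index])
		return array[index];
	else
		return "unknown";
}

With that behaviour, a status value that has no table entry, or that falls outside the array, degrades to "unknown" in the log messages instead of indexing out of bounds.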
@@ -60,69 +98,125 @@ static void rds_ib_send_rdma_complete(struct rds_message *rm,
		notify_status = RDS_RDMA_OTHER_ERROR;
		break;
	}
	rds_rdma_send_complete(rm, notify_status);
	complete(rm, notify_status);
}

static void rds_ib_send_unmap_data(struct rds_ib_connection *ic,
				   struct rm_data_op *op,
				   int wc_status)
{
	if (op->op_nents)
		ib_dma_unmap_sg(ic->i_cm_id->device,
				op->op_sg, op->op_nents,
				DMA_TO_DEVICE);
}

static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic,
				   struct rds_rdma_op *op)
				   struct rm_rdma_op *op,
				   int wc_status)
{
	if (op->r_mapped) {
	if (op->op_mapped) {
		ib_dma_unmap_sg(ic->i_cm_id->device,
				op->r_sg, op->r_nents,
				op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
		op->r_mapped = 0;
				op->op_sg, op->op_nents,
				op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
		op->op_mapped = 0;
	}

	/* If the user asked for a completion notification on this
	 * message, we can implement three different semantics:
	 * 1.	Notify when we received the ACK on the RDS message
	 *	that was queued with the RDMA. This provides reliable
	 *	notification of RDMA status at the expense of a one-way
	 *	packet delay.
	 * 2.	Notify when the IB stack gives us the completion event for
	 *	the RDMA operation.
	 * 3.	Notify when the IB stack gives us the completion event for
	 *	the accompanying RDS messages.
	 * Here, we implement approach #3. To implement approach #2,
	 * we would need to take an event for the rdma WR. To implement #1,
	 * don't call rds_rdma_send_complete at all, and fall back to the notify
	 * handling in the ACK processing code.
	 *
	 * Note: There's no need to explicitly sync any RDMA buffers using
	 * ib_dma_sync_sg_for_cpu - the completion for the RDMA
	 * operation itself unmapped the RDMA buffers, which takes care
	 * of synching.
	 */
	rds_ib_send_complete(container_of(op, struct rds_message, rdma),
			     wc_status, rds_rdma_send_complete);

	if (op->op_write)
		rds_stats_add(s_send_rdma_bytes, op->op_bytes);
	else
		rds_stats_add(s_recv_rdma_bytes, op->op_bytes);
}

static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic,
				 struct rds_ib_send_work *send,
				 int wc_status)
static void rds_ib_send_unmap_atomic(struct rds_ib_connection *ic,
				     struct rm_atomic_op *op,
				     int wc_status)
{
	struct rds_message *rm = send->s_rm;

	rdsdebug("ic %p send %p rm %p\n", ic, send, rm);

	ib_dma_unmap_sg(ic->i_cm_id->device,
			rm->m_sg, rm->m_nents,
			DMA_TO_DEVICE);

	if (rm->m_rdma_op != NULL) {
		rds_ib_send_unmap_rdma(ic, rm->m_rdma_op);

		/* If the user asked for a completion notification on this
		 * message, we can implement three different semantics:
		 * 1.	Notify when we received the ACK on the RDS message
		 *	that was queued with the RDMA. This provides reliable
		 *	notification of RDMA status at the expense of a one-way
		 *	packet delay.
		 * 2.	Notify when the IB stack gives us the completion event for
		 *	the RDMA operation.
		 * 3.	Notify when the IB stack gives us the completion event for
		 *	the accompanying RDS messages.
		 * Here, we implement approach #3. To implement approach #2,
		 * call rds_rdma_send_complete from the cq_handler. To implement #1,
		 * don't call rds_rdma_send_complete at all, and fall back to the notify
		 * handling in the ACK processing code.
		 *
		 * Note: There's no need to explicitly sync any RDMA buffers using
		 * ib_dma_sync_sg_for_cpu - the completion for the RDMA
		 * operation itself unmapped the RDMA buffers, which takes care
		 * of synching.
		 */
		rds_ib_send_rdma_complete(rm, wc_status);

	/* unmap atomic recvbuf */
	if (op->op_mapped) {
		ib_dma_unmap_sg(ic->i_cm_id->device, op->op_sg, 1,
				DMA_FROM_DEVICE);
		op->op_mapped = 0;
	}

		if (rm->m_rdma_op->r_write)
			rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes);
		else
			rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes);
	rds_ib_send_complete(container_of(op, struct rds_message, atomic),
			     wc_status, rds_atomic_send_complete);

	if (op->op_type == RDS_ATOMIC_TYPE_CSWP)
		rds_ib_stats_inc(s_ib_atomic_cswp);
	else
		rds_ib_stats_inc(s_ib_atomic_fadd);
}

/*
 * Unmap the resources associated with a struct send_work.
 *
 * Returns the rm for no good reason other than it is unobtainable
 * other than by switching on wr.opcode, currently, and the caller,
 * the event handler, needs it.
 */
static struct rds_message *rds_ib_send_unmap_op(struct rds_ib_connection *ic,
						struct rds_ib_send_work *send,
						int wc_status)
{
	struct rds_message *rm = NULL;

	/* In the error case, wc.opcode sometimes contains garbage */
	switch (send->s_wr.opcode) {
	case IB_WR_SEND:
		if (send->s_op) {
			rm = container_of(send->s_op, struct rds_message, data);
			rds_ib_send_unmap_data(ic, send->s_op, wc_status);
		}
		break;
	case IB_WR_RDMA_WRITE:
	case IB_WR_RDMA_READ:
		if (send->s_op) {
			rm = container_of(send->s_op, struct rds_message, rdma);
			rds_ib_send_unmap_rdma(ic, send->s_op, wc_status);
		}
		break;
	case IB_WR_ATOMIC_FETCH_AND_ADD:
	case IB_WR_ATOMIC_CMP_AND_SWP:
		if (send->s_op) {
			rm = container_of(send->s_op, struct rds_message, atomic);
			rds_ib_send_unmap_atomic(ic, send->s_op, wc_status);
		}
		break;
	default:
		if (printk_ratelimit())
			printk(KERN_NOTICE
			       "RDS/IB: %s: unexpected opcode 0x%x in WR!\n",
			       __func__, send->s_wr.opcode);
		break;
	}

	/* If anyone waited for this message to get flushed out, wake
	 * them up now */
	rds_message_unmapped(rm);

	send->s_wr.opcode = 0xdead;
	rds_message_put(rm);
	send->s_rm = NULL;

	return rm;
}

void rds_ib_send_init_ring(struct rds_ib_connection *ic)
...
...
@@ -133,23 +227,18 @@ void rds_ib_send_init_ring(struct rds_ib_connection *ic)
	for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
		struct ib_sge *sge;

		send->s_rm = NULL;
		send->s_op = NULL;

		send->s_wr.wr_id = i;
		send->s_wr.sg_list = send->s_sge;
		send->s_wr.num_sge = 1;
		send->s_wr.opcode = IB_WR_SEND;
		send->s_wr.send_flags = 0;
		send->s_wr.ex.imm_data = 0;

		sge = rds_ib_data_sge(ic, send->s_sge);
		sge->lkey = ic->i_mr->lkey;

		sge = rds_ib_header_sge(ic, send->s_sge);
		sge = &send->s_sge[0];
		sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header));
		sge->length = sizeof(struct rds_header);
		sge->lkey = ic->i_mr->lkey;

		send->s_sge[1].lkey = ic->i_mr->lkey;
	}
}
...
...
@@ -159,15 +248,23 @@ void rds_ib_send_clear_ring(struct rds_ib_connection *ic)
	u32 i;

	for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
		if (send->s_wr.opcode == 0xdead)
			continue;
		if (send->s_rm)
			rds_ib_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR);
		if (send->s_op)
			rds_ib_send_unmap_rdma(ic, send->s_op);
		if (send->s_op && send->s_wr.opcode != 0xdead)
			rds_ib_send_unmap_op(ic, send, IB_WC_WR_FLUSH_ERR);
	}
}

/*
 * The only fast path caller always has a non-zero nr, so we don't
 * bother testing nr before performing the atomic sub.
 */
static void rds_ib_sub_signaled(struct rds_ib_connection *ic, int nr)
{
	if ((atomic_sub_return(nr, &ic->i_signaled_sends) == 0) &&
	    waitqueue_active(&rds_ib_ring_empty_wait))
		wake_up(&rds_ib_ring_empty_wait);
	BUG_ON(atomic_read(&ic->i_signaled_sends) < 0);
}
/*
* The _oldest/_free ring operations here race cleanly with the alloc/unalloc
* operations performed in the send path. As the sender allocs and potentially
...
...
@@ -178,12 +275,14 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
{
	struct rds_connection *conn = context;
	struct rds_ib_connection *ic = conn->c_transport_data;
	struct rds_message *rm = NULL;
	struct ib_wc wc;
	struct rds_ib_send_work *send;
	u32 completed;
	u32 oldest;
	u32 i = 0;
	int ret;
	int nr_sig = 0;

	rdsdebug("cq %p conn %p\n", cq, conn);
	rds_ib_stats_inc(s_ib_tx_cq_call);
...
...
@@ -192,8 +291,9 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
		rdsdebug("ib_req_notify_cq send failed: %d\n", ret);

	while (ib_poll_cq(cq, 1, &wc) > 0) {
		rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
			 (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
		rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
			 (unsigned long long)wc.wr_id, wc.status,
			 rds_ib_wc_status_str(wc.status), wc.byte_len,
			 be32_to_cpu(wc.ex.imm_data));
		rds_ib_stats_inc(s_ib_tx_cq_event);
...
...
@@ -210,51 +310,30 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
		for (i = 0; i < completed; i++) {
			send = &ic->i_sends[oldest];
			if (send->s_wr.send_flags & IB_SEND_SIGNALED)
				nr_sig++;

			/* In the error case, wc.opcode sometimes contains garbage */
			switch (send->s_wr.opcode) {
			case IB_WR_SEND:
				if (send->s_rm)
					rds_ib_send_unmap_rm(ic, send, wc.status);
				break;
			case IB_WR_RDMA_WRITE:
			case IB_WR_RDMA_READ:
				/* Nothing to be done - the SG list will be unmapped
				 * when the SEND completes. */
				break;
			default:
				if (printk_ratelimit())
					printk(KERN_NOTICE
					       "RDS/IB: %s: unexpected opcode 0x%x in WR!\n",
					       __func__, send->s_wr.opcode);
				break;
			}
			rm = rds_ib_send_unmap_op(ic, send, wc.status);

			send->s_wr.opcode = 0xdead;
			send->s_wr.num_sge = 1;
			if (send->s_queued + HZ/2 < jiffies)
				rds_ib_stats_inc(s_ib_tx_stalled);

			/* If a RDMA operation produced an error, signal this right
			 * away. If we don't, the subsequent SEND that goes with this
			 * RDMA will be canceled with ERR_WFLUSH, and the application
			 * never learn that the RDMA failed. */
			if (unlikely(wc.status == IB_WC_REM_ACCESS_ERR && send->s_op)) {
				struct rds_message *rm;

				rm = rds_send_get_message(conn, send->s_op);
				if (rm) {
					if (rm->m_rdma_op)
						rds_ib_send_unmap_rdma(ic, rm->m_rdma_op);
					rds_ib_send_rdma_complete(rm, wc.status);
					rds_message_put(rm);
			if (send->s_op) {
				if (send->s_op == rm->m_final_op) {
					/* If anyone waited for this message to get flushed out, wake
					 * them up now */
					rds_message_unmapped(rm);
				}
				rds_message_put(rm);
				send->s_op = NULL;
			}

			oldest = (oldest + 1) % ic->i_send_ring.w_nr;
		}

		rds_ib_ring_free(&ic->i_send_ring, completed);
		rds_ib_sub_signaled(ic, nr_sig);
		nr_sig = 0;

		if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
		    test_bit(0, &conn->c_map_queued))
...
...
@@ -262,10 +341,10 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
/* We expect errors as the qp is drained during shutdown */
		if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) {
			rds_ib_conn_error(conn, "send completion on %pI4 "
				"had status %u, disconnecting and reconnecting\n",
				&conn->c_faddr, wc.status);
			rds_ib_conn_error(conn, "send completion on %pI4 had status "
					  "%u (%s), disconnecting and reconnecting\n",
					  &conn->c_faddr, wc.status,
					  rds_ib_wc_status_str(wc.status));
		}
	}
}
...
...
@@ -294,7 +373,7 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
* credits (see rds_ib_send_add_credits below).
*
* The RDS send code is essentially single-threaded; rds_send_xmit
 * grabs c_send_lock to ensure exclusive access to the send ring.
 * sets RDS_IN_XMIT to ensure exclusive access to the send ring.
* However, the ACK sending code is independent and can race with
* message SENDs.
*
...
...
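The reworded comment above reflects the send path's switch from holding c_send_lock to claiming a single RDS_IN_XMIT flag bit. The usual shape of such a test_and_set_bit() guard is sketched below under that assumption; the function name and the bit value are illustrative, not lifted from rds_send_xmit().

#include <linux/bitops.h>
#include <linux/errno.h>

#define DEMO_IN_XMIT 2	/* bit number chosen for the sketch */

/* Only one caller at a time gets past the guard on the flags word. */
static int demo_xmit(unsigned long *c_flags)
{
	if (test_and_set_bit(DEMO_IN_XMIT, c_flags))
		return -EBUSY;	/* someone else owns the send ring */

	/* ... drain the send queue onto the ring here ... */

	clear_bit(DEMO_IN_XMIT, c_flags);
	smp_mb__after_clear_bit();	/* pairs with the atomic test_and_set above */
	return 0;
}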
@@ -413,40 +492,21 @@ void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted)
	set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
}

static inline void
rds_ib_xmit_populate_wr(struct rds_ib_connection *ic,
			struct rds_ib_send_work *send, unsigned int pos,
			unsigned long buffer, unsigned int length,
			int send_flags)
static inline int rds_ib_set_wr_signal_state(struct rds_ib_connection *ic,
					     struct rds_ib_send_work *send,
					     bool notify)
{
	struct ib_sge *sge;

	WARN_ON(pos != send - ic->i_sends);

	send->s_wr.send_flags = send_flags;
	send->s_wr.opcode = IB_WR_SEND;
	send->s_wr.num_sge = 2;
	send->s_wr.next = NULL;
	send->s_queued = jiffies;
	send->s_op = NULL;

	if (length != 0) {
		sge = rds_ib_data_sge(ic, send->s_sge);
		sge->addr = buffer;
		sge->length = length;
		sge->lkey = ic->i_mr->lkey;

		sge = rds_ib_header_sge(ic, send->s_sge);
	} else {
		/* We're sending a packet with no payload. There is only
		 * one SGE */
		send->s_wr.num_sge = 1;
		sge = &send->s_sge[0];
	/*
	 * We want to delay signaling completions just enough to get
	 * the batching benefits but not so much that we create dead time
	 * on the wire.
	 */
	if (ic->i_unsignaled_wrs-- == 0 || notify) {
		ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
		send->s_wr.send_flags |= IB_SEND_SIGNALED;
		return 1;
	}

	sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header));
	sge->length = sizeof(struct rds_header);
	sge->lkey = ic->i_mr->lkey;
	return 0;
}
/*
...
...
@@ -475,13 +535,14 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
	u32 pos;
	u32 i;
	u32 work_alloc;
	u32 credit_alloc;
	u32 credit_alloc = 0;
	u32 posted;
	u32 adv_credits = 0;
	int send_flags = 0;
	int sent;
	int bytes_sent = 0;
	int ret;
	int flow_controlled = 0;
	int nr_sig = 0;

	BUG_ON(off % RDS_FRAG_SIZE);
	BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header));
...
...
@@ -507,14 +568,13 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
		goto out;
	}

	credit_alloc = work_alloc;
	if (ic->i_flowctl) {
		credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT);
		adv_credits += posted;
		if (credit_alloc < work_alloc) {
			rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc);
			work_alloc = credit_alloc;
			flow_controlled++;
			flow_controlled = 1;
		}
		if (work_alloc == 0) {
			set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
...
...
@@ -525,31 +585,25 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
}
/* map the message the first time we see it */
	if (ic->i_rm == NULL) {
		/*
		printk(KERN_NOTICE "rds_ib_xmit prep msg dport=%u flags=0x%x len=%d\n",
				be16_to_cpu(rm->m_inc.i_hdr.h_dport),
				rm->m_inc.i_hdr.h_flags,
				be32_to_cpu(rm->m_inc.i_hdr.h_len));
		*/
		if (rm->m_nents) {
			rm->m_count = ib_dma_map_sg(dev, rm->m_sg, rm->m_nents, DMA_TO_DEVICE);
			rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count);
			if (rm->m_count == 0) {
	if (!ic->i_data_op) {
		if (rm->data.op_nents) {
			rm->data.op_count = ib_dma_map_sg(dev,
							  rm->data.op_sg,
							  rm->data.op_nents,
							  DMA_TO_DEVICE);
			rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count);
			if (rm->data.op_count == 0) {
				rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
				rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
				ret = -ENOMEM; /* XXX ? */
				goto out;
			}
		} else {
			rm->m_count = 0;
			rm->data.op_count = 0;
		}

		ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
		ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes;
		rds_message_addref(rm);
		ic->i_rm = rm;
		ic->i_data_op = &rm->data;

		/* Finalize the header */
		if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags))
...
...
@@ -559,10 +613,10 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 	/* If it has a RDMA op, tell the peer we did it. This is
 	 * used by the peer to release use-once RDMA MRs. */
-	if (rm->m_rdma_op) {
+	if (rm->rdma.op_active) {
 		struct rds_ext_header_rdma ext_hdr;

-		ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key);
+		ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey);
 		rds_message_add_extension(&rm->m_inc.i_hdr,
 					  RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
 	}
...
@@ -582,99 +636,77 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 		/*
 		 * Update adv_credits since we reset the ACK_REQUIRED bit.
 		 */
-		rds_ib_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits);
-		adv_credits += posted;
-		BUG_ON(adv_credits > 255);
+		if (ic->i_flowctl) {
+			rds_ib_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits);
+			adv_credits += posted;
+			BUG_ON(adv_credits > 255);
+		}
 	}

-	send = &ic->i_sends[pos];
-	first = send;
-	prev = NULL;
-	scat = &rm->m_sg[sg];
-	sent = 0;
-	i = 0;
-
 	/* Sometimes you want to put a fence between an RDMA
 	 * READ and the following SEND.
 	 * We could either do this all the time
 	 * or when requested by the user. Right now, we let
 	 * the application choose.
 	 */
-	if (rm->m_rdma_op && rm->m_rdma_op->r_fence)
+	if (rm->rdma.op_active && rm->rdma.op_fence)
 		send_flags = IB_SEND_FENCE;

 	/*
 	 * We could be copying the header into the unused tail of the page.
 	 * That would need to be changed in the future when those pages might
 	 * be mapped userspace pages or page cache pages.  So instead we always
 	 * use a second sge and our long-lived ring of mapped headers.  We send
 	 * the header after the data so that the data payload can be aligned on
 	 * the receiver.
 	 */
+	/* Each frag gets a header. Msgs may be 0 bytes */
+	send = &ic->i_sends[pos];
+	first = send;
+	prev = NULL;
+	scat = &ic->i_data_op->op_sg[sg];
+	i = 0;
+	do {
+		unsigned int len = 0;

-	/* handle a 0-len message */
-	if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) {
-		rds_ib_xmit_populate_wr(ic, send, pos, 0, 0, send_flags);
-		goto add_header;
-	}
+		/* Set up the header */
+		send->s_wr.send_flags = send_flags;
+		send->s_wr.opcode = IB_WR_SEND;
+		send->s_wr.num_sge = 1;
+		send->s_wr.next = NULL;
+		send->s_queued = jiffies;
+		send->s_op = NULL;

-	/* if there's data reference it with a chain of work reqs */
-	for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) {
-		unsigned int len;
+		send->s_sge[0].addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header));
+		send->s_sge[0].length = sizeof(struct rds_header);

-		send = &ic->i_sends[pos];
+		memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header));

-		len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off);
-		rds_ib_xmit_populate_wr(ic, send, pos,
-					ib_sg_dma_address(dev, scat) + off, len,
-					send_flags);
+		/* Set up the data, if present */
+		if (i < work_alloc
+		    && scat != &rm->data.op_sg[rm->data.op_count]) {
+			len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off);
+			send->s_wr.num_sge = 2;

-		/*
-		 * We want to delay signaling completions just enough to get
-		 * the batching benefits but not so much that we create dead time
-		 * on the wire.
-		 */
-		if (ic->i_unsignaled_wrs-- == 0) {
-			ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
-			send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
-		}
+			send->s_sge[1].addr = ib_sg_dma_address(dev, scat) + off;
+			send->s_sge[1].length = len;

-		ic->i_unsignaled_bytes -= len;
-		if (ic->i_unsignaled_bytes <= 0) {
-			ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes;
-			send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+			bytes_sent += len;
+			off += len;
+			if (off == ib_sg_dma_len(dev, scat)) {
+				scat++;
+				off = 0;
+			}
 		}

+		rds_ib_set_wr_signal_state(ic, send, 0);
+
 		/*
 		 * Always signal the last one if we're stopping due to flow control.
 		 */
-		if (flow_controlled && i == (work_alloc-1))
+		if (ic->i_flowctl && flow_controlled && i == (work_alloc-1))
 			send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;

+		if (send->s_wr.send_flags & IB_SEND_SIGNALED)
+			nr_sig++;
+
 		rdsdebug("send %p wr %p num_sge %u next %p\n", send,
 			 &send->s_wr, send->s_wr.num_sge, send->s_wr.next);

-		sent += len;
-		off += len;
-		if (off == ib_sg_dma_len(dev, scat)) {
-			scat++;
-			off = 0;
-		}
-
-add_header:
-		/* Tack on the header after the data. The header SGE should already
-		 * have been set up to point to the right header buffer. */
-		memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header));
-
-		if (0) {
-			struct rds_header *hdr = &ic->i_send_hdrs[pos];
-
-			printk(KERN_NOTICE "send WR dport=%u flags=0x%x len=%d\n",
-			       be16_to_cpu(hdr->h_dport),
-			       hdr->h_flags,
-			       be32_to_cpu(hdr->h_len));
-		}
-		if (adv_credits) {
+		if (ic->i_flowctl && adv_credits) {
 			struct rds_header *hdr = &ic->i_send_hdrs[pos];

 			/* add credit and redo the header checksum */
...
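The comments in the hunk above explain the per-fragment layout: every fragment is one SEND whose first SGE is a slot in the long-lived, pre-mapped header ring and whose optional second SGE carries up to RDS_FRAG_SIZE bytes of payload, with the header sent after the data so the payload stays aligned at the receiver. A hedged sketch of that two-SGE work request; the field names follow the hunk, but the helper itself is illustrative, not a kernel function.

/* Illustrative only: build the two-SGE layout described above for one
 * fragment -- SGE 0 is the per-slot header from the coherent header ring,
 * SGE 1 is the payload fragment (omitted for 0-byte messages). Assumes the
 * rds_ib_* types from ib.h. */
static void demo_fill_frag_wr(struct rds_ib_connection *ic,
			      struct rds_ib_send_work *send,
			      u32 pos, u64 payload_dma, u32 payload_len)
{
	send->s_wr.opcode  = IB_WR_SEND;
	send->s_wr.num_sge = payload_len ? 2 : 1;

	/* header slot 'pos' of the pre-mapped header ring */
	send->s_sge[0].addr   = ic->i_send_hdrs_dma + pos * sizeof(struct rds_header);
	send->s_sge[0].length = sizeof(struct rds_header);
	send->s_sge[0].lkey   = ic->i_mr->lkey;

	if (payload_len) {
		send->s_sge[1].addr   = payload_dma;
		send->s_sge[1].length = payload_len;
		send->s_sge[1].lkey   = ic->i_mr->lkey;
	}
}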
@@ -689,20 +721,25 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 		prev = send;
 		pos = (pos + 1) % ic->i_send_ring.w_nr;
-	}
+		send = &ic->i_sends[pos];
+		i++;
+
+	} while (i < work_alloc
+		 && scat != &rm->data.op_sg[rm->data.op_count]);

 	/* Account the RDS header in the number of bytes we sent, but just once.
 	 * The caller has no concept of fragmentation. */
 	if (hdr_off == 0)
-		sent += sizeof(struct rds_header);
+		bytes_sent += sizeof(struct rds_header);

 	/* if we finished the message then send completion owns it */
-	if (scat == &rm->m_sg[rm->m_count]) {
-		prev->s_rm = ic->i_rm;
-		prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
-		ic->i_rm = NULL;
+	if (scat == &rm->data.op_sg[rm->data.op_count]) {
+		prev->s_op = ic->i_data_op;
+		prev->s_wr.send_flags |= IB_SEND_SOLICITED;
+		ic->i_data_op = NULL;
 	}

 	/* Put back wrs & credits we didn't use */
 	if (i < work_alloc) {
 		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
 		work_alloc = i;
...
@@ -710,6 +747,9 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 	if (ic->i_flowctl && i < credit_alloc)
 		rds_ib_send_add_credits(conn, credit_alloc - i);

+	if (nr_sig)
+		atomic_add(nr_sig, &ic->i_signaled_sends);
+
 	/* XXX need to worry about failed_wr and partial sends. */
 	failed_wr = &first->s_wr;
 	ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
...
@@ -720,32 +760,127 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 		printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 "
 		       "returned %d\n", &conn->c_faddr, ret);
 		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
-		if (prev->s_rm) {
-			ic->i_rm = prev->s_rm;
-			prev->s_rm = NULL;
+		rds_ib_sub_signaled(ic, nr_sig);
+		if (prev->s_op) {
+			ic->i_data_op = prev->s_op;
+			prev->s_op = NULL;
 		}

 		rds_ib_conn_error(ic->conn, "ib_post_send failed\n");
 		goto out;
 	}

-	ret = sent;
+	ret = bytes_sent;
 out:
 	BUG_ON(adv_credits);
 	return ret;
 }

-int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
+/*
+ * Issue atomic operation.
+ * A simplified version of the rdma case, we always map 1 SG, and
+ * only 8 bytes, for the return value from the atomic operation.
+ */
+int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
+{
+	struct rds_ib_connection *ic = conn->c_transport_data;
+	struct rds_ib_send_work *send = NULL;
+	struct ib_send_wr *failed_wr;
+	struct rds_ib_device *rds_ibdev;
+	u32 pos;
+	u32 work_alloc;
+	int ret;
+	int nr_sig = 0;
+
+	rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
+
+	work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, 1, &pos);
+	if (work_alloc != 1) {
+		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+		rds_ib_stats_inc(s_ib_tx_ring_full);
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/* address of send request in ring */
+	send = &ic->i_sends[pos];
+	send->s_queued = jiffies;
+
+	if (op->op_type == RDS_ATOMIC_TYPE_CSWP) {
+		send->s_wr.opcode = IB_WR_MASKED_ATOMIC_CMP_AND_SWP;
+		send->s_wr.wr.atomic.compare_add = op->op_m_cswp.compare;
+		send->s_wr.wr.atomic.swap = op->op_m_cswp.swap;
+		send->s_wr.wr.atomic.compare_add_mask = op->op_m_cswp.compare_mask;
+		send->s_wr.wr.atomic.swap_mask = op->op_m_cswp.swap_mask;
+	} else { /* FADD */
+		send->s_wr.opcode = IB_WR_MASKED_ATOMIC_FETCH_AND_ADD;
+		send->s_wr.wr.atomic.compare_add = op->op_m_fadd.add;
+		send->s_wr.wr.atomic.swap = 0;
+		send->s_wr.wr.atomic.compare_add_mask = op->op_m_fadd.nocarry_mask;
+		send->s_wr.wr.atomic.swap_mask = 0;
+	}
+	nr_sig = rds_ib_set_wr_signal_state(ic, send, op->op_notify);
+	send->s_wr.num_sge = 1;
+	send->s_wr.next = NULL;
+	send->s_wr.wr.atomic.remote_addr = op->op_remote_addr;
+	send->s_wr.wr.atomic.rkey = op->op_rkey;
+	send->s_op = op;
+	rds_message_addref(container_of(send->s_op, struct rds_message, atomic));
+
+	/* map 8 byte retval buffer to the device */
+	ret = ib_dma_map_sg(ic->i_cm_id->device, op->op_sg, 1, DMA_FROM_DEVICE);
+	rdsdebug("ic %p mapping atomic op %p. mapped %d pg\n", ic, op, ret);
+	if (ret != 1) {
+		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+		rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
+		ret = -ENOMEM; /* XXX ? */
+		goto out;
+	}
+
+	/* Convert our struct scatterlist to struct ib_sge */
+	send->s_sge[0].addr = ib_sg_dma_address(ic->i_cm_id->device, op->op_sg);
+	send->s_sge[0].length = ib_sg_dma_len(ic->i_cm_id->device, op->op_sg);
+	send->s_sge[0].lkey = ic->i_mr->lkey;
+
+	rdsdebug("rva %Lx rpa %Lx len %u\n", op->op_remote_addr,
+		 send->s_sge[0].addr, send->s_sge[0].length);
+
+	if (nr_sig)
+		atomic_add(nr_sig, &ic->i_signaled_sends);
+
+	failed_wr = &send->s_wr;
+	ret = ib_post_send(ic->i_cm_id->qp, &send->s_wr, &failed_wr);
+	rdsdebug("ic %p send %p (wr %p) ret %d wr %p\n", ic,
+		 send, &send->s_wr, ret, failed_wr);
+	BUG_ON(failed_wr != &send->s_wr);
+	if (ret) {
+		printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI4 "
+		       "returned %d\n", &conn->c_faddr, ret);
+		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+		rds_ib_sub_signaled(ic, nr_sig);
+		goto out;
+	}
+
+	if (unlikely(failed_wr != &send->s_wr)) {
+		printk(KERN_WARNING "RDS/IB: atomic ib_post_send() rc=%d, but failed_wqe updated!\n", ret);
+		BUG_ON(failed_wr != &send->s_wr);
+	}
+
+out:
+	return ret;
+}
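rds_ib_xmit_atomic() above maps the RDS compare-and-swap and fetch-and-add types onto the InfiniBand masked atomic verbs. As an illustration of what the masked compare-and-swap verb is understood to do to the remote 64-bit word (the HCA performs this atomically and returns the old value into the 8-byte buffer mapped from op->op_sg), here is a hedged user-space sketch; it describes the verb's effect, it is not kernel code.

#include <stdint.h>

/* Assumed semantics of IB_WR_MASKED_ATOMIC_CMP_AND_SWP on the remote word:
 * only the bits selected by compare_mask take part in the comparison, and
 * only the bits selected by swap_mask are replaced on a match. */
static uint64_t demo_masked_cswp(uint64_t *target,
				 uint64_t compare, uint64_t compare_mask,
				 uint64_t swap, uint64_t swap_mask)
{
	uint64_t old = *target;

	if ((old & compare_mask) == (compare & compare_mask))
		*target = (old & ~swap_mask) | (swap & swap_mask);

	return old;	/* returned whether or not the swap happened */
}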
+int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
 {
 	struct rds_ib_connection *ic = conn->c_transport_data;
 	struct rds_ib_send_work *send = NULL;
 	struct rds_ib_send_work *first;
 	struct rds_ib_send_work *prev;
 	struct ib_send_wr *failed_wr;
 	struct rds_ib_device *rds_ibdev;
 	struct scatterlist *scat;
 	unsigned long len;
-	u64 remote_addr = op->r_remote_addr;
+	u64 remote_addr = op->op_remote_addr;
+	u32 max_sge = ic->rds_ibdev->max_sge;
 	u32 pos;
 	u32 work_alloc;
 	u32 i;
...
@@ -753,29 +888,28 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
 	int sent;
 	int ret;
 	int num_sge;
+	int nr_sig = 0;

 	rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);

-	/* map the message the first time we see it */
-	if (!op->r_mapped) {
-		op->r_count = ib_dma_map_sg(ic->i_cm_id->device,
-					op->r_sg, op->r_nents, (op->r_write) ?
-					DMA_TO_DEVICE : DMA_FROM_DEVICE);
-		rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count);
-		if (op->r_count == 0) {
+	/* map the op the first time we see it */
+	if (!op->op_mapped) {
+		op->op_count = ib_dma_map_sg(ic->i_cm_id->device,
+					     op->op_sg, op->op_nents, (op->op_write) ?
+					     DMA_TO_DEVICE : DMA_FROM_DEVICE);
+		rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count);
+		if (op->op_count == 0) {
 			rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
 			ret = -ENOMEM; /* XXX ? */
 			goto out;
 		}

-		op->r_mapped = 1;
+		op->op_mapped = 1;
 	}

 	/*
 	 * Instead of knowing how to return a partial rdma read/write we insist that there
 	 * be enough work requests to send the entire message.
 	 */
-	i = ceil(op->r_count, rds_ibdev->max_sge);
+	i = ceil(op->op_count, max_sge);

 	work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos);
 	if (work_alloc != i) {
...
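The hunk above sizes the work-request allocation as ceil(op->op_count, max_sge): one work request per max_sge scatterlist entries, rounded up, so the whole RDMA op always fits. rds.h's ceil() is understood to be plain integer round-up division; a hedged restatement with a small check:

#include <assert.h>

/* Integer round-up division, as the rds ceil() macro is understood here. */
#define demo_ceil(x, y)	(((x) + (y) - 1) / (y))

static void demo_ceil_check(void)
{
	/* 70 SG entries with max_sge == 32 need 3 chained work requests */
	assert(demo_ceil(70, 32) == 3);
	assert(demo_ceil(64, 32) == 2);
}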
@@ -788,30 +922,24 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
 	send = &ic->i_sends[pos];
 	first = send;
 	prev = NULL;
-	scat = &op->r_sg[0];
+	scat = &op->op_sg[0];
 	sent = 0;
-	num_sge = op->r_count;
+	num_sge = op->op_count;

-	for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) {
+	for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) {
 		send->s_wr.send_flags = 0;
 		send->s_queued = jiffies;
-		/*
-		 * We want to delay signaling completions just enough to get
-		 * the batching benefits but not so much that we create dead time on the wire.
-		 */
-		if (ic->i_unsignaled_wrs-- == 0) {
-			ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
-			send->s_wr.send_flags = IB_SEND_SIGNALED;
-		}
+		send->s_op = NULL;
+
+		nr_sig += rds_ib_set_wr_signal_state(ic, send, op->op_notify);

-		send->s_wr.opcode = op->r_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
+		send->s_wr.opcode = op->op_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
 		send->s_wr.wr.rdma.remote_addr = remote_addr;
-		send->s_wr.wr.rdma.rkey = op->r_key;
-		send->s_op = op;
+		send->s_wr.wr.rdma.rkey = op->op_rkey;

-		if (num_sge > rds_ibdev->max_sge) {
-			send->s_wr.num_sge = rds_ibdev->max_sge;
-			num_sge -= rds_ibdev->max_sge;
+		if (num_sge > max_sge) {
+			send->s_wr.num_sge = max_sge;
+			num_sge -= max_sge;
 		} else {
 			send->s_wr.num_sge = num_sge;
 		}
...
@@ -821,7 +949,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
 		if (prev)
 			prev->s_wr.next = &send->s_wr;

-		for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) {
+		for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) {
 			len = ib_sg_dma_len(ic->i_cm_id->device, scat);
 			send->s_sge[j].addr = ib_sg_dma_address(ic->i_cm_id->device, scat);
...
@@ -843,15 +971,20 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
 			send = ic->i_sends;
 	}

-	/* if we finished the message then send completion owns it */
-	if (scat == &op->r_sg[op->r_count])
-		prev->s_wr.send_flags = IB_SEND_SIGNALED;
+	/* give a reference to the last op */
+	if (scat == &op->op_sg[op->op_count]) {
+		prev->s_op = op;
+		rds_message_addref(container_of(op, struct rds_message, rdma));
+	}

 	if (i < work_alloc) {
 		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
 		work_alloc = i;
 	}

+	if (nr_sig)
+		atomic_add(nr_sig, &ic->i_signaled_sends);
+
 	failed_wr = &first->s_wr;
 	ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
 	rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
...
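The "give a reference to the last op" change above pins the whole rds_message while its embedded rm_rdma_op is owned by the completion path, recovering the message from the op pointer with container_of(). A hedged miniature of that embedding pattern, with illustrative types:

#include <linux/kernel.h>	/* container_of() */

/* Minimal sketch: an op embedded in a larger message, recovered from the
 * op pointer alone. The demo_ names are stand-ins for rds_message/rm_rdma_op. */
struct demo_op {
	int op_active;
};

struct demo_message {
	int refcount;
	struct demo_op rdma;	/* embedded, like rm->rdma in rds */
};

static struct demo_message *demo_msg_from_op(struct demo_op *op)
{
	/* works because 'rdma' lives at a fixed offset inside demo_message */
	return container_of(op, struct demo_message, rdma);
}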
@@ -861,6 +994,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
 		printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 "
 		       "returned %d\n", &conn->c_faddr, ret);
 		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
+		rds_ib_sub_signaled(ic, nr_sig);
 		goto out;
 	}
...
net/rds/ib_stats.c
...
@@ -67,6 +67,8 @@ static const char *const rds_ib_stat_names[] = {
 	"ib_rdma_mr_pool_flush",
 	"ib_rdma_mr_pool_wait",
 	"ib_rdma_mr_pool_depleted",
+	"ib_atomic_cswp",
+	"ib_atomic_fadd",
 };

 unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
...
net/rds/ib_sysctl.c
...
@@ -49,10 +49,6 @@ unsigned long rds_ib_sysctl_max_unsig_wrs = 16;
 static unsigned long rds_ib_sysctl_max_unsig_wr_min = 1;
 static unsigned long rds_ib_sysctl_max_unsig_wr_max = 64;

-unsigned long rds_ib_sysctl_max_unsig_bytes = (16 << 20);
-static unsigned long rds_ib_sysctl_max_unsig_bytes_min = 1;
-static unsigned long rds_ib_sysctl_max_unsig_bytes_max = ~0UL;
-
 /*
  * This sysctl does nothing.
  *
...
@@ -93,15 +89,6 @@ ctl_table rds_ib_sysctl_table[] = {
 		.extra1		= &rds_ib_sysctl_max_unsig_wr_min,
 		.extra2		= &rds_ib_sysctl_max_unsig_wr_max,
 	},
-	{
-		.procname	= "max_unsignaled_bytes",
-		.data		= &rds_ib_sysctl_max_unsig_bytes,
-		.maxlen		= sizeof(unsigned long),
-		.mode		= 0644,
-		.proc_handler	= proc_doulongvec_minmax,
-		.extra1		= &rds_ib_sysctl_max_unsig_bytes_min,
-		.extra2		= &rds_ib_sysctl_max_unsig_bytes_max,
-	},
 	{
 		.procname	= "max_recv_allocation",
 		.data		= &rds_ib_sysctl_max_recv_allocation,
...
@@ -132,10 +119,10 @@ void rds_ib_sysctl_exit(void)
 	unregister_sysctl_table(rds_ib_sysctl_hdr);
 }

-int __init rds_ib_sysctl_init(void)
+int rds_ib_sysctl_init(void)
 {
 	rds_ib_sysctl_hdr = register_sysctl_paths(rds_ib_sysctl_path, rds_ib_sysctl_table);
-	if (rds_ib_sysctl_hdr == NULL)
+	if (!rds_ib_sysctl_hdr)
 		return -ENOMEM;
 	return 0;
 }
net/rds/info.c
...
@@ -76,7 +76,7 @@ void rds_info_register_func(int optname, rds_info_func func)
 	BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST);

 	spin_lock(&rds_info_lock);
-	BUG_ON(rds_info_funcs[offset] != NULL);
+	BUG_ON(rds_info_funcs[offset]);
 	rds_info_funcs[offset] = func;
 	spin_unlock(&rds_info_lock);
 }
...
@@ -102,7 +102,7 @@ EXPORT_SYMBOL_GPL(rds_info_deregister_func);
 */
 void rds_info_iter_unmap(struct rds_info_iterator *iter)
 {
-	if (iter->addr != NULL) {
+	if (iter->addr) {
 		kunmap_atomic(iter->addr, KM_USER0);
 		iter->addr = NULL;
 	}
...
@@ -117,7 +117,7 @@ void rds_info_copy(struct rds_info_iterator *iter, void *data,
 	unsigned long this;

 	while (bytes) {
-		if (iter->addr == NULL)
+		if (!iter->addr)
 			iter->addr = kmap_atomic(*iter->pages, KM_USER0);

 		this = min(bytes, PAGE_SIZE - iter->offset);
...
@@ -188,7 +188,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
 			>> PAGE_SHIFT;

 	pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL);
-	if (pages == NULL) {
+	if (!pages) {
 		ret = -ENOMEM;
 		goto out;
 	}
...
@@ -206,7 +206,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,

 call_func:
 	func = rds_info_funcs[optname - RDS_INFO_FIRST];
-	if (func == NULL) {
+	if (!func) {
 		ret = -ENOPROTOOPT;
 		goto out;
 	}
...
@@ -234,7 +234,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
 		ret = -EFAULT;

 out:
-	for (i = 0; pages != NULL && i < nr_pages; i++)
+	for (i = 0; pages && i < nr_pages; i++)
 		put_page(pages[i]);
 	kfree(pages);
...
net/rds/iw.c
...
@@ -264,7 +264,6 @@ struct rds_transport rds_iw_transport = {
 	.laddr_check		= rds_iw_laddr_check,
 	.xmit_complete		= rds_iw_xmit_complete,
 	.xmit			= rds_iw_xmit,
-	.xmit_cong_map		= NULL,
 	.xmit_rdma		= rds_iw_xmit_rdma,
 	.recv			= rds_iw_recv,
 	.conn_alloc		= rds_iw_conn_alloc,
...
@@ -272,7 +271,6 @@ struct rds_transport rds_iw_transport = {
 	.conn_connect		= rds_iw_conn_connect,
 	.conn_shutdown		= rds_iw_conn_shutdown,
 	.inc_copy_to_user	= rds_iw_inc_copy_to_user,
-	.inc_purge		= rds_iw_inc_purge,
 	.inc_free		= rds_iw_inc_free,
 	.cm_initiate_connect	= rds_iw_cm_initiate_connect,
 	.cm_handle_connect	= rds_iw_cm_handle_connect,
...
@@ -289,7 +287,7 @@ struct rds_transport rds_iw_transport = {
 	.t_prefer_loopback	= 1,
 };

-int __init rds_iw_init(void)
+int rds_iw_init(void)
 {
 	int ret;
...
net/rds/iw.h
...
@@ -70,7 +70,7 @@ struct rds_iw_send_work {
 	struct rds_message	*s_rm;

 	/* We should really put these into a union: */
-	struct rds_rdma_op	*s_op;
+	struct rm_rdma_op	*s_op;
 	struct rds_iw_mapping	*s_mapping;
 	struct ib_mr		*s_mr;
 	struct ib_fast_reg_page_list *s_page_list;
...
@@ -284,7 +284,7 @@ void rds_iw_conn_free(void *arg);
 int rds_iw_conn_connect(struct rds_connection *conn);
 void rds_iw_conn_shutdown(struct rds_connection *conn);
 void rds_iw_state_change(struct sock *sk);
-int __init rds_iw_listen_init(void);
+int rds_iw_listen_init(void);
 void rds_iw_listen_stop(void);
 void __rds_iw_conn_error(struct rds_connection *conn, const char *, ...);
 int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id,
...
@@ -321,12 +321,11 @@ void rds_iw_flush_mrs(void);
 void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id);

 /* ib_recv.c */
-int __init rds_iw_recv_init(void);
+int rds_iw_recv_init(void);
 void rds_iw_recv_exit(void);
 int rds_iw_recv(struct rds_connection *conn);
 int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
 		       gfp_t page_gfp, int prefill);
-void rds_iw_inc_purge(struct rds_incoming *inc);
 void rds_iw_inc_free(struct rds_incoming *inc);
 int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, size_t size);
...
@@ -358,7 +357,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
 void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context);
 void rds_iw_send_init_ring(struct rds_iw_connection *ic);
 void rds_iw_send_clear_ring(struct rds_iw_connection *ic);
-int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op);
+int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op);
 void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits);
 void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted);
 int rds_iw_send_grab_credits(struct rds_iw_connection *ic, u32 wanted,
...
@@ -371,7 +370,7 @@ unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter,
 				    unsigned int avail);

 /* ib_sysctl.c */
-int __init rds_iw_sysctl_init(void);
+int rds_iw_sysctl_init(void);
 void rds_iw_sysctl_exit(void);
 extern unsigned long rds_iw_sysctl_max_send_wr;
 extern unsigned long rds_iw_sysctl_max_recv_wr;
...
net/rds/iw_cm.c
...
@@ -257,7 +257,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn)
 	 * the rds_iwdev at all.
 	 */
 	rds_iwdev = ib_get_client_data(dev, &rds_iw_client);
-	if (rds_iwdev == NULL) {
+	if (!rds_iwdev) {
 		if (printk_ratelimit())
 			printk(KERN_NOTICE "RDS/IW: No client_data for device %s\n",
 			       dev->name);
...
@@ -292,7 +292,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn)
 					   ic->i_send_ring.w_nr *
 						sizeof(struct rds_header),
 					   &ic->i_send_hdrs_dma, GFP_KERNEL);
-	if (ic->i_send_hdrs == NULL) {
+	if (!ic->i_send_hdrs) {
 		ret = -ENOMEM;
 		rdsdebug("ib_dma_alloc_coherent send failed\n");
 		goto out;
...
@@ -302,7 +302,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn)
 					   ic->i_recv_ring.w_nr *
 						sizeof(struct rds_header),
 					   &ic->i_recv_hdrs_dma, GFP_KERNEL);
-	if (ic->i_recv_hdrs == NULL) {
+	if (!ic->i_recv_hdrs) {
 		ret = -ENOMEM;
 		rdsdebug("ib_dma_alloc_coherent recv failed\n");
 		goto out;
...
@@ -310,14 +310,14 @@ static int rds_iw_setup_qp(struct rds_connection *conn)
 	ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
 				       &ic->i_ack_dma, GFP_KERNEL);
-	if (ic->i_ack == NULL) {
+	if (!ic->i_ack) {
 		ret = -ENOMEM;
 		rdsdebug("ib_dma_alloc_coherent ack failed\n");
 		goto out;
 	}

 	ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_iw_send_work));
-	if (ic->i_sends == NULL) {
+	if (!ic->i_sends) {
 		ret = -ENOMEM;
 		rdsdebug("send allocation failed\n");
 		goto out;
...
@@ -325,7 +325,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn)
 	rds_iw_send_init_ring(ic);

 	ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_iw_recv_work));
-	if (ic->i_recvs == NULL) {
+	if (!ic->i_recvs) {
 		ret = -ENOMEM;
 		rdsdebug("recv allocation failed\n");
 		goto out;
...
@@ -696,7 +696,7 @@ int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp)
 	/* XXX too lazy? */
 	ic = kzalloc(sizeof(struct rds_iw_connection), GFP_KERNEL);
-	if (ic == NULL)
+	if (!ic)
 		return -ENOMEM;

 	INIT_LIST_HEAD(&ic->iw_node);
...
net/rds/iw_rdma.c
...
@@ -34,7 +34,6 @@
 #include <linux/slab.h>

 #include "rds.h"
-#include "rdma.h"
 #include "iw.h"
...
net/rds/iw_recv.c
...
@@ -53,7 +53,7 @@ static void rds_iw_frag_drop_page(struct rds_page_frag *frag)
 static void rds_iw_frag_free(struct rds_page_frag *frag)
 {
 	rdsdebug("frag %p page %p\n", frag, frag->f_page);
-	BUG_ON(frag->f_page != NULL);
+	BUG_ON(frag->f_page);
 	kmem_cache_free(rds_iw_frag_slab, frag);
 }
...
@@ -143,14 +143,14 @@ static int rds_iw_recv_refill_one(struct rds_connection *conn,
 	struct ib_sge *sge;
 	int ret = -ENOMEM;

-	if (recv->r_iwinc == NULL) {
+	if (!recv->r_iwinc) {
 		if (!atomic_add_unless(&rds_iw_allocation, 1, rds_iw_sysctl_max_recv_allocation)) {
 			rds_iw_stats_inc(s_iw_rx_alloc_limit);
 			goto out;
 		}
 		recv->r_iwinc = kmem_cache_alloc(rds_iw_incoming_slab, kptr_gfp);
-		if (recv->r_iwinc == NULL) {
+		if (!recv->r_iwinc) {
 			atomic_dec(&rds_iw_allocation);
 			goto out;
 		}
...
@@ -158,17 +158,17 @@ static int rds_iw_recv_refill_one(struct rds_connection *conn,
 		rds_inc_init(&recv->r_iwinc->ii_inc, conn, conn->c_faddr);
 	}

-	if (recv->r_frag == NULL) {
+	if (!recv->r_frag) {
 		recv->r_frag = kmem_cache_alloc(rds_iw_frag_slab, kptr_gfp);
-		if (recv->r_frag == NULL)
+		if (!recv->r_frag)
 			goto out;
 		INIT_LIST_HEAD(&recv->r_frag->f_item);
 		recv->r_frag->f_page = NULL;
 	}

-	if (ic->i_frag.f_page == NULL) {
+	if (!ic->i_frag.f_page) {
 		ic->i_frag.f_page = alloc_page(page_gfp);
-		if (ic->i_frag.f_page == NULL)
+		if (!ic->i_frag.f_page)
 			goto out;
 		ic->i_frag.f_offset = 0;
 	}
...
@@ -273,7 +273,7 @@ int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
 	return ret;
 }

-void rds_iw_inc_purge(struct rds_incoming *inc)
+static void rds_iw_inc_purge(struct rds_incoming *inc)
 {
 	struct rds_iw_incoming *iwinc;
 	struct rds_page_frag *frag;
...
@@ -716,7 +716,7 @@ static void rds_iw_process_recv(struct rds_connection *conn,
 	 * into the inc and save the inc so we can hang upcoming fragments
 	 * off its list.
 	 */
-	if (iwinc == NULL) {
+	if (!iwinc) {
 		iwinc = recv->r_iwinc;
 		recv->r_iwinc = NULL;
 		ic->i_iwinc = iwinc;
...
@@ -887,7 +887,7 @@ int rds_iw_recv(struct rds_connection *conn)
 	return ret;
 }

-int __init rds_iw_recv_init(void)
+int rds_iw_recv_init(void)
 {
 	struct sysinfo si;
 	int ret = -ENOMEM;
...
@@ -899,13 +899,13 @@ int __init rds_iw_recv_init(void)
 	rds_iw_incoming_slab = kmem_cache_create("rds_iw_incoming",
 					sizeof(struct rds_iw_incoming),
 					0, 0, NULL);
-	if (rds_iw_incoming_slab == NULL)
+	if (!rds_iw_incoming_slab)
 		goto out;

 	rds_iw_frag_slab = kmem_cache_create("rds_iw_frag",
 					sizeof(struct rds_page_frag),
 					0, 0, NULL);
-	if (rds_iw_frag_slab == NULL)
+	if (!rds_iw_frag_slab)
 		kmem_cache_destroy(rds_iw_incoming_slab);
 	else
 		ret = 0;
...
net/rds/iw_send.c
...
@@ -36,7 +36,6 @@
 #include <linux/dmapool.h>

 #include "rds.h"
-#include "rdma.h"
 #include "iw.h"

 static void rds_iw_send_rdma_complete(struct rds_message *rm,
...
@@ -64,13 +63,13 @@ static void rds_iw_send_rdma_complete(struct rds_message *rm,
 }

 static void rds_iw_send_unmap_rdma(struct rds_iw_connection *ic,
-		   struct rds_rdma_op *op)
+		   struct rm_rdma_op *op)
 {
-	if (op->r_mapped) {
+	if (op->op_mapped) {
 		ib_dma_unmap_sg(ic->i_cm_id->device,
-			op->r_sg, op->r_nents,
-			op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
-		op->r_mapped = 0;
+			op->op_sg, op->op_nents,
+			op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+		op->op_mapped = 0;
 	}
 }
...
@@ -83,11 +82,11 @@ static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic,
 	rdsdebug("ic %p send %p rm %p\n", ic, send, rm);

 	ib_dma_unmap_sg(ic->i_cm_id->device,
-		rm->m_sg, rm->m_nents,
+		rm->data.op_sg, rm->data.op_nents,
 		DMA_TO_DEVICE);

-	if (rm->m_rdma_op != NULL) {
-		rds_iw_send_unmap_rdma(ic, rm->m_rdma_op);
+	if (rm->rdma.op_active) {
+		rds_iw_send_unmap_rdma(ic, &rm->rdma);

 		/* If the user asked for a completion notification on this
 		 * message, we can implement three different semantics:
...
@@ -111,10 +110,10 @@ static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic,
 		 */
 		rds_iw_send_rdma_complete(rm, wc_status);

-		if (rm->m_rdma_op->r_write)
-			rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes);
+		if (rm->rdma.op_write)
+			rds_stats_add(s_send_rdma_bytes, rm->rdma.op_bytes);
 		else
-			rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes);
+			rds_stats_add(s_recv_rdma_bytes, rm->rdma.op_bytes);
 	}

 	/* If anyone waited for this message to get flushed out, wake
...
@@ -556,25 +555,27 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
 	}

 	/* map the message the first time we see it */
-	if (ic->i_rm == NULL) {
+	if (!ic->i_rm) {
 		/*
 		printk(KERN_NOTICE "rds_iw_xmit prep msg dport=%u flags=0x%x len=%d\n",
 				be16_to_cpu(rm->m_inc.i_hdr.h_dport),
 				rm->m_inc.i_hdr.h_flags,
 				be32_to_cpu(rm->m_inc.i_hdr.h_len));
 		*/
-		if (rm->m_nents) {
-			rm->m_count = ib_dma_map_sg(dev, rm->m_sg, rm->m_nents, DMA_TO_DEVICE);
-			rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count);
-			if (rm->m_count == 0) {
+		if (rm->data.op_nents) {
+			rm->data.op_count = ib_dma_map_sg(dev, rm->data.op_sg,
+							  rm->data.op_nents,
+							  DMA_TO_DEVICE);
+			rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count);
+			if (rm->data.op_count == 0) {
 				rds_iw_stats_inc(s_iw_tx_sg_mapping_failure);
 				rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
 				ret = -ENOMEM; /* XXX ? */
 				goto out;
 			}
 		} else {
-			rm->m_count = 0;
+			rm->data.op_count = 0;
 		}

 		ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
...
@@ -590,10 +591,10 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
 	/* If it has a RDMA op, tell the peer we did it. This is
 	 * used by the peer to release use-once RDMA MRs. */
-	if (rm->m_rdma_op) {
+	if (rm->rdma.op_active) {
 		struct rds_ext_header_rdma ext_hdr;

-		ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key);
+		ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey);
 		rds_message_add_extension(&rm->m_inc.i_hdr,
 				RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
 	}
...
@@ -621,7 +622,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
 	send = &ic->i_sends[pos];
 	first = send;
 	prev = NULL;
-	scat = &rm->m_sg[sg];
+	scat = &rm->data.op_sg[sg];
 	sent = 0;
 	i = 0;
...
@@ -631,7 +632,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
 	 * or when requested by the user. Right now, we let
 	 * the application choose.
 	 */
-	if (rm->m_rdma_op && rm->m_rdma_op->r_fence)
+	if (rm->rdma.op_active && rm->rdma.op_fence)
 		send_flags = IB_SEND_FENCE;

 	/*
...
@@ -650,7 +651,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
 	}

 	/* if there's data reference it with a chain of work reqs */
-	for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) {
+	for (; i < work_alloc && scat != &rm->data.op_sg[rm->data.op_count]; i++) {
 		unsigned int len;

 		send = &ic->i_sends[pos];
...
@@ -728,7 +729,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
 	sent += sizeof(struct rds_header);

 	/* if we finished the message then send completion owns it */
-	if (scat == &rm->m_sg[rm->m_count]) {
+	if (scat == &rm->data.op_sg[rm->data.op_count]) {
 		prev->s_rm = ic->i_rm;
 		prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
 		ic->i_rm = NULL;
...
@@ -784,7 +785,7 @@ static void rds_iw_build_send_fastreg(struct rds_iw_device *rds_iwdev, struct rd
 	ib_update_fast_reg_key(send->s_mr, send->s_remap_count++);
 }

-int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
+int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
 {
 	struct rds_iw_connection *ic = conn->c_transport_data;
 	struct rds_iw_send_work *send = NULL;
...
@@ -794,7 +795,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
 	struct rds_iw_device *rds_iwdev;
 	struct scatterlist *scat;
 	unsigned long len;
-	u64 remote_addr = op->r_remote_addr;
+	u64 remote_addr = op->op_remote_addr;
 	u32 pos, fr_pos;
 	u32 work_alloc;
 	u32 i;
...
@@ -806,21 +807,21 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
 	rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);

 	/* map the message the first time we see it */
-	if (!op->r_mapped) {
-		op->r_count = ib_dma_map_sg(ic->i_cm_id->device,
-					op->r_sg, op->r_nents, (op->r_write) ?
-					DMA_TO_DEVICE : DMA_FROM_DEVICE);
-		rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count);
-		if (op->r_count == 0) {
+	if (!op->op_mapped) {
+		op->op_count = ib_dma_map_sg(ic->i_cm_id->device,
+					op->op_sg, op->op_nents, (op->op_write) ?
+					DMA_TO_DEVICE : DMA_FROM_DEVICE);
+		rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count);
+		if (op->op_count == 0) {
 			rds_iw_stats_inc(s_iw_tx_sg_mapping_failure);
 			ret = -ENOMEM; /* XXX ? */
 			goto out;
 		}

-		op->r_mapped = 1;
+		op->op_mapped = 1;
 	}

-	if (!op->r_write) {
+	if (!op->op_write) {
 		/* Alloc space on the send queue for the fastreg */
 		work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, 1, &fr_pos);
 		if (work_alloc != 1) {
...
@@ -835,7 +836,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
 	 * Instead of knowing how to return a partial rdma read/write we insist that there
 	 * be enough work requests to send the entire message.
 	 */
-	i = ceil(op->r_count, rds_iwdev->max_sge);
+	i = ceil(op->op_count, rds_iwdev->max_sge);

 	work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos);
 	if (work_alloc != i) {
...
@@ -846,17 +847,17 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
 	}

 	send = &ic->i_sends[pos];
-	if (!op->r_write) {
+	if (!op->op_write) {
 		first = prev = &ic->i_sends[fr_pos];
 	} else {
 		first = send;
 		prev = NULL;
 	}
-	scat = &op->r_sg[0];
+	scat = &op->op_sg[0];
 	sent = 0;
-	num_sge = op->r_count;
+	num_sge = op->op_count;

-	for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) {
+	for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) {
 		send->s_wr.send_flags = 0;
 		send->s_queued = jiffies;
...
@@ -873,13 +874,13 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
 		 * for local access after RDS is finished with it, using
 		 * IB_WR_RDMA_READ_WITH_INV will invalidate it after the read has completed.
 		 */
-		if (op->r_write)
+		if (op->op_write)
 			send->s_wr.opcode = IB_WR_RDMA_WRITE;
 		else
 			send->s_wr.opcode = IB_WR_RDMA_READ_WITH_INV;

 		send->s_wr.wr.rdma.remote_addr = remote_addr;
-		send->s_wr.wr.rdma.rkey = op->r_key;
+		send->s_wr.wr.rdma.rkey = op->op_rkey;
 		send->s_op = op;

 		if (num_sge > rds_iwdev->max_sge) {
...
@@ -893,7 +894,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
 		if (prev)
 			prev->s_wr.next = &send->s_wr;

-		for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) {
+		for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) {
 			len = ib_sg_dma_len(ic->i_cm_id->device, scat);

 			if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV)
...
@@ -927,7 +928,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
 	}

 	/* if we finished the message then send completion owns it */
-	if (scat == &op->r_sg[op->r_count])
+	if (scat == &op->op_sg[op->op_count])
 		first->s_wr.send_flags = IB_SEND_SIGNALED;

 	if (i < work_alloc) {
...
@@ -941,9 +942,9 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
 	 * adapters do not allow using the lkey for this at all.  To bypass this use a
 	 * fastreg_mr (or possibly a dma_mr)
 	 */
-	if (!op->r_write) {
+	if (!op->op_write) {
 		rds_iw_build_send_fastreg(rds_iwdev, ic, &ic->i_sends[fr_pos],
-			op->r_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr);
+			op->op_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr);
 		work_alloc++;
 	}
...
net/rds/iw_sysctl.c
...
@@ -122,10 +122,10 @@ void rds_iw_sysctl_exit(void)
 	unregister_sysctl_table(rds_iw_sysctl_hdr);
 }

-int __init rds_iw_sysctl_init(void)
+int rds_iw_sysctl_init(void)
 {
 	rds_iw_sysctl_hdr = register_sysctl_paths(rds_iw_sysctl_path, rds_iw_sysctl_table);
-	if (rds_iw_sysctl_hdr == NULL)
+	if (!rds_iw_sysctl_hdr)
 		return -ENOMEM;
 	return 0;
 }
net/rds/loop.c
...
@@ -61,10 +61,17 @@ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm,
 			 unsigned int hdr_off, unsigned int sg,
 			 unsigned int off)
 {
+	/* Do not send cong updates to loopback */
+	if (rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) {
+		rds_cong_map_updated(conn->c_fcong, ~(u64)0);
+		return sizeof(struct rds_header) + RDS_CONG_MAP_BYTES;
+	}
+
 	BUG_ON(hdr_off || sg || off);

 	rds_inc_init(&rm->m_inc, conn, conn->c_laddr);
-	rds_message_addref(rm); /* for the inc */
+	/* For the embedded inc. Matching put is in loop_inc_free() */
+	rds_message_addref(rm);

 	rds_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc,
 			  GFP_KERNEL, KM_USER0);
...
@@ -77,16 +84,14 @@ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm,
 	return sizeof(struct rds_header) + be32_to_cpu(rm->m_inc.i_hdr.h_len);
 }

-static int rds_loop_xmit_cong_map(struct rds_connection *conn,
-				  struct rds_cong_map *map,
-				  unsigned long offset)
+/*
+ * See rds_loop_xmit(). Since our inc is embedded in the rm, we
+ * make sure the rm lives at least until the inc is done.
+ */
+static void rds_loop_inc_free(struct rds_incoming *inc)
 {
-	BUG_ON(offset);
-	BUG_ON(map != conn->c_lcong);
-
-	rds_cong_map_updated(conn->c_fcong, ~(u64)0);
-
-	return sizeof(struct rds_header) + RDS_CONG_MAP_BYTES;
+	struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
+	rds_message_put(rm);
 }

 /* we need to at least give the thread something to succeed */
...
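rds_loop_inc_free() completes the lifetime rule stated in the comment above: the extra reference taken in rds_loop_xmit() keeps the rds_message alive until the receive side is done with the embedded inc. A hedged sketch of that addref-on-handoff / put-on-free pairing; the demo_ helpers are stand-ins, while the rds_message_* calls and container_of() usage are the ones from this diff (assumes rds.h).

/* Sketch of the pairing used by the loopback transport. */
static void demo_loop_deliver(struct rds_message *rm)
{
	rds_message_addref(rm);		/* for the embedded rm->m_inc */
	/* ... hand &rm->m_inc to the receive path ... */
}

static void demo_loop_inc_free(struct rds_incoming *inc)
{
	/* the inc lives inside the rm, so dropping the inc drops the rm */
	rds_message_put(container_of(inc, struct rds_message, m_inc));
}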
@@ -112,7 +117,7 @@ static int rds_loop_conn_alloc(struct rds_connection *conn, gfp_t gfp)
 	unsigned long flags;

 	lc = kzalloc(sizeof(struct rds_loop_connection), GFP_KERNEL);
-	if (lc == NULL)
+	if (!lc)
 		return -ENOMEM;

 	INIT_LIST_HEAD(&lc->loop_node);
...
@@ -169,14 +174,12 @@ void rds_loop_exit(void)
 */
struct rds_transport rds_loop_transport = {
 	.xmit			= rds_loop_xmit,
-	.xmit_cong_map		= rds_loop_xmit_cong_map,
 	.recv			= rds_loop_recv,
 	.conn_alloc		= rds_loop_conn_alloc,
 	.conn_free		= rds_loop_conn_free,
 	.conn_connect		= rds_loop_conn_connect,
 	.conn_shutdown		= rds_loop_conn_shutdown,
 	.inc_copy_to_user	= rds_message_inc_copy_to_user,
-	.inc_purge		= rds_message_inc_purge,
-	.inc_free		= rds_message_inc_free,
+	.inc_free		= rds_loop_inc_free,
 	.t_name			= "loopback",
};
net/rds/message.c
...
@@ -34,9 +34,6 @@
 #include <linux/slab.h>

 #include "rds.h"
-#include "rdma.h"
-
-static DECLARE_WAIT_QUEUE_HEAD(rds_message_flush_waitq);

 static unsigned int	rds_exthdr_size[__RDS_EXTHDR_MAX] = {
 [RDS_EXTHDR_NONE]	= 0,
...
@@ -63,29 +60,31 @@ static void rds_message_purge(struct rds_message *rm)
 	if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags)))
 		return;

-	for (i = 0; i < rm->m_nents; i++) {
-		rdsdebug("putting data page %p\n", (void *)sg_page(&rm->m_sg[i]));
+	for (i = 0; i < rm->data.op_nents; i++) {
+		rdsdebug("putting data page %p\n", (void *)sg_page(&rm->data.op_sg[i]));
 		/* XXX will have to put_page for page refs */
-		__free_page(sg_page(&rm->m_sg[i]));
+		__free_page(sg_page(&rm->data.op_sg[i]));
 	}
-	rm->m_nents = 0;
+	rm->data.op_nents = 0;

-	if (rm->m_rdma_op)
-		rds_rdma_free_op(rm->m_rdma_op);
-	if (rm->m_rdma_mr)
-		rds_mr_put(rm->m_rdma_mr);
-}
-
-void rds_message_inc_purge(struct rds_incoming *inc)
-{
-	struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
-	rds_message_purge(rm);
+	if (rm->rdma.op_active)
+		rds_rdma_free_op(&rm->rdma);
+	if (rm->rdma.op_rdma_mr)
+		rds_mr_put(rm->rdma.op_rdma_mr);
+
+	if (rm->atomic.op_active)
+		rds_atomic_free_op(&rm->atomic);
+	if (rm->atomic.op_rdma_mr)
+		rds_mr_put(rm->atomic.op_rdma_mr);
 }

 void rds_message_put(struct rds_message *rm)
 {
 	rdsdebug("put rm %p ref %d\n", rm, atomic_read(&rm->m_refcount));
+	if (atomic_read(&rm->m_refcount) == 0) {
+		printk(KERN_CRIT "danger refcount zero on %p\n", rm);
+		WARN_ON(1);
+	}

 	if (atomic_dec_and_test(&rm->m_refcount)) {
 		BUG_ON(!list_empty(&rm->m_sock_item));
 		BUG_ON(!list_empty(&rm->m_conn_item));
...
@@ -96,12 +95,6 @@ void rds_message_put(struct rds_message *rm)
 }
 EXPORT_SYMBOL_GPL(rds_message_put);

-void rds_message_inc_free(struct rds_incoming *inc)
-{
-	struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
-	rds_message_put(rm);
-}
-
 void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
 				 __be16 dport, u64 seq)
 {
...
@@ -214,41 +207,68 @@ int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 o
 }
 EXPORT_SYMBOL_GPL(rds_message_add_rdma_dest_extension);

-struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp)
+/*
+ * Each rds_message is allocated with extra space for the scatterlist entries
+ * rds ops will need. This is to minimize memory allocation count. Then, each rds op
+ * can grab SGs when initializing its part of the rds_message.
+ */
+struct rds_message *rds_message_alloc(unsigned int extra_len, gfp_t gfp)
 {
 	struct rds_message *rm;

-	rm = kzalloc(sizeof(struct rds_message) +
-		     (nents * sizeof(struct scatterlist)), gfp);
+	rm = kzalloc(sizeof(struct rds_message) + extra_len, gfp);
 	if (!rm)
 		goto out;

-	if (nents)
-		sg_init_table(rm->m_sg, nents);
+	rm->m_used_sgs = 0;
+	rm->m_total_sgs = extra_len / sizeof(struct scatterlist);
+
 	atomic_set(&rm->m_refcount, 1);
 	INIT_LIST_HEAD(&rm->m_sock_item);
 	INIT_LIST_HEAD(&rm->m_conn_item);
 	spin_lock_init(&rm->m_rs_lock);
+	init_waitqueue_head(&rm->m_flush_wait);

 out:
 	return rm;
 }

+/*
+ * RDS ops use this to grab SG entries from the rm's sg pool.
+ */
+struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents)
+{
+	struct scatterlist *sg_first = (struct scatterlist *) &rm[1];
+	struct scatterlist *sg_ret;
+
+	WARN_ON(rm->m_used_sgs + nents > rm->m_total_sgs);
+	WARN_ON(!nents);
+
+	sg_ret = &sg_first[rm->m_used_sgs];
+	sg_init_table(sg_ret, nents);
+	rm->m_used_sgs += nents;
+
+	return sg_ret;
+}
+
 struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len)
 {
 	struct rds_message *rm;
 	unsigned int i;
+	int num_sgs = ceil(total_len, PAGE_SIZE);
+	int extra_bytes = num_sgs * sizeof(struct scatterlist);

-	rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL);
-	if (rm == NULL)
+	rm = rds_message_alloc(extra_bytes, GFP_NOWAIT);
+	if (!rm)
 		return ERR_PTR(-ENOMEM);

 	set_bit(RDS_MSG_PAGEVEC, &rm->m_flags);
 	rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);
-	rm->m_nents = ceil(total_len, PAGE_SIZE);
+	rm->data.op_nents = ceil(total_len, PAGE_SIZE);
+	rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs);

-	for (i = 0; i < rm->m_nents; ++i) {
-		sg_set_page(&rm->m_sg[i],
+	for (i = 0; i < rm->data.op_nents; ++i) {
+		sg_set_page(&rm->data.op_sg[i],
 			    virt_to_page(page_addrs[i]),
 			    PAGE_SIZE, 0);
 	}
...
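With the scheme above, one kzalloc() covers the rds_message plus every scatterlist entry its ops will ever need, and rds_message_alloc_sgs() parcels those entries out. A hedged usage sketch for a plain data message; the real callers live in send.c and may differ in detail (assumes rds.h for ceil(), struct rds_message and the two helpers).

/* Sketch, assuming a data payload of 'total_len' bytes and no rdma/atomic op:
 * one allocation covers the message and all SG entries it will hand out. */
static struct rds_message *demo_alloc_data_msg(unsigned int total_len, gfp_t gfp)
{
	int num_sgs = ceil(total_len, PAGE_SIZE);
	struct rds_message *rm;

	rm = rds_message_alloc(num_sgs * sizeof(struct scatterlist), gfp);
	if (!rm)
		return NULL;

	/* the data op grabs its SG entries from the trailing pool */
	rm->data.op_nents = num_sgs;
	rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs);
	return rm;
}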
@@ -256,40 +276,33 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in
 	return rm;
 }

-struct rds_message *rds_message_copy_from_user(struct iovec *first_iov,
+int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov,
 					       size_t total_len)
 {
 	unsigned long to_copy;
 	unsigned long iov_off;
 	unsigned long sg_off;
-	struct rds_message *rm;
 	struct iovec *iov;
 	struct scatterlist *sg;
-	int ret;
-
-	rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL);
-	if (rm == NULL) {
-		ret = -ENOMEM;
-		goto out;
-	}
+	int ret = 0;

 	rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);

 	/*
 	 * now allocate and copy in the data payload.
 	 */
-	sg = rm->m_sg;
+	sg = rm->data.op_sg;
 	iov = first_iov;
 	iov_off = 0;
 	sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */

 	while (total_len) {
-		if (sg_page(sg) == NULL) {
+		if (!sg_page(sg)) {
 			ret = rds_page_remainder_alloc(sg, total_len,
 						       GFP_HIGHUSER);
 			if (ret)
 				goto out;
-			rm->m_nents++;
+			rm->data.op_nents++;
 			sg_off = 0;
 		}
...
@@ -320,14 +333,8 @@ struct rds_message *rds_message_copy_from_user(struct iovec *first_iov,
 		sg++;
 	}

-	ret = 0;
 out:
-	if (ret) {
-		if (rm)
-			rds_message_put(rm);
-		rm = ERR_PTR(ret);
-	}
-
-	return rm;
+	return ret;
 }

 int rds_message_inc_copy_to_user(struct rds_incoming *inc,
...
@@ -348,7 +355,7 @@ int rds_message_inc_copy_to_user(struct rds_incoming *inc,
 	iov = first_iov;
 	iov_off = 0;
-	sg = rm->m_sg;
+	sg = rm->data.op_sg;
 	vec_off = 0;
 	copied = 0;
...
@@ -394,15 +401,14 @@ int rds_message_inc_copy_to_user(struct rds_incoming *inc,
 */
void rds_message_wait(struct rds_message *rm)
{
-	wait_event(rds_message_flush_waitq,
+	wait_event_interruptible(rm->m_flush_wait,
 		   !test_bit(RDS_MSG_MAPPED, &rm->m_flags));
}

void rds_message_unmapped(struct rds_message *rm)
{
 	clear_bit(RDS_MSG_MAPPED, &rm->m_flags);
-	if (waitqueue_active(&rds_message_flush_waitq))
-		wake_up(&rds_message_flush_waitq);
+	wake_up_interruptible(&rm->m_flush_wait);
}
EXPORT_SYMBOL_GPL(rds_message_unmapped);
net/rds/page.c
...
@@ -116,7 +116,7 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
 	/* jump straight to allocation if we're trying for a huge page */
 	if (bytes >= PAGE_SIZE) {
 		page = alloc_page(gfp);
-		if (page == NULL) {
+		if (!page) {
 			ret = -ENOMEM;
 		} else {
 			sg_set_page(scat, page, PAGE_SIZE, 0);
...
@@ -162,7 +162,7 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
 		rem = &per_cpu(rds_page_remainders, get_cpu());
 		local_irq_save(flags);

-		if (page == NULL) {
+		if (!page) {
 			ret = -ENOMEM;
 			break;
 		}
...
@@ -186,6 +186,7 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
 			 ret ? 0 : scat->length);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(rds_page_remainder_alloc);

 static int rds_page_remainder_cpu_notify(struct notifier_block *self,
 					 unsigned long action, void *hcpu)
...
net/rds/rdma.c
...
@@ -35,7 +35,7 @@
 #include <linux/rbtree.h>
 #include <linux/dma-mapping.h> /* for DMA_*_DEVICE */

-#include "rdma.h"
+#include "rds.h"

 /*
  * XXX
...
@@ -130,14 +130,22 @@ void rds_rdma_drop_keys(struct rds_sock *rs)
 {
 	struct rds_mr *mr;
 	struct rb_node *node;
+	unsigned long flags;

 	/* Release any MRs associated with this socket */
+	spin_lock_irqsave(&rs->rs_rdma_lock, flags);
 	while ((node = rb_first(&rs->rs_rdma_keys))) {
 		mr = container_of(node, struct rds_mr, r_rb_node);
 		if (mr->r_trans == rs->rs_transport)
 			mr->r_invalidate = 0;
+		rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
+		RB_CLEAR_NODE(&mr->r_rb_node);
+		spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+		rds_destroy_mr(mr);
 		rds_mr_put(mr);
+		spin_lock_irqsave(&rs->rs_rdma_lock, flags);
 	}
+	spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);

 	if (rs->rs_transport && rs->rs_transport->flush_mrs)
 		rs->rs_transport->flush_mrs();
...
@@ -181,7 +189,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
 		goto out;
 	}

-	if (rs->rs_transport->get_mr == NULL) {
+	if (!rs->rs_transport->get_mr) {
 		ret = -EOPNOTSUPP;
 		goto out;
 	}
...
@@ -197,13 +205,13 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
 	/* XXX clamp nr_pages to limit the size of this alloc? */
 	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
-	if (pages == NULL) {
+	if (!pages) {
 		ret = -ENOMEM;
 		goto out;
 	}

 	mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL);
-	if (mr == NULL) {
+	if (!mr) {
 		ret = -ENOMEM;
 		goto out;
 	}
...
@@ -230,13 +238,13 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
 	 * r/o or r/w. We need to assume r/w, or we'll do a lot of RDMA to
 	 * the zero page.
 	 */
-	ret = rds_pin_pages(args->vec.addr & PAGE_MASK, nr_pages, pages, 1);
+	ret = rds_pin_pages(args->vec.addr, nr_pages, pages, 1);
 	if (ret < 0)
 		goto out;

 	nents = ret;
 	sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL);
-	if (sg == NULL) {
+	if (!sg) {
 		ret = -ENOMEM;
 		goto out;
 	}
...
@@ -406,68 +414,127 @@ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force)
 	spin_lock_irqsave(&rs->rs_rdma_lock, flags);
 	mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
-	if (mr && (mr->r_use_once || force)) {
+	if (!mr) {
+		printk(KERN_ERR "rds: trying to unuse MR with unknown r_key %u!\n", r_key);
+		spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
+		return;
+	}
+
+	if (mr->r_use_once || force) {
 		rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
 		RB_CLEAR_NODE(&mr->r_rb_node);
 		zot_me = 1;
-	} else if (mr)
-		atomic_inc(&mr->r_refcount);
+	}
 	spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);

 	/* May have to issue a dma_sync on this memory region.
 	 * Note we could avoid this if the operation was a RDMA READ,
 	 * but at this point we can't tell. */
-	if (mr != NULL) {
-		if (mr->r_trans->sync_mr)
-			mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE);
-
-		/* If the MR was marked as invalidate, this will
-		 * trigger an async flush. */
-		if (zot_me)
-			rds_destroy_mr(mr);
-		rds_mr_put(mr);
-	}
+	if (mr->r_trans->sync_mr)
+		mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE);
+
+	/* If the MR was marked as invalidate, this will
+	 * trigger an async flush. */
+	if (zot_me)
+		rds_destroy_mr(mr);
+	rds_mr_put(mr);
 }

-void rds_rdma_free_op(struct rds_rdma_op *ro)
+void rds_rdma_free_op(struct rm_rdma_op *ro)
 {
 	unsigned int i;

-	for (i = 0; i < ro->r_nents; i++) {
-		struct page *page = sg_page(&ro->r_sg[i]);
+	for (i = 0; i < ro->op_nents; i++) {
+		struct page *page = sg_page(&ro->op_sg[i]);

 		/* Mark page dirty if it was possibly modified, which
 		 * is the case for a RDMA_READ which copies from remote
 		 * to local memory */
-		if (!ro->r_write) {
-			BUG_ON(in_interrupt());
+		if (!ro->op_write) {
+			BUG_ON(irqs_disabled());
 			set_page_dirty(page);
 		}
 		put_page(page);
 	}

-	kfree(ro->r_notifier);
-	kfree(ro);
+	kfree(ro->op_notifier);
+	ro->op_notifier = NULL;
+	ro->op_active = 0;
+}
+
+void rds_atomic_free_op(struct rm_atomic_op *ao)
+{
+	struct page *page = sg_page(ao->op_sg);
+
+	/* Mark page dirty if it was possibly modified, which
+	 * is the case for a RDMA_READ which copies from remote
+	 * to local memory */
+	set_page_dirty(page);
+	put_page(page);
+
+	kfree(ao->op_notifier);
+	ao->op_notifier = NULL;
+	ao->op_active = 0;
+}
+
+/*
+ * Count the number of pages needed to describe an incoming iovec.
+ */
+static int rds_rdma_pages(struct rds_rdma_args *args)
+{
+	struct rds_iovec vec;
+	struct rds_iovec __user *local_vec;
+	unsigned int tot_pages = 0;
+	unsigned int nr_pages;
+	unsigned int i;
+
+	local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
+
+	/* figure out the number of pages in the vector */
+	for (i = 0; i < args->nr_local; i++) {
+		if (copy_from_user(&vec, &local_vec[i], sizeof(struct rds_iovec)))
+			return -EFAULT;
+
+		nr_pages = rds_pages_in_vec(&vec);
+		if (nr_pages == 0)
+			return -EINVAL;
+
+		tot_pages += nr_pages;
+	}
+
+	return tot_pages;
+}
+
+int rds_rdma_extra_size(struct rds_rdma_args *args)
+{
+	return rds_rdma_pages(args) * sizeof(struct scatterlist);
 }
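rds_rdma_extra_size() lets the sendmsg path size the single message allocation before any cmsg is parsed: it walks the user's iovec and converts it into scatterlist bytes. A hedged sketch of how its result would feed rds_message_alloc(); the demo_ wrapper is illustrative and a negative return is only understood here to mean the iovec walk failed.

/* Sketch: size the message allocation up front so rds_cmsg_rdma_args()
 * can later take its scatterlist entries from the same kzalloc(). */
static struct rds_message *demo_alloc_for_rdma(struct rds_rdma_args *args, gfp_t gfp)
{
	int extra = rds_rdma_extra_size(args);

	if (extra < 0)
		return ERR_PTR(-EINVAL);	/* bad or unreadable iovec */

	return rds_message_alloc(extra, gfp);
}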
 /*
- * args is a pointer to an in-kernel copy in the sendmsg cmsg.
+ * The application asks for a RDMA transfer.
+ * Extract all arguments and set up the rdma_op
  */
-static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
-					    struct rds_rdma_args *args)
+int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
+		       struct cmsghdr *cmsg)
 {
+	struct rds_rdma_args *args;
 	struct rds_iovec vec;
-	struct rds_rdma_op *op = NULL;
+	struct rm_rdma_op *op = &rm->rdma;
 	unsigned int nr_pages;
-	unsigned int max_pages;
 	unsigned int nr_bytes;
 	struct page **pages = NULL;
 	struct rds_iovec __user *local_vec;
-	struct scatterlist *sg;
 	unsigned int nr;
 	unsigned int i, j;
-	int ret;
+	int ret = 0;
+
+	if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args))
+	    || rm->rdma.op_active)
+		return -EINVAL;
+
+	args = CMSG_DATA(cmsg);

 	if (rs->rs_bound_addr == 0) {
 		ret = -ENOTCONN; /* XXX not a great errno */
...
@@ -479,61 +546,38 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
 		goto out;
 	}

-	nr_pages = 0;
-	max_pages = 0;
-
-	local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
-
-	/* figure out the number of pages in the vector */
-	for (i = 0; i < args->nr_local; i++) {
-		if (copy_from_user(&vec, &local_vec[i],
-				   sizeof(struct rds_iovec))) {
-			ret = -EFAULT;
-			goto out;
-		}
-
-		nr = rds_pages_in_vec(&vec);
-		if (nr == 0) {
-			ret = -EINVAL;
-			goto out;
-		}
-
-		max_pages = max(nr, max_pages);
-		nr_pages += nr;
-	}
-
-	pages = kcalloc(max_pages, sizeof(struct page *), GFP_KERNEL);
-	if (pages == NULL) {
-		ret = -ENOMEM;
+	nr_pages = rds_rdma_pages(args);
+	if (nr_pages < 0)
 		goto out;
-	}

-	op = kzalloc(offsetof(struct rds_rdma_op, r_sg[nr_pages]), GFP_KERNEL);
-	if (op == NULL) {
+	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
+	if (!pages) {
 		ret = -ENOMEM;
 		goto out;
 	}

-	op->r_write = !!(args->flags & RDS_RDMA_READWRITE);
-	op->r_fence = !!(args->flags & RDS_RDMA_FENCE);
-	op->r_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
-	op->r_recverr = rs->rs_recverr;
+	op->op_write = !!(args->flags & RDS_RDMA_READWRITE);
+	op->op_fence = !!(args->flags & RDS_RDMA_FENCE);
+	op->op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
+	op->op_silent = !!(args->flags & RDS_RDMA_SILENT);
+	op->op_active = 1;
+	op->op_recverr = rs->rs_recverr;
 	WARN_ON(!nr_pages);
-	sg_init_table(op->r_sg, nr_pages);
+	op->op_sg = rds_message_alloc_sgs(rm, nr_pages);

-	if (op->r_notify || op->r_recverr) {
+	if (op->op_notify || op->op_recverr) {
 		/* We allocate an uninitialized notifier here, because
 		 * we don't want to do that in the completion handler. We
 		 * would have to use GFP_ATOMIC there, and don't want to deal
 		 * with failed allocations.
 		 */
-		op->r_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL);
-		if (!op->r_notifier) {
+		op->op_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL);
+		if (!op->op_notifier) {
 			ret = -ENOMEM;
 			goto out;
 		}
-		op->r_notifier->n_user_token = args->user_token;
-		op->r_notifier->n_status = RDS_RDMA_SUCCESS;
+		op->op_notifier->n_user_token = args->user_token;
+		op->op_notifier->n_status = RDS_RDMA_SUCCESS;
 	}

 	/* The cookie contains the R_Key of the remote memory region, and
...
@@ -543,15 +587,17 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
 	 * destination address (which is really an offset into the MR)
 	 * FIXME: We may want to move this into ib_rdma.c
 	 */
-	op->r_key = rds_rdma_cookie_key(args->cookie);
-	op->r_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie);
+	op->op_rkey = rds_rdma_cookie_key(args->cookie);
+	op->op_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie);

 	nr_bytes = 0;

 	rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x\n",
 		 (unsigned long long)args->nr_local,
 		 (unsigned long long)args->remote_vec.addr,
-		 op->r_key);
+		 op->op_rkey);
+
+	local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;

 	for (i = 0; i < args->nr_local; i++) {
 		if (copy_from_user(&vec, &local_vec[i],
...
@@ -569,15 +615,10 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
rs
->
rs_user_addr
=
vec
.
addr
;
rs
->
rs_user_bytes
=
vec
.
bytes
;
/* did the user change the vec under us? */
if
(
nr
>
max_pages
||
op
->
r_nents
+
nr
>
nr_pages
)
{
ret
=
-
EINVAL
;
goto
out
;
}
/* If it's a WRITE operation, we want to pin the pages for reading.
* If it's a READ operation, we need to pin the pages for writing.
*/
ret
=
rds_pin_pages
(
vec
.
addr
&
PAGE_MASK
,
nr
,
pages
,
!
op
->
r
_write
);
ret
=
rds_pin_pages
(
vec
.
addr
,
nr
,
pages
,
!
op
->
op
_write
);
if
(
ret
<
0
)
goto
out
;
...
...
@@ -588,8 +629,9 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
for
(
j
=
0
;
j
<
nr
;
j
++
)
{
unsigned
int
offset
=
vec
.
addr
&
~
PAGE_MASK
;
struct
scatterlist
*
sg
;
sg
=
&
op
->
r_sg
[
op
->
r
_nents
+
j
];
sg
=
&
op
->
op_sg
[
op
->
op
_nents
+
j
];
sg_set_page
(
sg
,
pages
[
j
],
min_t
(
unsigned
int
,
vec
.
bytes
,
PAGE_SIZE
-
offset
),
offset
);
...
...
@@ -601,10 +643,9 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
vec
.
bytes
-=
sg
->
length
;
}
op
->
r
_nents
+=
nr
;
op
->
op
_nents
+=
nr
;
}
if
(
nr_bytes
>
args
->
remote_vec
.
bytes
)
{
rdsdebug
(
"RDS nr_bytes %u remote_bytes %u do not match
\n
"
,
nr_bytes
,
...
...
@@ -612,38 +653,17 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
ret
=
-
EINVAL
;
goto
out
;
}
op
->
r
_bytes
=
nr_bytes
;
op
->
op
_bytes
=
nr_bytes
;
ret
=
0
;
out:
kfree
(
pages
);
if
(
ret
)
{
if
(
op
)
rds_rdma_free_op
(
op
);
op
=
ERR_PTR
(
ret
);
}
return
op
;
}
/*
* The application asks for a RDMA transfer.
* Extract all arguments and set up the rdma_op
*/
int
rds_cmsg_rdma_args
(
struct
rds_sock
*
rs
,
struct
rds_message
*
rm
,
struct
cmsghdr
*
cmsg
)
{
struct
rds_rdma_op
*
op
;
if
(
cmsg
->
cmsg_len
<
CMSG_LEN
(
sizeof
(
struct
rds_rdma_args
))
||
rm
->
m_rdma_op
!=
NULL
)
return
-
EINVAL
;
if
(
ret
)
rds_rdma_free_op
(
op
);
op
=
rds_rdma_prepare
(
rs
,
CMSG_DATA
(
cmsg
));
if
(
IS_ERR
(
op
))
return
PTR_ERR
(
op
);
rds_stats_inc
(
s_send_rdma
);
rm
->
m_rdma_op
=
op
;
return
0
;
return
ret
;
}
/*
...
...
@@ -673,7 +693,7 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
spin_lock_irqsave
(
&
rs
->
rs_rdma_lock
,
flags
);
mr
=
rds_mr_tree_walk
(
&
rs
->
rs_rdma_keys
,
r_key
,
NULL
);
if
(
mr
==
NULL
)
if
(
!
mr
)
err
=
-
EINVAL
;
/* invalid r_key */
else
atomic_inc
(
&
mr
->
r_refcount
);
...
...
@@ -681,7 +701,7 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
if
(
mr
)
{
mr
->
r_trans
->
sync_mr
(
mr
->
r_trans_private
,
DMA_TO_DEVICE
);
rm
->
m
_rdma_mr
=
mr
;
rm
->
rdma
.
op
_rdma_mr
=
mr
;
}
return
err
;
}
...
...
@@ -699,5 +719,98 @@ int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
rm
->
m_rdma_cookie
!=
0
)
return
-
EINVAL
;
return
__rds_rdma_map
(
rs
,
CMSG_DATA
(
cmsg
),
&
rm
->
m_rdma_cookie
,
&
rm
->
m_rdma_mr
);
return
__rds_rdma_map
(
rs
,
CMSG_DATA
(
cmsg
),
&
rm
->
m_rdma_cookie
,
&
rm
->
rdma
.
op_rdma_mr
);
}
/*
* Fill in rds_message for an atomic request.
*/
int
rds_cmsg_atomic
(
struct
rds_sock
*
rs
,
struct
rds_message
*
rm
,
struct
cmsghdr
*
cmsg
)
{
struct
page
*
page
=
NULL
;
struct
rds_atomic_args
*
args
;
int
ret
=
0
;
if
(
cmsg
->
cmsg_len
<
CMSG_LEN
(
sizeof
(
struct
rds_atomic_args
))
||
rm
->
atomic
.
op_active
)
return
-
EINVAL
;
args
=
CMSG_DATA
(
cmsg
);
/* Nonmasked & masked cmsg ops converted to masked hw ops */
switch
(
cmsg
->
cmsg_type
)
{
case
RDS_CMSG_ATOMIC_FADD
:
rm
->
atomic
.
op_type
=
RDS_ATOMIC_TYPE_FADD
;
rm
->
atomic
.
op_m_fadd
.
add
=
args
->
fadd
.
add
;
rm
->
atomic
.
op_m_fadd
.
nocarry_mask
=
0
;
break
;
case
RDS_CMSG_MASKED_ATOMIC_FADD
:
rm
->
atomic
.
op_type
=
RDS_ATOMIC_TYPE_FADD
;
rm
->
atomic
.
op_m_fadd
.
add
=
args
->
m_fadd
.
add
;
rm
->
atomic
.
op_m_fadd
.
nocarry_mask
=
args
->
m_fadd
.
nocarry_mask
;
break
;
case
RDS_CMSG_ATOMIC_CSWP
:
rm
->
atomic
.
op_type
=
RDS_ATOMIC_TYPE_CSWP
;
rm
->
atomic
.
op_m_cswp
.
compare
=
args
->
cswp
.
compare
;
rm
->
atomic
.
op_m_cswp
.
swap
=
args
->
cswp
.
swap
;
rm
->
atomic
.
op_m_cswp
.
compare_mask
=
~
0
;
rm
->
atomic
.
op_m_cswp
.
swap_mask
=
~
0
;
break
;
case
RDS_CMSG_MASKED_ATOMIC_CSWP
:
rm
->
atomic
.
op_type
=
RDS_ATOMIC_TYPE_CSWP
;
rm
->
atomic
.
op_m_cswp
.
compare
=
args
->
m_cswp
.
compare
;
rm
->
atomic
.
op_m_cswp
.
swap
=
args
->
m_cswp
.
swap
;
rm
->
atomic
.
op_m_cswp
.
compare_mask
=
args
->
m_cswp
.
compare_mask
;
rm
->
atomic
.
op_m_cswp
.
swap_mask
=
args
->
m_cswp
.
swap_mask
;
break
;
default:
BUG
();
/* should never happen */
}
rm
->
atomic
.
op_notify
=
!!
(
args
->
flags
&
RDS_RDMA_NOTIFY_ME
);
rm
->
atomic
.
op_silent
=
!!
(
args
->
flags
&
RDS_RDMA_SILENT
);
rm
->
atomic
.
op_active
=
1
;
rm
->
atomic
.
op_recverr
=
rs
->
rs_recverr
;
rm
->
atomic
.
op_sg
=
rds_message_alloc_sgs
(
rm
,
1
);
/* verify 8 byte-aligned */
if
(
args
->
local_addr
&
0x7
)
{
ret
=
-
EFAULT
;
goto
err
;
}
ret
=
rds_pin_pages
(
args
->
local_addr
,
1
,
&
page
,
1
);
if
(
ret
!=
1
)
goto
err
;
ret
=
0
;
sg_set_page
(
rm
->
atomic
.
op_sg
,
page
,
8
,
offset_in_page
(
args
->
local_addr
));
if
(
rm
->
atomic
.
op_notify
||
rm
->
atomic
.
op_recverr
)
{
/* We allocate an uninitialized notifier here, because
* we don't want to do that in the completion handler. We
* would have to use GFP_ATOMIC there, and don't want to deal
* with failed allocations.
*/
rm
->
atomic
.
op_notifier
=
kmalloc
(
sizeof
(
*
rm
->
atomic
.
op_notifier
),
GFP_KERNEL
);
if
(
!
rm
->
atomic
.
op_notifier
)
{
ret
=
-
ENOMEM
;
goto
err
;
}
rm
->
atomic
.
op_notifier
->
n_user_token
=
args
->
user_token
;
rm
->
atomic
.
op_notifier
->
n_status
=
RDS_RDMA_SUCCESS
;
}
rm
->
atomic
.
op_rkey
=
rds_rdma_cookie_key
(
args
->
cookie
);
rm
->
atomic
.
op_remote_addr
=
args
->
remote_addr
+
rds_rdma_cookie_offset
(
args
->
cookie
);
return
ret
;
err:
if
(
page
)
put_page
(
page
);
kfree
(
rm
->
atomic
.
op_notifier
);
return
ret
;
}
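For illustration only (not part of this commit): a minimal userspace sketch of issuing a masked fetch-and-add through the new RDS_CMSG_MASKED_ATOMIC_FADD control message handled by rds_cmsg_atomic() above. The field names follow the kernel-side accesses (args->m_fadd.add, args->m_fadd.nocarry_mask, args->cookie, args->local_addr, args->remote_addr, args->flags, args->user_token); the exact struct rds_atomic_args layout and the RDS socket family constant are assumed to come from include/linux/rds.h on a tree containing this merge and may differ elsewhere.

/* Hedged sketch: send one masked fetch-and-add over an already-bound RDS socket. */
#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/rds.h>

static int rds_masked_fadd(int fd, struct sockaddr_in *dest,
                           rds_rdma_cookie_t cookie, uint64_t remote_off,
                           uint64_t *local_result, uint64_t add,
                           uint64_t nocarry_mask)
{
    struct rds_atomic_args args;        /* assumed layout, see include/linux/rds.h */
    struct msghdr msg;
    struct cmsghdr *cmsg;
    char cbuf[CMSG_SPACE(sizeof(args))];

    memset(&args, 0, sizeof(args));
    args.cookie = cookie;               /* R_Key + offset of the remote MR */
    args.remote_addr = remote_off;      /* offset within the registered region */
    args.local_addr = (uint64_t)(unsigned long)local_result; /* must be 8-byte aligned */
    args.m_fadd.add = add;
    args.m_fadd.nocarry_mask = nocarry_mask; /* bits across which carries must not propagate */
    args.flags = RDS_RDMA_NOTIFY_ME;    /* request a completion notification */
    args.user_token = 42;               /* echoed back via RDS_CMSG_RDMA_STATUS */

    memset(&msg, 0, sizeof(msg));
    msg.msg_name = dest;
    msg.msg_namelen = sizeof(*dest);
    msg.msg_control = cbuf;
    msg.msg_controllen = sizeof(cbuf);

    cmsg = CMSG_FIRSTHDR(&msg);
    cmsg->cmsg_level = SOL_RDS;
    cmsg->cmsg_type = RDS_CMSG_MASKED_ATOMIC_FADD;
    cmsg->cmsg_len = CMSG_LEN(sizeof(args));
    memcpy(CMSG_DATA(cmsg), &args, sizeof(args));
    msg.msg_controllen = cmsg->cmsg_len;

    /* A 0-byte payload is allowed; rds_send_xmit() still emits the header. */
    return sendmsg(fd, &msg, 0);
}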
net/rds/rdma.h deleted 100644 → 0 View file @ f27e21a8
#ifndef _RDS_RDMA_H
#define _RDS_RDMA_H

#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/scatterlist.h>

#include "rds.h"

struct rds_mr {
	struct rb_node		r_rb_node;
	atomic_t		r_refcount;
	u32			r_key;

	/* A copy of the creation flags */
	unsigned int		r_use_once:1;
	unsigned int		r_invalidate:1;
	unsigned int		r_write:1;

	/* This is for RDS_MR_DEAD.
	 * It would be nice & consistent to make this part of the above
	 * bit field here, but we need to use test_and_set_bit.
	 */
	unsigned long		r_state;
	struct rds_sock		*r_sock; /* back pointer to the socket that owns us */
	struct rds_transport	*r_trans;
	void			*r_trans_private;
};

/* Flags for mr->r_state */
#define RDS_MR_DEAD		0

struct rds_rdma_op {
	u32			r_key;
	u64			r_remote_addr;
	unsigned int		r_write:1;
	unsigned int		r_fence:1;
	unsigned int		r_notify:1;
	unsigned int		r_recverr:1;
	unsigned int		r_mapped:1;
	struct rds_notifier	*r_notifier;
	unsigned int		r_bytes;
	unsigned int		r_nents;
	unsigned int		r_count;
	struct scatterlist	r_sg[0];
};

static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset)
{
	return r_key | (((u64) offset) << 32);
}

static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie)
{
	return cookie;
}

static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
{
	return cookie >> 32;
}

int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen);
int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen);
int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen);
void rds_rdma_drop_keys(struct rds_sock *rs);
int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
		       struct cmsghdr *cmsg);
int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
		       struct cmsghdr *cmsg);
int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
		       struct cmsghdr *cmsg);
int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
		      struct cmsghdr *cmsg);
void rds_rdma_free_op(struct rds_rdma_op *ro);
void rds_rdma_send_complete(struct rds_message *rm, int);

extern void __rds_put_mr_final(struct rds_mr *mr);
static inline void rds_mr_put(struct rds_mr *mr)
{
	if (atomic_dec_and_test(&mr->r_refcount))
		__rds_put_mr_final(mr);
}

#endif
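For illustration only (not part of this commit): the cookie helpers removed above (and re-added to rds.h below) pack the 32-bit R_Key into the low half and the offset into the high half of the 64-bit rds_rdma_cookie_t carried in the RDS header. A standalone round-trip check of that layout, written as a hedged userspace sketch:

/* Sketch: mirrors rds_rdma_make_cookie()/rds_rdma_cookie_key()/rds_rdma_cookie_offset(). */
#include <assert.h>
#include <stdint.h>

typedef uint64_t rds_rdma_cookie_t;

static rds_rdma_cookie_t make_cookie(uint32_t r_key, uint32_t offset)
{
    return r_key | (((uint64_t) offset) << 32);   /* key low, offset high */
}

int main(void)
{
    rds_rdma_cookie_t c = make_cookie(0xdeadbeef, 0x1000);

    assert((uint32_t)c == 0xdeadbeef);            /* rds_rdma_cookie_key()    */
    assert((uint32_t)(c >> 32) == 0x1000);        /* rds_rdma_cookie_offset() */
    return 0;
}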
net/rds/rdma_transport.c View file @ cf0ac2b8
...
@@ -36,6 +36,34 @@

static struct rdma_cm_id *rds_rdma_listen_id;

+static char *rds_cm_event_strings[] = {
+#define RDS_CM_EVENT_STRING(foo) \
+		[RDMA_CM_EVENT_##foo] = __stringify(RDMA_CM_EVENT_##foo)
+	RDS_CM_EVENT_STRING(ADDR_RESOLVED),
+	RDS_CM_EVENT_STRING(ADDR_ERROR),
+	RDS_CM_EVENT_STRING(ROUTE_RESOLVED),
+	RDS_CM_EVENT_STRING(ROUTE_ERROR),
+	RDS_CM_EVENT_STRING(CONNECT_REQUEST),
+	RDS_CM_EVENT_STRING(CONNECT_RESPONSE),
+	RDS_CM_EVENT_STRING(CONNECT_ERROR),
+	RDS_CM_EVENT_STRING(UNREACHABLE),
+	RDS_CM_EVENT_STRING(REJECTED),
+	RDS_CM_EVENT_STRING(ESTABLISHED),
+	RDS_CM_EVENT_STRING(DISCONNECTED),
+	RDS_CM_EVENT_STRING(DEVICE_REMOVAL),
+	RDS_CM_EVENT_STRING(MULTICAST_JOIN),
+	RDS_CM_EVENT_STRING(MULTICAST_ERROR),
+	RDS_CM_EVENT_STRING(ADDR_CHANGE),
+	RDS_CM_EVENT_STRING(TIMEWAIT_EXIT),
+#undef RDS_CM_EVENT_STRING
+};
+
+static char *rds_cm_event_str(enum rdma_cm_event_type type)
+{
+	return rds_str_array(rds_cm_event_strings,
+			     ARRAY_SIZE(rds_cm_event_strings), type);
+};
+
int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
			      struct rdma_cm_event *event)
{
...
@@ -44,8 +72,8 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
	struct rds_transport *trans;
	int ret = 0;

-	rdsdebug("conn %p id %p handling event %u\n", conn, cm_id,
-		 event->event);
+	rdsdebug("conn %p id %p handling event %u (%s)\n", conn, cm_id,
+		 event->event, rds_cm_event_str(event->event));

	if (cm_id->device->node_type == RDMA_NODE_RNIC)
		trans = &rds_iw_transport;
...
@@ -109,7 +137,8 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
	default:
		/* things like device disconnect? */
-		printk(KERN_ERR "RDS: unknown event %u!\n", event->event);
+		printk(KERN_ERR "RDS: unknown event %u (%s)!\n",
+		       event->event, rds_cm_event_str(event->event));
		break;
	}
...
@@ -117,12 +146,13 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
	if (conn)
		mutex_unlock(&conn->c_cm_lock);

-	rdsdebug("id %p event %u handling ret %d\n", cm_id, event->event, ret);
+	rdsdebug("id %p event %u (%s) handling ret %d\n", cm_id, event->event,
+		 rds_cm_event_str(event->event), ret);

	return ret;
}

-static int __init rds_rdma_listen_init(void)
+static int rds_rdma_listen_init(void)
{
	struct sockaddr_in sin;
	struct rdma_cm_id *cm_id;
...
@@ -177,7 +207,7 @@ static void rds_rdma_listen_stop(void)
	}
}

-int __init rds_rdma_init(void)
+int rds_rdma_init(void)
{
	int ret;
...
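For illustration only (not part of this commit): the new event-name table above uses designated initializers plus __stringify, and rds_str_array() (declared in rds.h below) is presumably expected to fall back gracefully for gaps or out-of-range values. A minimal userspace equivalent of that lookup idiom, with hypothetical names:

/* Sketch of the __stringify-indexed string-table pattern used above. */
#include <stdio.h>

#define STR(x) #x
#define EVENT_STRING(foo) [foo] = STR(foo)

enum demo_event { EV_CONNECTED = 0, EV_REJECTED = 2 };   /* note the gap at index 1 */

static const char *demo_event_strings[] = {
    EVENT_STRING(EV_CONNECTED),
    EVENT_STRING(EV_REJECTED),
};

static const char *demo_event_str(unsigned int ev)
{
    if (ev >= sizeof(demo_event_strings) / sizeof(demo_event_strings[0]) ||
        !demo_event_strings[ev])
        return "unknown";
    return demo_event_strings[ev];
}

int main(void)
{
    /* Prints: EV_CONNECTED unknown unknown */
    printf("%s %s %s\n", demo_event_str(0), demo_event_str(1), demo_event_str(5));
    return 0;
}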
net/rds/rds.h View file @ cf0ac2b8
...
@@ -80,6 +80,7 @@ enum {
/* Bits for c_flags */
#define RDS_LL_SEND_FULL	0
#define RDS_RECONNECT_PENDING	1
+#define RDS_IN_XMIT		2

struct rds_connection {
	struct hlist_node	c_hash_node;
...
@@ -91,12 +92,13 @@ struct rds_connection {
	struct rds_cong_map	*c_lcong;
	struct rds_cong_map	*c_fcong;

-	struct mutex		c_send_lock;	/* protect send ring */
	struct rds_message	*c_xmit_rm;
	unsigned long		c_xmit_sg;
	unsigned int		c_xmit_hdr_off;
	unsigned int		c_xmit_data_off;
+	unsigned int		c_xmit_atomic_sent;
	unsigned int		c_xmit_rdma_sent;
+	unsigned int		c_xmit_data_sent;

	spinlock_t		c_lock;		/* protect msg queues */
	u64			c_next_tx_seq;
...
@@ -116,11 +118,10 @@ struct rds_connection {
	struct delayed_work	c_conn_w;
	struct work_struct	c_down_w;
	struct mutex		c_cm_lock;	/* protect conn state & cm */
+	wait_queue_head_t	c_waitq;

	struct list_head	c_map_item;
	unsigned long		c_map_queued;
-	unsigned long		c_map_offset;
-	unsigned long		c_map_bytes;

	unsigned int		c_unacked_packets;
	unsigned int		c_unacked_bytes;
...
@@ -206,6 +207,48 @@ struct rds_incoming {
	rds_rdma_cookie_t	i_rdma_cookie;
};

+struct rds_mr {
+	struct rb_node		r_rb_node;
+	atomic_t		r_refcount;
+	u32			r_key;
+
+	/* A copy of the creation flags */
+	unsigned int		r_use_once:1;
+	unsigned int		r_invalidate:1;
+	unsigned int		r_write:1;
+
+	/* This is for RDS_MR_DEAD.
+	 * It would be nice & consistent to make this part of the above
+	 * bit field here, but we need to use test_and_set_bit.
+	 */
+	unsigned long		r_state;
+	struct rds_sock		*r_sock; /* back pointer to the socket that owns us */
+	struct rds_transport	*r_trans;
+	void			*r_trans_private;
+};
+
+/* Flags for mr->r_state */
+#define RDS_MR_DEAD		0
+
+static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset)
+{
+	return r_key | (((u64) offset) << 32);
+}
+
+static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie)
+{
+	return cookie;
+}
+
+static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
+{
+	return cookie >> 32;
+}
+
+/* atomic operation types */
+#define RDS_ATOMIC_TYPE_CSWP		0
+#define RDS_ATOMIC_TYPE_FADD		1
+
/*
 * m_sock_item and m_conn_item are on lists that are serialized under
 * conn->c_lock.  m_sock_item has additional meaning in that once it is empty
...
@@ -258,13 +301,71 @@ struct rds_message {
	 *     -> rs->rs_lock
	 */
	spinlock_t		m_rs_lock;
+	wait_queue_head_t	m_flush_wait;
+
	struct rds_sock		*m_rs;
-	struct rds_rdma_op	*m_rdma_op;
+
+	/* cookie to send to remote, in rds header */
	rds_rdma_cookie_t	m_rdma_cookie;
-	struct rds_mr		*m_rdma_mr;
-	unsigned int		m_nents;
-	unsigned int		m_count;
-	struct scatterlist	m_sg[0];
+
+	unsigned int		m_used_sgs;
+	unsigned int		m_total_sgs;
+
+	void			*m_final_op;
+
+	struct {
+		struct rm_atomic_op {
+			int			op_type;
+			union {
+				struct {
+					uint64_t	compare;
+					uint64_t	swap;
+					uint64_t	compare_mask;
+					uint64_t	swap_mask;
+				} op_m_cswp;
+				struct {
+					uint64_t	add;
+					uint64_t	nocarry_mask;
+				} op_m_fadd;
+			};
+
+			u32			op_rkey;
+			u64			op_remote_addr;
+			unsigned int		op_notify:1;
+			unsigned int		op_recverr:1;
+			unsigned int		op_mapped:1;
+			unsigned int		op_silent:1;
+			unsigned int		op_active:1;
+			struct scatterlist	*op_sg;
+			struct rds_notifier	*op_notifier;
+
+			struct rds_mr		*op_rdma_mr;
+		} atomic;
+		struct rm_rdma_op {
+			u32			op_rkey;
+			u64			op_remote_addr;
+			unsigned int		op_write:1;
+			unsigned int		op_fence:1;
+			unsigned int		op_notify:1;
+			unsigned int		op_recverr:1;
+			unsigned int		op_mapped:1;
+			unsigned int		op_silent:1;
+			unsigned int		op_active:1;
+			unsigned int		op_bytes;
+			unsigned int		op_nents;
+			unsigned int		op_count;
+			struct scatterlist	*op_sg;
+			struct rds_notifier	*op_notifier;
+
+			struct rds_mr		*op_rdma_mr;
+		} rdma;
+		struct rm_data_op {
+			unsigned int		op_active:1;
+			unsigned int		op_nents;
+			unsigned int		op_count;
+			struct scatterlist	*op_sg;
+		} data;
+	};
};

/*
...
@@ -305,10 +406,6 @@ struct rds_notifier {
 *                 transport is responsible for other serialization, including
 *                 rds_recv_incoming().  This is called in process context but
 *                 should try hard not to block.
- *
- * @xmit_cong_map: This asks the transport to send the local bitmap down the
- * 		   given connection. XXX get a better story about the bitmap
- * 		   flag and header.
 */

#define RDS_TRANS_IB	0
...
@@ -332,13 +429,11 @@ struct rds_transport {
	void (*xmit_complete)(struct rds_connection *conn);
	int (*xmit)(struct rds_connection *conn, struct rds_message *rm,
		    unsigned int hdr_off, unsigned int sg, unsigned int off);
-	int (*xmit_cong_map)(struct rds_connection *conn,
-			     struct rds_cong_map *map, unsigned long offset);
-	int (*xmit_rdma)(struct rds_connection *conn, struct rds_rdma_op *op);
+	int (*xmit_rdma)(struct rds_connection *conn, struct rm_rdma_op *op);
+	int (*xmit_atomic)(struct rds_connection *conn, struct rm_atomic_op *op);
	int (*recv)(struct rds_connection *conn);
	int (*inc_copy_to_user)(struct rds_incoming *inc, struct iovec *iov,
				size_t size);
-	void (*inc_purge)(struct rds_incoming *inc);
	void (*inc_free)(struct rds_incoming *inc);

	int (*cm_handle_connect)(struct rdma_cm_id *cm_id,
...
@@ -367,17 +462,11 @@ struct rds_sock {
	 * bound_addr used for both incoming and outgoing, no INADDR_ANY
	 * support.
	 */
-	struct rb_node		rs_bound_node;
+	struct hlist_node	rs_bound_node;
	__be32			rs_bound_addr;
	__be32			rs_conn_addr;
	__be16			rs_bound_port;
	__be16			rs_conn_port;
-
-	/*
-	 * This is only used to communicate the transport between bind and
-	 * initiating connections. All other trans use is referenced through
-	 * the connection.
-	 */
	struct rds_transport    *rs_transport;

	/*
...
@@ -466,8 +555,8 @@ struct rds_statistics {
	uint64_t	s_recv_ping;
	uint64_t	s_send_queue_empty;
	uint64_t	s_send_queue_full;
-	uint64_t	s_send_sem_contention;
-	uint64_t	s_send_sem_queue_raced;
+	uint64_t	s_send_lock_contention;
+	uint64_t	s_send_lock_queue_raced;
	uint64_t	s_send_immediate_retry;
	uint64_t	s_send_delayed_retry;
	uint64_t	s_send_drop_acked;
...
@@ -487,6 +576,7 @@ struct rds_statistics {
};

/* af_rds.c */
+char *rds_str_array(char **array, size_t elements, size_t index);
void rds_sock_addref(struct rds_sock *rs);
void rds_sock_put(struct rds_sock *rs);
void rds_wake_sk_sleep(struct rds_sock *rs);
...
@@ -521,15 +611,17 @@ void rds_cong_exit(void);
struct rds_message *rds_cong_update_alloc(struct rds_connection *conn);

/* conn.c */
-int __init rds_conn_init(void);
+int rds_conn_init(void);
void rds_conn_exit(void);
struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
				       struct rds_transport *trans, gfp_t gfp);
struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
			       struct rds_transport *trans, gfp_t gfp);
+void rds_conn_shutdown(struct rds_connection *conn);
void rds_conn_destroy(struct rds_connection *conn);
void rds_conn_reset(struct rds_connection *conn);
void rds_conn_drop(struct rds_connection *conn);
+void rds_conn_connect_if_down(struct rds_connection *conn);
void rds_for_each_conn_info(struct socket *sock, unsigned int len,
			  struct rds_info_iterator *iter,
			  struct rds_info_lengths *lens,
...
@@ -566,7 +658,8 @@ rds_conn_connecting(struct rds_connection *conn)

/* message.c */
struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp);
-struct rds_message *rds_message_copy_from_user(struct iovec *first_iov,
+struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents);
+int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov,
					       size_t total_len);
struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len);
void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
...
@@ -580,7 +673,6 @@ int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *vers
int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset);
int rds_message_inc_copy_to_user(struct rds_incoming *inc,
				 struct iovec *first_iov, size_t size);
-void rds_message_inc_purge(struct rds_incoming *inc);
void rds_message_inc_free(struct rds_incoming *inc);
void rds_message_addref(struct rds_message *rm);
void rds_message_put(struct rds_message *rm);
...
@@ -636,14 +728,39 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest);
typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack);
void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
			 is_acked_func is_acked);
-int rds_send_acked_before(struct rds_connection *conn, u64 seq);
void rds_send_remove_from_sock(struct list_head *messages, int status);
int rds_send_pong(struct rds_connection *conn, __be16 dport);
struct rds_message *rds_send_get_message(struct rds_connection *,
-					 struct rds_rdma_op *);
+					 struct rm_rdma_op *);

/* rdma.c */
void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force);
+int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen);
+int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen);
+int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen);
+void rds_rdma_drop_keys(struct rds_sock *rs);
+int rds_rdma_extra_size(struct rds_rdma_args *args);
+int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
+			  struct cmsghdr *cmsg);
+int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
+			  struct cmsghdr *cmsg);
+int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
+			  struct cmsghdr *cmsg);
+int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
+			  struct cmsghdr *cmsg);
+void rds_rdma_free_op(struct rm_rdma_op *ro);
+void rds_atomic_free_op(struct rm_atomic_op *ao);
+void rds_rdma_send_complete(struct rds_message *rm, int wc_status);
+void rds_atomic_send_complete(struct rds_message *rm, int wc_status);
+int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
+		    struct cmsghdr *cmsg);
+
+extern void __rds_put_mr_final(struct rds_mr *mr);
+static inline void rds_mr_put(struct rds_mr *mr)
+{
+	if (atomic_dec_and_test(&mr->r_refcount))
+		__rds_put_mr_final(mr);
+}

/* stats.c */
DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
...
@@ -657,14 +774,14 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
	put_cpu();				\
} while (0)
#define rds_stats_add(member, count) rds_stats_add_which(rds_stats, member, count)
-int __init rds_stats_init(void);
+int rds_stats_init(void);
void rds_stats_exit(void);
void rds_stats_info_copy(struct rds_info_iterator *iter,
			 uint64_t *values, const char *const *names, size_t nr);

/* sysctl.c */
-int __init rds_sysctl_init(void);
+int rds_sysctl_init(void);
void rds_sysctl_exit(void);
extern unsigned long rds_sysctl_sndbuf_min;
extern unsigned long rds_sysctl_sndbuf_default;
...
@@ -678,9 +795,10 @@ extern unsigned long rds_sysctl_trace_flags;
extern unsigned int  rds_sysctl_trace_level;

/* threads.c */
-int __init rds_threads_init(void);
+int rds_threads_init(void);
void rds_threads_exit(void);
extern struct workqueue_struct *rds_wq;
+void rds_queue_reconnect(struct rds_connection *conn);
void rds_connect_worker(struct work_struct *);
void rds_shutdown_worker(struct work_struct *);
void rds_send_worker(struct work_struct *);
...
@@ -691,9 +809,10 @@ void rds_connect_complete(struct rds_connection *conn);
int rds_trans_register(struct rds_transport *trans);
void rds_trans_unregister(struct rds_transport *trans);
struct rds_transport *rds_trans_get_preferred(__be32 addr);
+void rds_trans_put(struct rds_transport *trans);
unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
				       unsigned int avail);
-int __init rds_trans_init(void);
+int rds_trans_init(void);
void rds_trans_exit(void);

#endif
net/rds/recv.c View file @ cf0ac2b8
...
@@ -36,7 +36,6 @@
#include <linux/in.h>

#include "rds.h"
-#include "rdma.h"

void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
		  __be32 saddr)
...
@@ -210,7 +209,7 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
	}

	rs = rds_find_bound(daddr, inc->i_hdr.h_dport);
-	if (rs == NULL) {
+	if (!rs) {
		rds_stats_inc(s_recv_drop_no_sock);
		goto out;
	}
...
@@ -251,7 +250,7 @@ static int rds_next_incoming(struct rds_sock *rs, struct rds_incoming **inc)
{
	unsigned long flags;

-	if (*inc == NULL) {
+	if (!*inc) {
		read_lock_irqsave(&rs->rs_recv_lock, flags);
		if (!list_empty(&rs->rs_recv_queue)) {
			*inc = list_entry(rs->rs_recv_queue.next,
...
@@ -334,10 +333,10 @@ int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr)

		if (msghdr) {
			cmsg.user_token = notifier->n_user_token;
-			cmsg.status  = notifier->n_status;
+			cmsg.status = notifier->n_status;

			err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS,
-					sizeof(cmsg), &cmsg);
+				       sizeof(cmsg), &cmsg);
			if (err)
				break;
		}
...
net/rds/send.c View file @ cf0ac2b8
...
@@ -37,7 +37,6 @@
#include <linux/list.h>

#include "rds.h"
-#include "rdma.h"

/* When transmitting messages in rds_send_xmit, we need to emerge from
 * time to time and briefly release the CPU. Otherwise the softlock watchdog
...
@@ -54,7 +53,8 @@ module_param(send_batch_count, int, 0444);
MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue");

/*
- * Reset the send state. Caller must hold c_send_lock when calling here.
+ * Reset the send state.  Callers must ensure that this doesn't race with
+ * rds_send_xmit().
 */
void rds_send_reset(struct rds_connection *conn)
{
...
@@ -62,18 +62,22 @@ void rds_send_reset(struct rds_connection *conn)
	unsigned long flags;

	if (conn->c_xmit_rm) {
+		rm = conn->c_xmit_rm;
+		conn->c_xmit_rm = NULL;
		/* Tell the user the RDMA op is no longer mapped by the
		 * transport. This isn't entirely true (it's flushed out
		 * independently) but as the connection is down, there's
		 * no ongoing RDMA to/from that memory */
-		rds_message_unmapped(conn->c_xmit_rm);
-		rds_message_put(conn->c_xmit_rm);
-		conn->c_xmit_rm = NULL;
+		rds_message_unmapped(rm);
+		rds_message_put(rm);
	}
+
	conn->c_xmit_sg = 0;
	conn->c_xmit_hdr_off = 0;
	conn->c_xmit_data_off = 0;
+	conn->c_xmit_atomic_sent = 0;
	conn->c_xmit_rdma_sent = 0;
+	conn->c_xmit_data_sent = 0;

	conn->c_map_queued = 0;
...
@@ -90,6 +94,25 @@ void rds_send_reset(struct rds_connection *conn)
	spin_unlock_irqrestore(&conn->c_lock, flags);
}

+static int acquire_in_xmit(struct rds_connection *conn)
+{
+	return test_and_set_bit(RDS_IN_XMIT, &conn->c_flags) == 0;
+}
+
+static void release_in_xmit(struct rds_connection *conn)
+{
+	clear_bit(RDS_IN_XMIT, &conn->c_flags);
+	smp_mb__after_clear_bit();
+	/*
+	 * We don't use wait_on_bit()/wake_up_bit() because our waking is in a
+	 * hot path and finding waiters is very rare.  We don't want to walk
+	 * the system-wide hashed waitqueue buckets in the fast path only to
+	 * almost never find waiters.
+	 */
+	if (waitqueue_active(&conn->c_waitq))
+		wake_up_all(&conn->c_waitq);
+}
+
/*
 * We're making the concious trade-off here to only send one message
 * down the connection at a time.
...
@@ -109,102 +132,69 @@ int rds_send_xmit(struct rds_connection *conn)
	struct rds_message *rm;
	unsigned long flags;
	unsigned int tmp;
-	unsigned int send_quota = send_batch_count;
	struct scatterlist *sg;
	int ret = 0;
-	int was_empty = 0;
	LIST_HEAD(to_be_dropped);

+restart:
+
	/*
	 * sendmsg calls here after having queued its message on the send
	 * queue.  We only have one task feeding the connection at a time.  If
	 * another thread is already feeding the queue then we back off.  This
	 * avoids blocking the caller and trading per-connection data between
	 * caches per message.
-	 *
-	 * The sem holder will issue a retry if they notice that someone queued
-	 * a message after they stopped walking the send queue but before they
-	 * dropped the sem.
	 */
-	if (!mutex_trylock(&conn->c_send_lock)) {
-		rds_stats_inc(s_send_sem_contention);
+	if (!acquire_in_xmit(conn)) {
+		rds_stats_inc(s_send_lock_contention);
		ret = -ENOMEM;
		goto out;
	}

+	/*
+	 * rds_conn_shutdown() sets the conn state and then tests RDS_IN_XMIT,
+	 * we do the opposite to avoid races.
+	 */
+	if (!rds_conn_up(conn)) {
+		release_in_xmit(conn);
+		ret = 0;
+		goto out;
+	}
+
	if (conn->c_trans->xmit_prepare)
		conn->c_trans->xmit_prepare(conn);

	/*
	 * spin trying to push headers and data down the connection until
-	 * the connection doens't make forward progress.
+	 * the connection doesn't make forward progress.
	 */
-	while (--send_quota) {
-		/*
-		 * See if need to send a congestion map update if we're
-		 * between sending messages.  The send_sem protects our sole
-		 * use of c_map_offset and _bytes.
-		 * Note this is used only by transports that define a special
-		 * xmit_cong_map function. For all others, we create allocate
-		 * a cong_map message and treat it just like any other send.
-		 */
-		if (conn->c_map_bytes) {
-			ret = conn->c_trans->xmit_cong_map(conn, conn->c_lcong,
-						conn->c_map_offset);
-			if (ret <= 0)
-				break;
+	while (1) {

-			conn->c_map_offset += ret;
-			conn->c_map_bytes -= ret;
-			if (conn->c_map_bytes)
-				continue;
-		}
-
-		/* If we're done sending the current message, clear the
-		 * offset and S/G temporaries.
-		 */
		rm = conn->c_xmit_rm;
-		if (rm != NULL &&
-		    conn->c_xmit_hdr_off == sizeof(struct rds_header) &&
-		    conn->c_xmit_sg == rm->m_nents) {
-			conn->c_xmit_rm = NULL;
-			conn->c_xmit_sg = 0;
-			conn->c_xmit_hdr_off = 0;
-			conn->c_xmit_data_off = 0;
-			conn->c_xmit_rdma_sent = 0;
-
-			/* Release the reference to the previous message. */
-			rds_message_put(rm);
-			rm = NULL;
-		}

-		/* If we're asked to send a cong map update, do so.
+		/*
+		 * If between sending messages, we can send a pending congestion
+		 * map update.
		 */
-		if (rm == NULL && test_and_clear_bit(0, &conn->c_map_queued)) {
-			if (conn->c_trans->xmit_cong_map != NULL) {
-				conn->c_map_offset = 0;
-				conn->c_map_bytes = sizeof(struct rds_header) +
-					RDS_CONG_MAP_BYTES;
-				continue;
-			}
-
+		if (!rm && test_and_clear_bit(0, &conn->c_map_queued)) {
			rm = rds_cong_update_alloc(conn);
			if (IS_ERR(rm)) {
				ret = PTR_ERR(rm);
				break;
			}
+			rm->data.op_active = 1;

			conn->c_xmit_rm = rm;
		}

		/*
-		 * Grab the next message from the send queue, if there is one.
+		 * If not already working on one, grab the next message.
		 *
		 * c_xmit_rm holds a ref while we're sending this message down
		 * the connction.  We can use this ref while holding the
		 * send_sem.. rds_send_reset() is serialized with it.
		 */
-		if (rm == NULL) {
+		if (!rm) {
			unsigned int len;

			spin_lock_irqsave(&conn->c_lock, flags);
...
@@ -224,10 +214,8 @@ int rds_send_xmit(struct rds_connection *conn)

			spin_unlock_irqrestore(&conn->c_lock, flags);

-			if (rm == NULL) {
-				was_empty = 1;
+			if (!rm)
				break;
-			}

			/* Unfortunately, the way Infiniband deals with
			 * RDMA to a bad MR key is by moving the entire
...
@@ -236,13 +224,12 @@ int rds_send_xmit(struct rds_connection *conn)
			 * connection.
			 * Therefore, we never retransmit messages with RDMA ops.
			 */
-			if (rm->m_rdma_op &&
+			if (rm->rdma.op_active &&
			    test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) {
				spin_lock_irqsave(&conn->c_lock, flags);
				if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags))
					list_move(&rm->m_conn_item, &to_be_dropped);
				spin_unlock_irqrestore(&conn->c_lock, flags);
-				rds_message_put(rm);
				continue;
			}
...
@@ -263,23 +250,55 @@ int rds_send_xmit(struct rds_connection *conn)
			conn->c_xmit_rm = rm;
		}

-		/*
-		 * Try and send an rdma message.  Let's see if we can
-		 * keep this simple and require that the transport either
-		 * send the whole rdma or none of it.
-		 */
-		if (rm->m_rdma_op && !conn->c_xmit_rdma_sent) {
-			ret = conn->c_trans->xmit_rdma(conn, rm->m_rdma_op);
+		/* The transport either sends the whole rdma or none of it */
+		if (rm->rdma.op_active && !conn->c_xmit_rdma_sent) {
+			rm->m_final_op = &rm->rdma;
+			ret = conn->c_trans->xmit_rdma(conn, &rm->rdma);
			if (ret)
				break;
			conn->c_xmit_rdma_sent = 1;
+
			/* The transport owns the mapped memory for now.
			 * You can't unmap it while it's on the send queue */
			set_bit(RDS_MSG_MAPPED, &rm->m_flags);
		}

-		if (conn->c_xmit_hdr_off < sizeof(struct rds_header) ||
-		    conn->c_xmit_sg < rm->m_nents) {
+		if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) {
+			rm->m_final_op = &rm->atomic;
+			ret = conn->c_trans->xmit_atomic(conn, &rm->atomic);
+			if (ret)
+				break;
+			conn->c_xmit_atomic_sent = 1;
+
+			/* The transport owns the mapped memory for now.
+			 * You can't unmap it while it's on the send queue */
+			set_bit(RDS_MSG_MAPPED, &rm->m_flags);
+		}
+
+		/*
+		 * A number of cases require an RDS header to be sent
+		 * even if there is no data.
+		 * We permit 0-byte sends; rds-ping depends on this.
+		 * However, if there are exclusively attached silent ops,
+		 * we skip the hdr/data send, to enable silent operation.
+		 */
+		if (rm->data.op_nents == 0) {
+			int ops_present;
+			int all_ops_are_silent = 1;
+
+			ops_present = (rm->atomic.op_active || rm->rdma.op_active);
+			if (rm->atomic.op_active && !rm->atomic.op_silent)
+				all_ops_are_silent = 0;
+			if (rm->rdma.op_active && !rm->rdma.op_silent)
+				all_ops_are_silent = 0;
+
+			if (ops_present && all_ops_are_silent
+			    && !rm->m_rdma_cookie)
+				rm->data.op_active = 0;
+		}
+
+		if (rm->data.op_active && !conn->c_xmit_data_sent) {
+			rm->m_final_op = &rm->data;
			ret = conn->c_trans->xmit(conn, rm,
						  conn->c_xmit_hdr_off,
						  conn->c_xmit_sg,
...
@@ -295,7 +314,7 @@ int rds_send_xmit(struct rds_connection *conn)
					ret -= tmp;
				}

-				sg = &rm->m_sg[conn->c_xmit_sg];
+				sg = &rm->data.op_sg[conn->c_xmit_sg];
				while (ret) {
					tmp = min_t(int, ret, sg->length -
						    conn->c_xmit_data_off);
...
@@ -306,49 +325,63 @@ int rds_send_xmit(struct rds_connection *conn)
						sg++;
						conn->c_xmit_sg++;
						BUG_ON(ret != 0 &&
-						       conn->c_xmit_sg == rm->m_nents);
+						       conn->c_xmit_sg == rm->data.op_nents);
					}
				}
+
+			if (conn->c_xmit_hdr_off == sizeof(struct rds_header) &&
+			    (conn->c_xmit_sg == rm->data.op_nents))
+				conn->c_xmit_data_sent = 1;
		}
-	}

-	/* Nuke any messages we decided not to retransmit. */
-	if (!list_empty(&to_be_dropped))
-		rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED);
+		/*
+		 * A rm will only take multiple times through this loop
+		 * if there is a data op. Thus, if the data is sent (or there was
+		 * none), then we're done with the rm.
+		 */
+		if (!rm->data.op_active || conn->c_xmit_data_sent) {
+			conn->c_xmit_rm = NULL;
+			conn->c_xmit_sg = 0;
+			conn->c_xmit_hdr_off = 0;
+			conn->c_xmit_data_off = 0;
+			conn->c_xmit_rdma_sent = 0;
+			conn->c_xmit_atomic_sent = 0;
+			conn->c_xmit_data_sent = 0;
+
+			rds_message_put(rm);
+		}
+	}

	if (conn->c_trans->xmit_complete)
		conn->c_trans->xmit_complete(conn);

-	/*
-	 * We might be racing with another sender who queued a message but
-	 * backed off on noticing that we held the c_send_lock.  If we check
-	 * for queued messages after dropping the sem then either we'll
-	 * see the queued message or the queuer will get the sem.  If we
-	 * notice the queued message then we trigger an immediate retry.
-	 *
-	 * We need to be careful only to do this when we stopped processing
-	 * the send queue because it was empty.  It's the only way we
-	 * stop processing the loop when the transport hasn't taken
-	 * responsibility for forward progress.
-	 */
-	mutex_unlock(&conn->c_send_lock);
+	release_in_xmit(conn);

-	if (conn->c_map_bytes || (send_quota == 0 && !was_empty)) {
-		/* We exhausted the send quota, but there's work left to
-		 * do. Return and (re-)schedule the send worker.
-		 */
-		ret = -EAGAIN;
+	/* Nuke any messages we decided not to retransmit. */
+	if (!list_empty(&to_be_dropped)) {
+		/* irqs on here, so we can put(), unlike above */
+		list_for_each_entry(rm, &to_be_dropped, m_conn_item)
+			rds_message_put(rm);
+		rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED);
	}

-	if (ret == 0 && was_empty) {
-		/* A simple bit test would be way faster than taking the
-		 * spin lock */
-		spin_lock_irqsave(&conn->c_lock, flags);
+	/*
+	 * Other senders can queue a message after we last test the send queue
+	 * but before we clear RDS_IN_XMIT.  In that case they'd back off and
+	 * not try and send their newly queued message.  We need to check the
+	 * send queue after having cleared RDS_IN_XMIT so that their message
+	 * doesn't get stuck on the send queue.
+	 *
+	 * If the transport cannot continue (i.e ret != 0), then it must
+	 * call us when more room is available, such as from the tx
+	 * completion handler.
+	 */
+	if (ret == 0) {
+		smp_mb();
		if (!list_empty(&conn->c_send_queue)) {
-			rds_stats_inc(s_send_sem_queue_raced);
-			ret = -EAGAIN;
+			rds_stats_inc(s_send_lock_queue_raced);
+			goto restart;
		}
-		spin_unlock_irqrestore(&conn->c_lock, flags);
	}
out:
	return ret;
...
@@ -376,52 +409,60 @@ static inline int rds_send_is_acked(struct rds_message *rm, u64 ack,
}

/*
- * Returns true if there are no messages on the send and retransmit queues
- * which have a sequence number greater than or equal to the given sequence
- * number.
+ * This is pretty similar to what happens below in the ACK
+ * handling code - except that we call here as soon as we get
+ * the IB send completion on the RDMA op and the accompanying
+ * message.
 */
-int rds_send_acked_before(struct rds_connection *conn, u64 seq)
+void rds_rdma_send_complete(struct rds_message *rm, int status)
{
-	struct rds_message *rm, *tmp;
-	int ret = 1;
+	struct rds_sock *rs = NULL;
+	struct rm_rdma_op *ro;
+	struct rds_notifier *notifier;
+	unsigned long flags;

-	spin_lock(&conn->c_lock);
+	spin_lock_irqsave(&rm->m_rs_lock, flags);

-	list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
-		if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq)
-			ret = 0;
-		break;
-	}
+	ro = &rm->rdma;
+	if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) &&
+	    ro->op_active && ro->op_notify && ro->op_notifier) {
+		notifier = ro->op_notifier;
+		rs = rm->m_rs;
+		sock_hold(rds_rs_to_sk(rs));

-	list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
-		if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq)
-			ret = 0;
-		break;
+		notifier->n_status = status;
+		spin_lock(&rs->rs_lock);
+		list_add_tail(&notifier->n_list, &rs->rs_notify_queue);
+		spin_unlock(&rs->rs_lock);
+
+		ro->op_notifier = NULL;
	}

-	spin_unlock(&conn->c_lock);
+	spin_unlock_irqrestore(&rm->m_rs_lock, flags);

-	return ret;
+	if (rs) {
+		rds_wake_sk_sleep(rs);
+		sock_put(rds_rs_to_sk(rs));
+	}
}
+EXPORT_SYMBOL_GPL(rds_rdma_send_complete);

/*
- * This is pretty similar to what happens below in the ACK
- * handling code - except that we call here as soon as we get
- * the IB send completion on the RDMA op and the accompanying
- * message.
+ * Just like above, except looks at atomic op
 */
-void rds_rdma_send_complete(struct rds_message *rm, int status)
+void rds_atomic_send_complete(struct rds_message *rm, int status)
{
	struct rds_sock *rs = NULL;
-	struct rds_rdma_op *ro;
+	struct rm_atomic_op *ao;
	struct rds_notifier *notifier;
+	unsigned long flags;

-	spin_lock(&rm->m_rs_lock);
+	spin_lock_irqsave(&rm->m_rs_lock, flags);

-	ro = rm->m_rdma_op;
-	if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) &&
-	    ro && ro->r_notify && ro->r_notifier) {
-		notifier = ro->r_notifier;
+	ao = &rm->atomic;
+	if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags)
+	    && ao->op_active && ao->op_notify && ao->op_notifier) {
+		notifier = ao->op_notifier;
		rs = rm->m_rs;
		sock_hold(rds_rs_to_sk(rs));
...
@@ -430,17 +471,17 @@ void rds_rdma_send_complete(struct rds_message *rm, int status)
		list_add_tail(&notifier->n_list, &rs->rs_notify_queue);
		spin_unlock(&rs->rs_lock);

-		ro->r_notifier = NULL;
+		ao->op_notifier = NULL;
	}

-	spin_unlock(&rm->m_rs_lock);
+	spin_unlock_irqrestore(&rm->m_rs_lock, flags);

	if (rs) {
		rds_wake_sk_sleep(rs);
		sock_put(rds_rs_to_sk(rs));
	}
}
-EXPORT_SYMBOL_GPL(rds_rdma_send_complete);
+EXPORT_SYMBOL_GPL(rds_atomic_send_complete);

/*
 * This is the same as rds_rdma_send_complete except we
...
@@ -448,15 +489,23 @@ EXPORT_SYMBOL_GPL(rds_rdma_send_complete);
 * socket, socket lock) and can just move the notifier.
 */
static inline void
-__rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status)
+__rds_send_complete(struct rds_sock *rs, struct rds_message *rm, int status)
{
-	struct rds_rdma_op *ro;
+	struct rm_rdma_op *ro;
+	struct rm_atomic_op *ao;
+
+	ro = &rm->rdma;
+	if (ro->op_active && ro->op_notify && ro->op_notifier) {
+		ro->op_notifier->n_status = status;
+		list_add_tail(&ro->op_notifier->n_list, &rs->rs_notify_queue);
+		ro->op_notifier = NULL;
+	}

-	ro = rm->m_rdma_op;
-	if (ro && ro->r_notify && ro->r_notifier) {
-		ro->r_notifier->n_status = status;
-		list_add_tail(&ro->r_notifier->n_list, &rs->rs_notify_queue);
-		ro->r_notifier = NULL;
+	ao = &rm->atomic;
+	if (ao->op_active && ao->op_notify && ao->op_notifier) {
+		ao->op_notifier->n_status = status;
+		list_add_tail(&ao->op_notifier->n_list, &rs->rs_notify_queue);
+		ao->op_notifier = NULL;
	}

	/* No need to wake the app - caller does this */
...
@@ -468,7 +517,7 @@ __rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status
 * So speed is not an issue here.
 */
struct rds_message *rds_send_get_message(struct rds_connection *conn,
-					 struct rds_rdma_op *op)
+					 struct rm_rdma_op *op)
{
	struct rds_message *rm, *tmp, *found = NULL;
	unsigned long flags;
...
@@ -476,7 +525,7 @@ struct rds_message *rds_send_get_message(struct rds_connection *conn,
	spin_lock_irqsave(&conn->c_lock, flags);

	list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
-		if (rm->m_rdma_op == op) {
+		if (&rm->rdma == op) {
			atomic_inc(&rm->m_refcount);
			found = rm;
			goto out;
...
@@ -484,7 +533,7 @@ struct rds_message *rds_send_get_message(struct rds_connection *conn,
	}

	list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
-		if (rm->m_rdma_op == op) {
+		if (&rm->rdma == op) {
			atomic_inc(&rm->m_refcount);
			found = rm;
			break;
...
@@ -544,19 +593,20 @@ void rds_send_remove_from_sock(struct list_head *messages, int status)
		spin_lock(&rs->rs_lock);

		if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) {
-			struct rds_rdma_op *ro = rm->m_rdma_op;
+			struct rm_rdma_op *ro = &rm->rdma;
			struct rds_notifier *notifier;

			list_del_init(&rm->m_sock_item);
			rds_send_sndbuf_remove(rs, rm);

-			if (ro && ro->r_notifier && (status || ro->r_notify)) {
-				notifier = ro->r_notifier;
+			if (ro->op_active && ro->op_notifier &&
+			       (ro->op_notify || (ro->op_recverr && status))) {
+				notifier = ro->op_notifier;
				list_add_tail(&notifier->n_list,
						&rs->rs_notify_queue);
				if (!notifier->n_status)
					notifier->n_status = status;
-				rm->m_rdma_op->r_notifier = NULL;
+				rm->rdma.op_notifier = NULL;
			}
			was_on_sock = 1;
			rm->m_rs = NULL;
...
@@ -619,9 +669,8 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
{
	struct rds_message *rm, *tmp;
	struct rds_connection *conn;
-	unsigned long flags, flags2;
+	unsigned long flags;
	LIST_HEAD(list);
-	int wake = 0;

	/* get all the messages we're dropping under the rs lock */
	spin_lock_irqsave(&rs->rs_lock, flags);
...
@@ -631,59 +680,54 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
			     dest->sin_port != rm->m_inc.i_hdr.h_dport))
			continue;

-		wake = 1;
		list_move(&rm->m_sock_item, &list);
		rds_send_sndbuf_remove(rs, rm);
		clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
	}

	/* order flag updates with the rs lock */
-	if (wake)
-		smp_mb__after_clear_bit();
+	smp_mb__after_clear_bit();

	spin_unlock_irqrestore(&rs->rs_lock, flags);

-	conn = NULL;
+	if (list_empty(&list))
+		return;

-	/* now remove the messages from the conn list as needed */
+	/* Remove the messages from the conn */
	list_for_each_entry(rm, &list, m_sock_item) {
-		/* We do this here rather than in the loop above, so that
-		 * we don't have to nest m_rs_lock under rs->rs_lock */
-		spin_lock_irqsave(&rm->m_rs_lock, flags2);
-		/* If this is a RDMA operation, notify the app. */
-		spin_lock(&rs->rs_lock);
-		__rds_rdma_send_complete(rs, rm, RDS_RDMA_CANCELED);
-		spin_unlock(&rs->rs_lock);
-		rm->m_rs = NULL;
-		spin_unlock_irqrestore(&rm->m_rs_lock, flags2);

		conn = rm->m_inc.i_conn;
-		spin_lock_irqsave(&conn->c_lock, flags);

+		spin_lock_irqsave(&conn->c_lock, flags);
		/*
-		 * If we see this flag cleared then we're *sure* that someone
-		 * else beat us to removing it from the conn.  If we race
-		 * with their flag update we'll get the lock and then really
-		 * see that the flag has been cleared.
+		 * Maybe someone else beat us to removing rm from the conn.
+		 * If we race with their flag update we'll get the lock and
+		 * then really see that the flag has been cleared.
		 */
-		if (!test_bit(RDS_MSG_ON_CONN, &rm->m_flags))
+		if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) {
			spin_unlock_irqrestore(&conn->c_lock, flags);
			continue;
-
-		if (conn != rm->m_inc.i_conn) {
-			if (conn)
-				spin_unlock_irqrestore(&conn->c_lock, flags);
-			conn = rm->m_inc.i_conn;
-			spin_lock_irqsave(&conn->c_lock, flags);
		}
+		list_del_init(&rm->m_conn_item);
+		spin_unlock_irqrestore(&conn->c_lock, flags);

-		if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) {
-			list_del_init(&rm->m_conn_item);
-			rds_message_put(rm);
-		}
-	}
+		/*
+		 * Couldn't grab m_rs_lock in top loop (lock ordering),
+		 * but we can now.
+		 */
+		spin_lock_irqsave(&rm->m_rs_lock, flags);

-	if (conn)
-		spin_unlock_irqrestore(&conn->c_lock, flags);
+		spin_lock(&rs->rs_lock);
+		__rds_send_complete(rs, rm, RDS_RDMA_CANCELED);
+		spin_unlock(&rs->rs_lock);

-	if (wake)
-		rds_wake_sk_sleep(rs);
+		rm->m_rs = NULL;
+		spin_unlock_irqrestore(&rm->m_rs_lock, flags);

+		rds_message_put(rm);
+	}
+
+	rds_wake_sk_sleep(rs);

	while (!list_empty(&list)) {
		rm = list_entry(list.next, struct rds_message, m_sock_item);
...
@@ -763,6 +807,63 @@ static int rds_send_queue_rm(struct rds_sock *rs, struct rds_connection *conn,
	return *queued;
}

+/*
+ * rds_message is getting to be quite complicated, and we'd like to allocate
+ * it all in one go. This figures out how big it needs to be up front.
+ */
+static int rds_rm_size(struct msghdr *msg, int data_len)
+{
+	struct cmsghdr *cmsg;
+	int size = 0;
+	int cmsg_groups = 0;
+	int retval;
+
+	for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+		if (!CMSG_OK(msg, cmsg))
+			return -EINVAL;
+
+		if (cmsg->cmsg_level != SOL_RDS)
+			continue;
+
+		switch (cmsg->cmsg_type) {
+		case RDS_CMSG_RDMA_ARGS:
+			cmsg_groups |= 1;
+			retval = rds_rdma_extra_size(CMSG_DATA(cmsg));
+			if (retval < 0)
+				return retval;
+			size += retval;
+			break;
+
+		case RDS_CMSG_RDMA_DEST:
+		case RDS_CMSG_RDMA_MAP:
+			cmsg_groups |= 2;
+			/* these are valid but do no add any size */
+			break;
+
+		case RDS_CMSG_ATOMIC_CSWP:
+		case RDS_CMSG_ATOMIC_FADD:
+		case RDS_CMSG_MASKED_ATOMIC_CSWP:
+		case RDS_CMSG_MASKED_ATOMIC_FADD:
+			cmsg_groups |= 1;
+			size += sizeof(struct scatterlist);
+			break;
+
+		default:
+			return -EINVAL;
+		}
+	}
+
+	size += ceil(data_len, PAGE_SIZE) * sizeof(struct scatterlist);
+
+	/* Ensure (DEST, MAP) are never used with (ARGS, ATOMIC) */
+	if (cmsg_groups == 3)
+		return -EINVAL;
+
+	return size;
+}
+
static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
			 struct msghdr *msg, int *allocated_mr)
{
...
@@ -777,7 +878,7 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
			continue;

		/* As a side effect, RDMA_DEST and RDMA_MAP will set
-		 * rm->m_rdma_cookie and rm->m_rdma_mr.
+		 * rm->rdma.m_rdma_cookie and rm->rdma.m_rdma_mr.
		 */
		switch (cmsg->cmsg_type) {
		case RDS_CMSG_RDMA_ARGS:
...
@@ -793,6 +894,12 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
			if (!ret)
				*allocated_mr = 1;
			break;
+		case RDS_CMSG_ATOMIC_CSWP:
+		case RDS_CMSG_ATOMIC_FADD:
+		case RDS_CMSG_MASKED_ATOMIC_CSWP:
+		case RDS_CMSG_MASKED_ATOMIC_FADD:
+			ret = rds_cmsg_atomic(rs, rm, cmsg);
+			break;

		default:
			return -EINVAL;
...
@@ -850,13 +957,26 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
		goto out;
	}

-	rm = rds_message_copy_from_user(msg->msg_iov, payload_len);
-	if (IS_ERR(rm)) {
-		ret = PTR_ERR(rm);
-		rm = NULL;
+	/* size of rm including all sgs */
+	ret = rds_rm_size(msg, payload_len);
+	if (ret < 0)
+		goto out;
+
+	rm = rds_message_alloc(ret, GFP_KERNEL);
+	if (!rm) {
+		ret = -ENOMEM;
		goto out;
	}

+	/* Attach data to the rm */
+	if (payload_len) {
+		rm->data.op_sg = rds_message_alloc_sgs(rm, ceil(payload_len, PAGE_SIZE));
+		ret = rds_message_copy_from_user(rm, msg->msg_iov, payload_len);
+		if (ret)
+			goto out;
+	}
+	rm->data.op_active = 1;
+
	rm->m_daddr = daddr;

	/* rds_conn_create has a spinlock that runs with IRQ off.
...
@@ -879,22 +999,23 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
	if (ret)
		goto out;

-	if ((rm->m_rdma_cookie || rm->m_rdma_op) &&
-	    conn->c_trans->xmit_rdma == NULL) {
+	if (rm->rdma.op_active && !conn->c_trans->xmit_rdma) {
		if (printk_ratelimit())
			printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n",
-				rm->m_rdma_op, conn->c_trans->xmit_rdma);
+			       &rm->rdma, conn->c_trans->xmit_rdma);
		ret = -EOPNOTSUPP;
		goto out;
	}

-	/* If the connection is down, trigger a connect. We may
-	 * have scheduled a delayed reconnect however - in this case
-	 * we should not interfere.
-	 */
-	if (rds_conn_state(conn) == RDS_CONN_DOWN &&
-	    !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
-		queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
+	if (rm->atomic.op_active && !conn->c_trans->xmit_atomic) {
+		if (printk_ratelimit())
+			printk(KERN_NOTICE "atomic_op %p conn xmit_atomic %p\n",
+			       &rm->atomic, conn->c_trans->xmit_atomic);
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+
+	rds_conn_connect_if_down(conn);

	ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs);
	if (ret) {
...
@@ -938,7 +1059,7 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
	rds_stats_inc(s_send_queued);

	if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags))
-		rds_send_worker(&conn->c_send_w.work);
+		rds_send_xmit(conn);

	rds_message_put(rm);
	return payload_len;
...
@@ -966,20 +1087,15 @@ rds_send_pong(struct rds_connection *conn, __be16 dport)
	int ret = 0;

	rm = rds_message_alloc(0, GFP_ATOMIC);
-	if (rm == NULL) {
+	if (!rm) {
		ret = -ENOMEM;
		goto out;
	}

	rm->m_daddr = conn->c_faddr;
+	rm->data.op_active = 1;

-	/* If the connection is down, trigger a connect. We may
-	 * have scheduled a delayed reconnect however - in this case
-	 * we should not interfere.
-	 */
-	if (rds_conn_state(conn) == RDS_CONN_DOWN &&
-	    !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
-		queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
+	rds_conn_connect_if_down(conn);

	ret = rds_cong_wait(conn->c_fcong, dport, 1, NULL);
	if (ret)
...
@@ -999,7 +1115,9 @@ rds_send_pong(struct rds_connection *conn, __be16 dport)
	rds_stats_inc(s_send_queued);
	rds_stats_inc(s_send_pong);

-	queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+	if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags))
+		rds_send_xmit(conn);
+
	rds_message_put(rm);
	return 0;
...
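For illustration only (not part of this commit): a worked example of the sizing arithmetic that rds_rm_size() above performs before rds_message_alloc() is called. The page size and the sizeof(struct scatterlist) used here are illustrative assumptions, and the ceil() macro below only mirrors the rounding the kernel helper relies on.

/* Sketch: size a message carrying a 6000-byte payload plus one RDMA op
 * whose local iovecs span 3 pages (values chosen purely for illustration). */
#include <stdio.h>

#define PAGE_SIZE   4096u
#define SG_SIZE     64u                         /* assumed sizeof(struct scatterlist) */
#define ceil(x, y)  (((x) + (y) - 1) / (y))     /* same rounding rds_rm_size() relies on */

int main(void)
{
    unsigned int data_len = 6000, rdma_pages = 3;
    unsigned int extra = rdma_pages * SG_SIZE;                   /* rds_rdma_extra_size() */
    unsigned int size = extra + ceil(data_len, PAGE_SIZE) * SG_SIZE;

    /* Prints: sg slots: 5, bytes passed to rds_message_alloc(): 320 */
    printf("sg slots: %u, bytes passed to rds_message_alloc(): %u\n",
           ceil(data_len, PAGE_SIZE) + rdma_pages, size);
    return 0;
}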
net/rds/stats.c View file @ cf0ac2b8
...
@@ -57,8 +57,8 @@ static const char *const rds_stat_names[] = {
	"recv_ping",
	"send_queue_empty",
	"send_queue_full",
-	"send_sem_contention",
-	"send_sem_queue_raced",
+	"send_lock_contention",
+	"send_lock_queue_raced",
	"send_immediate_retry",
	"send_delayed_retry",
	"send_drop_acked",
...
@@ -143,7 +143,7 @@ void rds_stats_exit(void)
	rds_info_deregister_func(RDS_INFO_COUNTERS, rds_stats_info);
}

-int __init rds_stats_init(void)
+int rds_stats_init(void)
{
	rds_info_register_func(RDS_INFO_COUNTERS, rds_stats_info);
	return 0;
...
net/rds/sysctl.c View file @ cf0ac2b8
...
@@ -105,13 +105,13 @@ void rds_sysctl_exit(void)
	unregister_sysctl_table(rds_sysctl_reg_table);
}

-int __init rds_sysctl_init(void)
+int rds_sysctl_init(void)
{
	rds_sysctl_reconnect_min = msecs_to_jiffies(1);
	rds_sysctl_reconnect_min_jiffies = rds_sysctl_reconnect_min;

	rds_sysctl_reg_table = register_sysctl_paths(rds_sysctl_path, rds_sysctl_rds_table);
-	if (rds_sysctl_reg_table == NULL)
+	if (!rds_sysctl_reg_table)
		return -ENOMEM;
	return 0;
}
net/rds/tcp.c View file @ cf0ac2b8
...
@@ -200,7 +200,7 @@ static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp)
	struct rds_tcp_connection *tc;

	tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp);
-	if (tc == NULL)
+	if (!tc)
		return -ENOMEM;

	tc->t_sock = NULL;
...
@@ -258,7 +258,6 @@ struct rds_transport rds_tcp_transport = {
	.laddr_check		= rds_tcp_laddr_check,
	.xmit_prepare		= rds_tcp_xmit_prepare,
	.xmit_complete		= rds_tcp_xmit_complete,
-	.xmit_cong_map		= rds_tcp_xmit_cong_map,
	.xmit			= rds_tcp_xmit,
	.recv			= rds_tcp_recv,
	.conn_alloc		= rds_tcp_conn_alloc,
...
@@ -266,7 +265,6 @@ struct rds_transport rds_tcp_transport = {
	.conn_connect		= rds_tcp_conn_connect,
	.conn_shutdown		= rds_tcp_conn_shutdown,
	.inc_copy_to_user	= rds_tcp_inc_copy_to_user,
-	.inc_purge		= rds_tcp_inc_purge,
	.inc_free		= rds_tcp_inc_free,
	.stats_info_copy	= rds_tcp_stats_info_copy,
	.exit			= rds_tcp_exit,
...
@@ -276,14 +274,14 @@ struct rds_transport rds_tcp_transport = {
	.t_prefer_loopback	= 1,
};

-int __init rds_tcp_init(void)
+int rds_tcp_init(void)
{
	int ret;

	rds_tcp_conn_slab = kmem_cache_create("rds_tcp_connection",
					      sizeof(struct rds_tcp_connection),
					      0, 0, NULL);
-	if (rds_tcp_conn_slab == NULL) {
+	if (!rds_tcp_conn_slab) {
		ret = -ENOMEM;
		goto out;
	}
...
net/rds/tcp.h View file @ cf0ac2b8
...
@@ -43,7 +43,7 @@ struct rds_tcp_statistics {
};

/* tcp.c */
-int __init rds_tcp_init(void);
+int rds_tcp_init(void);
void rds_tcp_exit(void);
void rds_tcp_tune(struct socket *sock);
void rds_tcp_nonagle(struct socket *sock);
...
@@ -61,16 +61,15 @@ void rds_tcp_conn_shutdown(struct rds_connection *conn);
void rds_tcp_state_change(struct sock *sk);

/* tcp_listen.c */
-int __init rds_tcp_listen_init(void);
+int rds_tcp_listen_init(void);
void rds_tcp_listen_stop(void);
void rds_tcp_listen_data_ready(struct sock *sk, int bytes);

/* tcp_recv.c */
-int __init rds_tcp_recv_init(void);
+int rds_tcp_recv_init(void);
void rds_tcp_recv_exit(void);
void rds_tcp_data_ready(struct sock *sk, int bytes);
int rds_tcp_recv(struct rds_connection *conn);
-void rds_tcp_inc_purge(struct rds_incoming *inc);
void rds_tcp_inc_free(struct rds_incoming *inc);
int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
			     size_t size);
...
@@ -81,8 +80,6 @@ void rds_tcp_xmit_complete(struct rds_connection *conn);
int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
		 unsigned int hdr_off, unsigned int sg, unsigned int off);
void rds_tcp_write_space(struct sock *sk);
-int rds_tcp_xmit_cong_map(struct rds_connection *conn,
-			  struct rds_cong_map *map, unsigned long offset);

/* tcp_stats.c */
DECLARE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats);
...
net/rds/tcp_connect.c View file @ cf0ac2b8
...
@@ -45,7 +45,7 @@ void rds_tcp_state_change(struct sock *sk)

	read_lock(&sk->sk_callback_lock);
	conn = sk->sk_user_data;
-	if (conn == NULL) {
+	if (!conn) {
		state_change = sk->sk_state_change;
		goto out;
	}
...
net/rds/tcp_listen.c View file @ cf0ac2b8
...
@@ -116,7 +116,7 @@ void rds_tcp_listen_data_ready(struct sock *sk, int bytes)

	read_lock(&sk->sk_callback_lock);
	ready = sk->sk_user_data;
-	if (ready == NULL) { /* check for teardown race */
+	if (!ready) { /* check for teardown race */
		ready = sk->sk_data_ready;
		goto out;
	}
...
@@ -135,7 +135,7 @@ void rds_tcp_listen_data_ready(struct sock *sk, int bytes)
	ready(sk, bytes);
}

-int __init rds_tcp_listen_init(void)
+int rds_tcp_listen_init(void)
{
	struct sockaddr_in sin;
	struct socket *sock = NULL;
...
@@ -178,7 +178,7 @@ void rds_tcp_listen_stop(void)
	struct socket *sock = rds_tcp_listen_sock;
	struct sock *sk;

-	if (sock == NULL)
+	if (!sock)
		return;

	sk = sock->sk;
...
net/rds/tcp_recv.c
View file @
cf0ac2b8
...
...
@@ -39,7 +39,7 @@
 static struct kmem_cache *rds_tcp_incoming_slab;

-void rds_tcp_inc_purge(struct rds_incoming *inc)
+static void rds_tcp_inc_purge(struct rds_incoming *inc)
 {
 	struct rds_tcp_incoming *tinc;
 	tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
...
...
@@ -190,10 +190,10 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
 	 * processing.
 	 */
 	while (left) {
-		if (tinc == NULL) {
+		if (!tinc) {
 			tinc = kmem_cache_alloc(rds_tcp_incoming_slab,
 						arg->gfp);
-			if (tinc == NULL) {
+			if (!tinc) {
 				desc->error = -ENOMEM;
 				goto out;
 			}
...
...
@@ -229,7 +229,7 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
 		if (left && tc->t_tinc_data_rem) {
 			clone = skb_clone(skb, arg->gfp);
-			if (clone == NULL) {
+			if (!clone) {
 				desc->error = -ENOMEM;
 				goto out;
 			}
...
...
@@ -326,7 +326,7 @@ void rds_tcp_data_ready(struct sock *sk, int bytes)
 	read_lock(&sk->sk_callback_lock);
 	conn = sk->sk_user_data;
-	if (conn == NULL) { /* check for teardown race */
+	if (!conn) { /* check for teardown race */
 		ready = sk->sk_data_ready;
 		goto out;
 	}
...
...
@@ -342,12 +342,12 @@ void rds_tcp_data_ready(struct sock *sk, int bytes)
 	ready(sk, bytes);
 }

-int __init rds_tcp_recv_init(void)
+int rds_tcp_recv_init(void)
 {
 	rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming",
 					sizeof(struct rds_tcp_incoming),
 					0, 0, NULL);
-	if (rds_tcp_incoming_slab == NULL)
+	if (!rds_tcp_incoming_slab)
 		return -ENOMEM;
 	return 0;
 }
...
...
net/rds/tcp_send.c
View file @
cf0ac2b8
...
...
@@ -76,56 +76,6 @@ int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len)
 	return kernel_sendmsg(sock, &msg, &vec, 1, vec.iov_len);
 }

-/* the core send_sem serializes this with other xmit and shutdown */
-int rds_tcp_xmit_cong_map(struct rds_connection *conn,
-			  struct rds_cong_map *map, unsigned long offset)
-{
-	static struct rds_header rds_tcp_map_header = {
-		.h_flags = RDS_FLAG_CONG_BITMAP,
-	};
-	struct rds_tcp_connection *tc = conn->c_transport_data;
-	unsigned long i;
-	int ret;
-	int copied = 0;
-
-	/* Some problem claims cpu_to_be32(constant) isn't a constant. */
-	rds_tcp_map_header.h_len = cpu_to_be32(RDS_CONG_MAP_BYTES);
-
-	if (offset < sizeof(struct rds_header)) {
-		ret = rds_tcp_sendmsg(tc->t_sock,
-				      (void *)&rds_tcp_map_header + offset,
-				      sizeof(struct rds_header) - offset);
-		if (ret <= 0)
-			return ret;
-		offset += ret;
-		copied = ret;
-		if (offset < sizeof(struct rds_header))
-			return ret;
-	}
-
-	offset -= sizeof(struct rds_header);
-	i = offset / PAGE_SIZE;
-	offset = offset % PAGE_SIZE;
-	BUG_ON(i >= RDS_CONG_MAP_PAGES);
-
-	do {
-		ret = tc->t_sock->ops->sendpage(tc->t_sock,
-					virt_to_page(map->m_page_addrs[i]),
-					offset, PAGE_SIZE - offset,
-					MSG_DONTWAIT);
-		if (ret <= 0)
-			break;
-		copied += ret;
-		offset += ret;
-		if (offset == PAGE_SIZE) {
-			offset = 0;
-			i++;
-		}
-	} while (i < RDS_CONG_MAP_PAGES);
-
-	return copied ? copied : ret;
-}
-
 /* the core send_sem serializes this with other xmit and shutdown */
 int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
 		 unsigned int hdr_off, unsigned int sg, unsigned int off)
...
...
@@ -166,21 +116,21 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
 		goto out;
 	}

-	while (sg < rm->m_nents) {
+	while (sg < rm->data.op_nents) {
 		ret = tc->t_sock->ops->sendpage(tc->t_sock,
-						sg_page(&rm->m_sg[sg]),
-						rm->m_sg[sg].offset + off,
-						rm->m_sg[sg].length - off,
+						sg_page(&rm->data.op_sg[sg]),
+						rm->data.op_sg[sg].offset + off,
+						rm->data.op_sg[sg].length - off,
 						MSG_DONTWAIT|MSG_NOSIGNAL);
-		rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->m_sg[sg]),
-			 rm->m_sg[sg].offset + off, rm->m_sg[sg].length - off,
+		rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->data.op_sg[sg]),
+			 rm->data.op_sg[sg].offset + off, rm->data.op_sg[sg].length - off,
 			 ret);
 		if (ret <= 0)
 			break;

 		off += ret;
 		done += ret;
-		if (off == rm->m_sg[sg].length) {
+		if (off == rm->data.op_sg[sg].length) {
 			off = 0;
 			sg++;
 		}
...
...
@@ -226,7 +176,7 @@ void rds_tcp_write_space(struct sock *sk)
 	read_lock(&sk->sk_callback_lock);
 	conn = sk->sk_user_data;
-	if (conn == NULL) {
+	if (!conn) {
 		write_space = sk->sk_write_space;
 		goto out;
 	}
...
...
net/rds/threads.c
View file @
cf0ac2b8
...
...
@@ -61,7 +61,7 @@
  *
  * Transition to state DISCONNECTING/DOWN:
  * - Inside the shutdown worker; synchronizes with xmit path
- *   through c_send_lock, and with connection management callbacks
+ *   through RDS_IN_XMIT, and with connection management callbacks
  *   via c_cm_lock.
  *
  * For receive callbacks, we rely on the underlying transport
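The comment above describes bit-based exclusion between the xmit path and the shutdown worker rather than a mutex. A hedged sketch of that pattern follows; the c_flags bitfield and c_waitq wait queue names are assumptions, they do not appear in this hunk:

	/* sender side: claim the connection for transmission */
	if (test_and_set_bit(RDS_IN_XMIT, &conn->c_flags))
		return;				/* someone else is already transmitting */
	/* ... push queued messages down the transport ... */
	clear_bit(RDS_IN_XMIT, &conn->c_flags);
	smp_mb__after_clear_bit();
	wake_up(&conn->c_waitq);

	/* shutdown side: wait until no transmit is in flight */
	wait_event(conn->c_waitq, !test_bit(RDS_IN_XMIT, &conn->c_flags));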
...
...
@@ -110,7 +110,7 @@ EXPORT_SYMBOL_GPL(rds_connect_complete);
  * We should *always* start with a random backoff; otherwise a broken connection
  * will always take several iterations to be re-established.
  */
-static void rds_queue_reconnect(struct rds_connection *conn)
+void rds_queue_reconnect(struct rds_connection *conn)
 {
 	unsigned long rand;
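A minimal sketch of the randomized backoff the comment above describes; the per-connection backoff field (called c_reconnect_jiffies here) and its doubling elsewhere are assumptions, only rds_wq and c_conn_w appear in this diff:

	static void queue_reconnect_sketch(struct rds_connection *conn)
	{
		unsigned long rand;

		if (conn->c_reconnect_jiffies == 0) {
			/* first attempt: reconnect immediately */
			queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
			return;
		}

		get_random_bytes(&rand, sizeof(rand));
		/* land at a random point inside the current backoff window */
		queue_delayed_work(rds_wq, &conn->c_conn_w,
				   rand % conn->c_reconnect_jiffies);
	}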
...
...
@@ -156,58 +156,6 @@ void rds_connect_worker(struct work_struct *work)
 	}
 }

-void rds_shutdown_worker(struct work_struct *work)
-{
-	struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w);
-
-	/* shut it down unless it's down already */
-	if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) {
-		/*
-		 * Quiesce the connection mgmt handlers before we start tearing
-		 * things down. We don't hold the mutex for the entire
-		 * duration of the shutdown operation, else we may be
-		 * deadlocking with the CM handler. Instead, the CM event
-		 * handler is supposed to check for state DISCONNECTING
-		 */
-		mutex_lock(&conn->c_cm_lock);
-		if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING)
-		 && !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) {
-			rds_conn_error(conn, "shutdown called in state %d\n",
-					atomic_read(&conn->c_state));
-			mutex_unlock(&conn->c_cm_lock);
-			return;
-		}
-		mutex_unlock(&conn->c_cm_lock);
-
-		mutex_lock(&conn->c_send_lock);
-		conn->c_trans->conn_shutdown(conn);
-		rds_conn_reset(conn);
-		mutex_unlock(&conn->c_send_lock);
-
-		if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) {
-			/* This can happen - eg when we're in the middle of tearing
-			 * down the connection, and someone unloads the rds module.
-			 * Quite reproduceable with loopback connections.
-			 * Mostly harmless.
-			 */
-			rds_conn_error(conn, "%s: failed to transition to state DOWN, "
-					"current state is %d\n",
-					__func__,
-					atomic_read(&conn->c_state));
-			return;
-		}
-	}
-
-	/* Then reconnect if it's still live.
-	 * The passive side of an IB loopback connection is never added
-	 * to the conn hash, so we never trigger a reconnect on this
-	 * conn - the reconnect is always triggered by the active peer. */
-	cancel_delayed_work(&conn->c_conn_w);
-	if (!hlist_unhashed(&conn->c_hash_node))
-		rds_queue_reconnect(conn);
-}
-
 void rds_send_worker(struct work_struct *work)
 {
 	struct rds_connection *conn = container_of(work, struct rds_connection, c_send_w.work);
...
@@ -252,15 +200,22 @@ void rds_recv_worker(struct work_struct *work)
 	}
 }

+void rds_shutdown_worker(struct work_struct *work)
+{
+	struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w);
+
+	rds_conn_shutdown(conn);
+}
+
 void rds_threads_exit(void)
 {
 	destroy_workqueue(rds_wq);
 }

-int __init rds_threads_init(void)
+int rds_threads_init(void)
 {
-	rds_wq = create_workqueue("krdsd");
-	if (rds_wq == NULL)
+	rds_wq = create_singlethread_workqueue("krdsd");
+	if (!rds_wq)
 		return -ENOMEM;
 	return 0;
...
...
net/rds/transport.c
View file @
cf0ac2b8
...
...
@@ -71,19 +71,28 @@ void rds_trans_unregister(struct rds_transport *trans)
 }
 EXPORT_SYMBOL_GPL(rds_trans_unregister);

+void rds_trans_put(struct rds_transport *trans)
+{
+	if (trans && trans->t_owner)
+		module_put(trans->t_owner);
+}
+
 struct rds_transport *rds_trans_get_preferred(__be32 addr)
 {
 	struct rds_transport *ret = NULL;
-	int i;
+	struct rds_transport *trans;
+	unsigned int i;

 	if (IN_LOOPBACK(ntohl(addr)))
 		return &rds_loop_transport;

 	down_read(&rds_trans_sem);
 	for (i = 0; i < RDS_TRANS_COUNT; i++) {
-		if (transports[i] && (transports[i]->laddr_check(addr) == 0)) {
-			ret = transports[i];
+		trans = transports[i];
+		if (trans && (trans->laddr_check(addr) == 0) &&
+		    (!trans->t_owner || try_module_get(trans->t_owner))) {
+			ret = trans;
 			break;
 		}
 	}
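With try_module_get() in the lookup and module_put() in rds_trans_put(), a caller now holds a module reference for as long as it holds the transport. A hypothetical caller sketch (the error value and surrounding code are illustrative only, not part of this patch):

	struct rds_transport *trans;

	trans = rds_trans_get_preferred(addr);
	if (!trans)
		return -EADDRNOTAVAIL;	/* no transport willing to take this address */

	/* ... bind the socket to trans, send, receive ... */

	rds_trans_put(trans);		/* drop the module reference when done */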
...
...
net/rds/xlist.h
0 → 100644
View file @
cf0ac2b8
#ifndef _LINUX_XLIST_H
#define _LINUX_XLIST_H

#include <linux/stddef.h>
#include <linux/poison.h>
#include <linux/prefetch.h>
#include <asm/system.h>

struct xlist_head {
	struct xlist_head *next;
};

static inline void INIT_XLIST_HEAD(struct xlist_head *list)
{
	list->next = NULL;
}

static inline int xlist_empty(struct xlist_head *head)
{
	return head->next == NULL;
}

static inline void xlist_add(struct xlist_head *new, struct xlist_head *tail,
			     struct xlist_head *head)
{
	struct xlist_head *cur;
	struct xlist_head *check;

	while (1) {
		cur = head->next;
		tail->next = cur;
		check = cmpxchg(&head->next, cur, new);
		if (check == cur)
			break;
	}
}

static inline struct xlist_head *xlist_del_head(struct xlist_head *head)
{
	struct xlist_head *cur;
	struct xlist_head *check;
	struct xlist_head *next;

	while (1) {
		cur = head->next;
		if (!cur)
			goto out;

		next = cur->next;
		check = cmpxchg(&head->next, cur, next);
		if (check == cur)
			goto out;
	}
out:
	return cur;
}

static inline struct xlist_head *xlist_del_head_fast(struct xlist_head *head)
{
	struct xlist_head *cur;

	cur = head->next;
	if (!cur)
		return NULL;

	head->next = cur->next;
	return cur;
}

static inline void xlist_splice(struct xlist_head *list,
				struct xlist_head *head)
{
	struct xlist_head *cur;

	WARN_ON(head->next);
	cur = xchg(&list->next, NULL);
	head->next = cur;
}

#endif
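xlist is a small cmpxchg()-based lock-free list: producers push entries concurrently with xlist_add(), and a consumer can atomically steal the whole list with xlist_splice() and then drain its private copy with the non-atomic xlist_del_head_fast(). A hypothetical usage sketch, with struct my_item and the callers below purely illustrative and not part of this file:

	struct my_item {
		int value;
		struct xlist_head node;
	};

	static struct xlist_head my_list;

	static void producer(struct my_item *item)
	{
		INIT_XLIST_HEAD(&item->node);
		/* single entry: the item is both head and tail of the chain we add */
		xlist_add(&item->node, &item->node, &my_list);
	}

	static void consumer(void)
	{
		struct xlist_head grabbed, *pos;

		INIT_XLIST_HEAD(&grabbed);
		xlist_splice(&my_list, &grabbed);	/* atomically take everything */

		while ((pos = xlist_del_head_fast(&grabbed))) {
			struct my_item *item = container_of(pos, struct my_item, node);
			/* process item->value ... */
		}
	}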