Commit 50932a22 authored by Andreas Dilger, committed by Greg Kroah-Hartman

staging: lustre: ptlrpc: quiet errors on initial connection

It may be that a client or MDS is trying to connect to a target (OST
or peer MDT) before that target is finished setup. Rather than
spamming the console logs during initial connection, only print a
console error message if there are repeated failures trying to
connect to the target, which may indicate an error on that node.
Signed-off-by: Andreas Dilger <andreas.dilger@intel.com>
Signed-off-by: Bobi Jam <bobijam.xu@intel.com>
Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-3456
Reviewed-on: http://review.whamcloud.com/10057
Reviewed-by: Bobi Jam <bobijam@gmail.com>
Reviewed-by: Bob Glossman <bob.glossman@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
Signed-off-by: James Simmons <jsimmons@infradead.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
parent 08fd0346
...@@ -1075,36 +1075,42 @@ static int ptlrpc_import_delay_req(struct obd_import *imp, ...@@ -1075,36 +1075,42 @@ static int ptlrpc_import_delay_req(struct obd_import *imp,
} }
/** /**
* Decide if the error message regarding provided request \a req * Decide if the error message should be printed to the console or not.
* should be printed to the console or not. * Makes its decision based on request type, status, and failure frequency.
* Makes it's decision on request status and other properties. *
* Returns 1 to print error on the system console or 0 if not. * \param[in] req request that failed and may need a console message
*
* \retval false if no message should be printed
* \retval true if console message should be printed
*/ */
static int ptlrpc_console_allow(struct ptlrpc_request *req) static bool ptlrpc_console_allow(struct ptlrpc_request *req)
{ {
__u32 opc; __u32 opc;
int err;
LASSERT(req->rq_reqmsg); LASSERT(req->rq_reqmsg);
opc = lustre_msg_get_opc(req->rq_reqmsg); opc = lustre_msg_get_opc(req->rq_reqmsg);
/* /* Suppress particular reconnect errors which are to be expected. */
* Suppress particular reconnect errors which are to be expected. No if (opc == OST_CONNECT || opc == MDS_CONNECT || opc == MGS_CONNECT) {
* errors are suppressed for the initial connection on an import int err;
*/
if ((lustre_handle_is_used(&req->rq_import->imp_remote_handle)) &&
(opc == OST_CONNECT || opc == MDS_CONNECT || opc == MGS_CONNECT)) {
/* Suppress timed out reconnect requests */ /* Suppress timed out reconnect requests */
if (req->rq_timedout) if (lustre_handle_is_used(&req->rq_import->imp_remote_handle) ||
return 0; req->rq_timedout)
return false;
/* Suppress unavailable/again reconnect requests */ /*
* Suppress most unavailable/again reconnect requests, but
* print occasionally so it is clear client is trying to
* connect to a server where no target is running.
*/
err = lustre_msg_get_status(req->rq_repmsg); err = lustre_msg_get_status(req->rq_repmsg);
if (err == -ENODEV || err == -EAGAIN) if ((err == -ENODEV || err == -EAGAIN) &&
return 0; req->rq_import->imp_conn_cnt % 30 != 20)
return false;
} }
return 1; return true;
} }
/** /**
...@@ -1118,14 +1124,14 @@ static int ptlrpc_check_status(struct ptlrpc_request *req) ...@@ -1118,14 +1124,14 @@ static int ptlrpc_check_status(struct ptlrpc_request *req)
err = lustre_msg_get_status(req->rq_repmsg); err = lustre_msg_get_status(req->rq_repmsg);
if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) { if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) {
struct obd_import *imp = req->rq_import; struct obd_import *imp = req->rq_import;
lnet_nid_t nid = imp->imp_connection->c_peer.nid;
__u32 opc = lustre_msg_get_opc(req->rq_reqmsg); __u32 opc = lustre_msg_get_opc(req->rq_reqmsg);
if (ptlrpc_console_allow(req)) if (ptlrpc_console_allow(req))
LCONSOLE_ERROR_MSG(0x011, "%s: Communicating with %s, operation %s failed with %d.\n", LCONSOLE_ERROR_MSG(0x011, "%s: operation %s to node %s failed: rc = %d\n",
imp->imp_obd->obd_name, imp->imp_obd->obd_name,
libcfs_nid2str( ll_opcode2str(opc),
imp->imp_connection->c_peer.nid), libcfs_nid2str(nid), err);
ll_opcode2str(opc), err);
return err < 0 ? err : -EINVAL; return err < 0 ? err : -EINVAL;
} }
...@@ -1282,7 +1288,7 @@ static int after_reply(struct ptlrpc_request *req) ...@@ -1282,7 +1288,7 @@ static int after_reply(struct ptlrpc_request *req)
* some reason. Try to reconnect, and if that fails, punt to * some reason. Try to reconnect, and if that fails, punt to
* the upcall. * the upcall.
*/ */
if (ll_rpc_recoverable_error(rc)) { if (ptlrpc_recoverable_error(rc)) {
if (req->rq_send_state != LUSTRE_IMP_FULL || if (req->rq_send_state != LUSTRE_IMP_FULL ||
imp->imp_obd->obd_no_recov || imp->imp_dlm_fake) { imp->imp_obd->obd_no_recov || imp->imp_dlm_fake) {
return rc; return rc;
......
...@@ -270,7 +270,7 @@ void sptlrpc_conf_fini(void); ...@@ -270,7 +270,7 @@ void sptlrpc_conf_fini(void);
int sptlrpc_init(void); int sptlrpc_init(void);
void sptlrpc_fini(void); void sptlrpc_fini(void);
static inline int ll_rpc_recoverable_error(int rc) static inline bool ptlrpc_recoverable_error(int rc)
{ {
return (rc == -ENOTCONN || rc == -ENODEV); return (rc == -ENOTCONN || rc == -ENODEV);
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment