Commit 55782138 authored by Li Zefan's avatar Li Zefan Committed by Steven Rostedt

tracing/events: convert block trace points to TRACE_EVENT()

TRACE_EVENT is a more generic way to define tracepoints. Doing so adds
these new capabilities to this tracepoint:

  - zero-copy and per-cpu splice() tracing
  - binary tracing without printf overhead
  - structured logging records exposed under /debug/tracing/events
  - trace events embedded in function tracer output and other plugins
  - user-defined, per tracepoint filter expressions
  ...

Cons:

  - no dev_t info for the output of plug, unplug_timer and unplug_io events.
    no dev_t info for getrq and sleeprq events if bio == NULL.
    no dev_t info for rq_abort,...,rq_requeue events if rq->rq_disk == NULL.

    This is mainly because we can't get the deivce from a request queue.
    But this may change in the future.

  - A packet command is converted to a string in TP_assign, not TP_print.
    While blktrace do the convertion just before output.

    Since pc requests should be rather rare, this is not a big issue.

  - In blktrace, an event can have 2 different print formats, but a TRACE_EVENT
    has a unique format, which means we have some unused data in a trace entry.

    The overhead is minimized by using __dynamic_array() instead of __array().

I've benchmarked the ioctl blktrace vs the splice based TRACE_EVENT tracing:

      dd                   dd + ioctl blktrace       dd + TRACE_EVENT (splice)
1     7.36s, 42.7 MB/s     7.50s, 42.0 MB/s          7.41s, 42.5 MB/s
2     7.43s, 42.3 MB/s     7.48s, 42.1 MB/s          7.43s, 42.4 MB/s
3     7.38s, 42.6 MB/s     7.45s, 42.2 MB/s          7.41s, 42.5 MB/s

So the overhead of tracing is very small, and no regression when using
those trace events vs blktrace.

And the binary output of TRACE_EVENT is much smaller than blktrace:

 # ls -l -h
 -rw-r--r-- 1 root root 8.8M 06-09 13:24 sda.blktrace.0
 -rw-r--r-- 1 root root 195K 06-09 13:24 sda.blktrace.1
 -rw-r--r-- 1 root root 2.7M 06-09 13:25 trace_splice.out

Following are some comparisons between TRACE_EVENT and blktrace:

plug:
  kjournald-480   [000]   303.084981: block_plug: [kjournald]
  kjournald-480   [000]   303.084981:   8,0    P   N [kjournald]

unplug_io:
  kblockd/0-118   [000]   300.052973: block_unplug_io: [kblockd/0] 1
  kblockd/0-118   [000]   300.052974:   8,0    U   N [kblockd/0] 1

remap:
  kjournald-480   [000]   303.085042: block_remap: 8,0 W 102736992 + 8 <- (8,8) 33384
  kjournald-480   [000]   303.085043:   8,0    A   W 102736992 + 8 <- (8,8) 33384

bio_backmerge:
  kjournald-480   [000]   303.085086: block_bio_backmerge: 8,0 W 102737032 + 8 [kjournald]
  kjournald-480   [000]   303.085086:   8,0    M   W 102737032 + 8 [kjournald]

getrq:
  kjournald-480   [000]   303.084974: block_getrq: 8,0 W 102736984 + 8 [kjournald]
  kjournald-480   [000]   303.084975:   8,0    G   W 102736984 + 8 [kjournald]

  bash-2066  [001]  1072.953770:   8,0    G   N [bash]
  bash-2066  [001]  1072.953773: block_getrq: 0,0 N 0 + 0 [bash]

rq_complete:
  konsole-2065  [001]   300.053184: block_rq_complete: 8,0 W () 103669040 + 16 [0]
  konsole-2065  [001]   300.053191:   8,0    C   W 103669040 + 16 [0]

  ksoftirqd/1-7   [001]  1072.953811:   8,0    C   N (5a 00 08 00 00 00 00 00 24 00) [0]
  ksoftirqd/1-7   [001]  1072.953813: block_rq_complete: 0,0 N (5a 00 08 00 00 00 00 00 24 00) 0 + 0 [0]

rq_insert:
  kjournald-480   [000]   303.084985: block_rq_insert: 8,0 W 0 () 102736984 + 8 [kjournald]
  kjournald-480   [000]   303.084986:   8,0    I   W 102736984 + 8 [kjournald]

Changelog from v2 -> v3:

- use the newly introduced __dynamic_array().

Changelog from v1 -> v2:

- use __string() instead of __array() to minimize the memory required
  to store hex dump of rq->cmd().

- support large pc requests.

- add missing blk_fill_rwbs_rq() in block_rq_requeue TRACE_EVENT.

- some cleanups.
Signed-off-by: default avatarLi Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4A2DF669.5070905@cn.fujitsu.com>
Signed-off-by: default avatarSteven Rostedt <rostedt@goodmis.org>
parent f57a8a19
......@@ -28,22 +28,14 @@
#include <linux/task_io_accounting_ops.h>
#include <linux/blktrace_api.h>
#include <linux/fault-inject.h>
#include <trace/block.h>
#define CREATE_TRACE_POINTS
#include <trace/events/block.h>
#include "blk.h"
DEFINE_TRACE(block_plug);
DEFINE_TRACE(block_unplug_io);
DEFINE_TRACE(block_unplug_timer);
DEFINE_TRACE(block_getrq);
DEFINE_TRACE(block_sleeprq);
DEFINE_TRACE(block_rq_requeue);
DEFINE_TRACE(block_bio_backmerge);
DEFINE_TRACE(block_bio_frontmerge);
DEFINE_TRACE(block_bio_queue);
DEFINE_TRACE(block_rq_complete);
DEFINE_TRACE(block_remap); /* Also used in drivers/md/dm.c */
EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
static int __make_request(struct request_queue *q, struct bio *bio);
......
......@@ -33,17 +33,16 @@
#include <linux/compiler.h>
#include <linux/delay.h>
#include <linux/blktrace_api.h>
#include <trace/block.h>
#include <linux/hash.h>
#include <linux/uaccess.h>
#include <trace/events/block.h>
#include "blk.h"
static DEFINE_SPINLOCK(elv_list_lock);
static LIST_HEAD(elv_list);
DEFINE_TRACE(block_rq_abort);
/*
* Merge hash stuff.
*/
......@@ -55,9 +54,6 @@ static const int elv_hash_shift = 6;
#define rq_hash_key(rq) ((rq)->sector + (rq)->nr_sectors)
#define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash))
DEFINE_TRACE(block_rq_insert);
DEFINE_TRACE(block_rq_issue);
/*
* Query io scheduler to see if the current process issuing bio may be
* merged with rq.
......
......@@ -20,7 +20,8 @@
#include <linux/idr.h>
#include <linux/hdreg.h>
#include <linux/blktrace_api.h>
#include <trace/block.h>
#include <trace/events/block.h>
#define DM_MSG_PREFIX "core"
......@@ -53,8 +54,6 @@ struct dm_target_io {
union map_info info;
};
DEFINE_TRACE(block_bio_complete);
/*
* For request-based dm.
* One of these is allocated per request.
......
......@@ -26,10 +26,9 @@
#include <linux/mempool.h>
#include <linux/workqueue.h>
#include <linux/blktrace_api.h>
#include <trace/block.h>
#include <scsi/sg.h> /* for struct sg_iovec */
DEFINE_TRACE(block_split);
#include <trace/events/block.h>
/*
* Test patch to inline a certain number of bi_io_vec's inside the bio
......
......@@ -218,5 +218,18 @@ static inline int blk_trace_init_sysfs(struct device *dev)
#endif /* CONFIG_BLK_DEV_IO_TRACE */
#ifdef CONFIG_EVENT_TRACING
static inline int blk_cmd_buf_len(struct request *rq)
{
return blk_pc_request(rq) ? rq->cmd_len * 3 : 1;
}
extern void blk_dump_cmd(char *buf, struct request *rq);
extern void blk_fill_rwbs(char *rwbs, u32 rw, int bytes);
extern void blk_fill_rwbs_rq(char *rwbs, struct request *rq);
#endif /* CONFIG_EVENT_TRACING */
#endif /* __KERNEL__ */
#endif
#ifndef _TRACE_BLOCK_H
#define _TRACE_BLOCK_H
#include <linux/blkdev.h>
#include <linux/tracepoint.h>
DECLARE_TRACE(block_rq_abort,
TP_PROTO(struct request_queue *q, struct request *rq),
TP_ARGS(q, rq));
DECLARE_TRACE(block_rq_insert,
TP_PROTO(struct request_queue *q, struct request *rq),
TP_ARGS(q, rq));
DECLARE_TRACE(block_rq_issue,
TP_PROTO(struct request_queue *q, struct request *rq),
TP_ARGS(q, rq));
DECLARE_TRACE(block_rq_requeue,
TP_PROTO(struct request_queue *q, struct request *rq),
TP_ARGS(q, rq));
DECLARE_TRACE(block_rq_complete,
TP_PROTO(struct request_queue *q, struct request *rq),
TP_ARGS(q, rq));
DECLARE_TRACE(block_bio_bounce,
TP_PROTO(struct request_queue *q, struct bio *bio),
TP_ARGS(q, bio));
DECLARE_TRACE(block_bio_complete,
TP_PROTO(struct request_queue *q, struct bio *bio),
TP_ARGS(q, bio));
DECLARE_TRACE(block_bio_backmerge,
TP_PROTO(struct request_queue *q, struct bio *bio),
TP_ARGS(q, bio));
DECLARE_TRACE(block_bio_frontmerge,
TP_PROTO(struct request_queue *q, struct bio *bio),
TP_ARGS(q, bio));
DECLARE_TRACE(block_bio_queue,
TP_PROTO(struct request_queue *q, struct bio *bio),
TP_ARGS(q, bio));
DECLARE_TRACE(block_getrq,
TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
TP_ARGS(q, bio, rw));
DECLARE_TRACE(block_sleeprq,
TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
TP_ARGS(q, bio, rw));
DECLARE_TRACE(block_plug,
TP_PROTO(struct request_queue *q),
TP_ARGS(q));
DECLARE_TRACE(block_unplug_timer,
TP_PROTO(struct request_queue *q),
TP_ARGS(q));
DECLARE_TRACE(block_unplug_io,
TP_PROTO(struct request_queue *q),
TP_ARGS(q));
DECLARE_TRACE(block_split,
TP_PROTO(struct request_queue *q, struct bio *bio, unsigned int pdu),
TP_ARGS(q, bio, pdu));
DECLARE_TRACE(block_remap,
TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev,
sector_t from),
TP_ARGS(q, bio, dev, from));
#endif
This diff is collapsed.
......@@ -45,7 +45,10 @@ obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
obj-$(CONFIG_POWER_TRACER) += trace_power.o
obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
ifeq ($(CONFIG_BLOCK),y)
obj-$(CONFIG_EVENT_TRACING) += blktrace.o
endif
obj-$(CONFIG_EVENT_TRACING) += trace_events.o
obj-$(CONFIG_EVENT_TRACING) += trace_export.o
obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
......
......@@ -23,10 +23,14 @@
#include <linux/mutex.h>
#include <linux/debugfs.h>
#include <linux/time.h>
#include <trace/block.h>
#include <linux/uaccess.h>
#include <trace/events/block.h>
#include "trace_output.h"
#ifdef CONFIG_BLK_DEV_IO_TRACE
static unsigned int blktrace_seq __read_mostly = 1;
static struct trace_array *blk_tr;
......@@ -1658,3 +1662,75 @@ int blk_trace_init_sysfs(struct device *dev)
return sysfs_create_group(&dev->kobj, &blk_trace_attr_group);
}
#endif /* CONFIG_BLK_DEV_IO_TRACE */
#ifdef CONFIG_EVENT_TRACING
void blk_dump_cmd(char *buf, struct request *rq)
{
int i, end;
int len = rq->cmd_len;
unsigned char *cmd = rq->cmd;
if (!blk_pc_request(rq)) {
buf[0] = '\0';
return;
}
for (end = len - 1; end >= 0; end--)
if (cmd[end])
break;
end++;
for (i = 0; i < len; i++) {
buf += sprintf(buf, "%s%02x", i == 0 ? "" : " ", cmd[i]);
if (i == end && end != len - 1) {
sprintf(buf, " ..");
break;
}
}
}
void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
{
int i = 0;
if (rw & WRITE)
rwbs[i++] = 'W';
else if (rw & 1 << BIO_RW_DISCARD)
rwbs[i++] = 'D';
else if (bytes)
rwbs[i++] = 'R';
else
rwbs[i++] = 'N';
if (rw & 1 << BIO_RW_AHEAD)
rwbs[i++] = 'A';
if (rw & 1 << BIO_RW_BARRIER)
rwbs[i++] = 'B';
if (rw & 1 << BIO_RW_SYNCIO)
rwbs[i++] = 'S';
if (rw & 1 << BIO_RW_META)
rwbs[i++] = 'M';
rwbs[i] = '\0';
}
void blk_fill_rwbs_rq(char *rwbs, struct request *rq)
{
int rw = rq->cmd_flags & 0x03;
int bytes;
if (blk_discard_rq(rq))
rw |= (1 << BIO_RW_DISCARD);
if (blk_pc_request(rq))
bytes = rq->data_len;
else
bytes = rq->hard_nr_sectors << 9;
blk_fill_rwbs(rwbs, rw, bytes);
}
#endif /* CONFIG_EVENT_TRACING */
......@@ -14,16 +14,15 @@
#include <linux/hash.h>
#include <linux/highmem.h>
#include <linux/blktrace_api.h>
#include <trace/block.h>
#include <asm/tlbflush.h>
#include <trace/events/block.h>
#define POOL_SIZE 64
#define ISA_POOL_SIZE 16
static mempool_t *page_pool, *isa_page_pool;
DEFINE_TRACE(block_bio_bounce);
#ifdef CONFIG_HIGHMEM
static __init int init_emergency_pool(void)
{
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment