Commit f654c792 authored by yonghong-song's avatar yonghong-song Committed by GitHub

Merge pull request #1518 from liu-song-6/perf_event_open_v02

Try use new API to create [k,u]probe with perf_event_open
parents 3558d5a8 0e9ed200
...@@ -536,33 +536,99 @@ int bpf_attach_socket(int sock, int prog) { ...@@ -536,33 +536,99 @@ int bpf_attach_socket(int sock, int prog) {
return setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog, sizeof(prog)); return setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog, sizeof(prog));
} }
static int bpf_attach_tracing_event(int progfd, const char *event_path, #define PMU_TYPE_FILE "/sys/bus/event_source/devices/%s/type"
struct perf_reader *reader, int pid) { static int bpf_find_probe_type(const char *event_type)
int efd, pfd, cpu = 0; {
ssize_t bytes; int fd;
int ret;
char buf[PATH_MAX]; char buf[PATH_MAX];
struct perf_event_attr attr = {};
snprintf(buf, sizeof(buf), "%s/id", event_path); ret = snprintf(buf, sizeof(buf), PMU_TYPE_FILE, event_type);
efd = open(buf, O_RDONLY, 0); if (ret < 0 || ret >= sizeof(buf))
if (efd < 0) {
fprintf(stderr, "open(%s): %s\n", buf, strerror(errno));
return -1; return -1;
}
bytes = read(efd, buf, sizeof(buf)); fd = open(buf, O_RDONLY);
if (bytes <= 0 || bytes >= sizeof(buf)) { if (fd < 0)
fprintf(stderr, "read(%s): %s\n", buf, strerror(errno)); return -1;
close(efd); ret = read(fd, buf, sizeof(buf));
close(fd);
if (ret < 0 || ret >= sizeof(buf))
return -1;
errno = 0;
ret = (int)strtol(buf, NULL, 10);
return errno ? -1 : ret;
}
#define PMU_RETPROBE_FILE "/sys/bus/event_source/devices/%s/format/retprobe"
static int bpf_get_retprobe_bit(const char *event_type)
{
int fd;
int ret;
char buf[PATH_MAX];
ret = snprintf(buf, sizeof(buf), PMU_RETPROBE_FILE, event_type);
if (ret < 0 || ret >= sizeof(buf))
return -1;
fd = open(buf, O_RDONLY);
if (fd < 0)
return -1;
ret = read(fd, buf, sizeof(buf));
close(fd);
if (ret < 0 || ret >= sizeof(buf))
return -1;
if (strlen(buf) < strlen("config:"))
return -1;
errno = 0;
ret = (int)strtol(buf + strlen("config:"), NULL, 10);
return errno ? -1 : ret;
}
/*
* new kernel API allows creating [k,u]probe with perf_event_open, which
* makes it easier to clean up the [k,u]probe. This function tries to
* create pfd with the new API.
*/
static int bpf_try_perf_event_open_with_probe(const char *name, uint64_t offs,
int pid, char *event_type, int is_return)
{
struct perf_event_attr attr = {};
int type = bpf_find_probe_type(event_type);
int is_return_bit = bpf_get_retprobe_bit(event_type);
int cpu = 0;
if (type < 0 || is_return_bit < 0)
return -1; return -1;
}
close(efd);
buf[bytes] = '\0';
attr.config = strtol(buf, NULL, 0);
attr.type = PERF_TYPE_TRACEPOINT;
attr.sample_type = PERF_SAMPLE_RAW | PERF_SAMPLE_CALLCHAIN; attr.sample_type = PERF_SAMPLE_RAW | PERF_SAMPLE_CALLCHAIN;
attr.sample_period = 1; attr.sample_period = 1;
attr.wakeup_events = 1; attr.wakeup_events = 1;
if (is_return)
attr.config |= 1 << is_return_bit;
/*
* struct perf_event_attr in latest perf_event.h has the following
* extension to config1 and config2. To keep bcc compatibe with
* older perf_event.h, we use config1 and config2 here instead of
* kprobe_func, uprobe_path, kprobe_addr, and probe_offset.
*
* union {
* __u64 bp_addr;
* __u64 kprobe_func;
* __u64 uprobe_path;
* __u64 config1;
* };
* union {
* __u64 bp_len;
* __u64 kprobe_addr;
* __u64 probe_offset;
* __u64 config2;
* };
*/
attr.config2 = offs; /* config2 here is kprobe_addr or probe_offset */
attr.size = sizeof(attr);
attr.type = type;
/* config1 here is kprobe_func or uprobe_path */
attr.config1 = ptr_to_u64((void *)name);
// PID filter is only possible for uprobe events. // PID filter is only possible for uprobe events.
if (pid < 0) if (pid < 0)
pid = -1; pid = -1;
...@@ -571,10 +637,53 @@ static int bpf_attach_tracing_event(int progfd, const char *event_path, ...@@ -571,10 +637,53 @@ static int bpf_attach_tracing_event(int progfd, const char *event_path,
// Tracing events do not do CPU filtering in any cases. // Tracing events do not do CPU filtering in any cases.
if (pid != -1) if (pid != -1)
cpu = -1; cpu = -1;
pfd = syscall(__NR_perf_event_open, &attr, pid, cpu, -1 /* group_fd */, PERF_FLAG_FD_CLOEXEC); return syscall(__NR_perf_event_open, &attr, pid, cpu, -1 /* group_fd */,
PERF_FLAG_FD_CLOEXEC);
}
static int bpf_attach_tracing_event(int progfd, const char *event_path,
struct perf_reader *reader, int pid,
int pfd)
{
int efd, cpu = 0;
ssize_t bytes;
char buf[PATH_MAX];
struct perf_event_attr attr = {};
if (pfd < 0) { if (pfd < 0) {
fprintf(stderr, "perf_event_open(%s/id): %s\n", event_path, strerror(errno)); snprintf(buf, sizeof(buf), "%s/id", event_path);
return -1; efd = open(buf, O_RDONLY, 0);
if (efd < 0) {
fprintf(stderr, "open(%s): %s\n", buf, strerror(errno));
return -1;
}
bytes = read(efd, buf, sizeof(buf));
if (bytes <= 0 || bytes >= sizeof(buf)) {
fprintf(stderr, "read(%s): %s\n", buf, strerror(errno));
close(efd);
return -1;
}
close(efd);
buf[bytes] = '\0';
attr.config = strtol(buf, NULL, 0);
attr.type = PERF_TYPE_TRACEPOINT;
attr.sample_type = PERF_SAMPLE_RAW | PERF_SAMPLE_CALLCHAIN;
attr.sample_period = 1;
attr.wakeup_events = 1;
// PID filter is only possible for uprobe events.
if (pid < 0)
pid = -1;
// perf_event_open API doesn't allow both pid and cpu to be -1.
// So only set it to -1 when PID is not -1.
// Tracing events do not do CPU filtering in any cases.
if (pid != -1)
cpu = -1;
pfd = syscall(__NR_perf_event_open, &attr, pid, cpu, -1 /* group_fd */, PERF_FLAG_FD_CLOEXEC);
if (pfd < 0) {
fprintf(stderr, "perf_event_open(%s/id): %s\n", event_path, strerror(errno));
return -1;
}
} }
perf_reader_set_fd(reader, pfd); perf_reader_set_fd(reader, pfd);
...@@ -598,6 +707,7 @@ void *bpf_attach_kprobe(int progfd, enum bpf_probe_attach_type attach_type, ...@@ -598,6 +707,7 @@ void *bpf_attach_kprobe(int progfd, enum bpf_probe_attach_type attach_type,
perf_reader_cb cb, void *cb_cookie) perf_reader_cb cb, void *cb_cookie)
{ {
int kfd; int kfd;
int pfd;
char buf[256]; char buf[256];
char event_alias[128]; char event_alias[128];
struct perf_reader *reader = NULL; struct perf_reader *reader = NULL;
...@@ -607,26 +717,30 @@ void *bpf_attach_kprobe(int progfd, enum bpf_probe_attach_type attach_type, ...@@ -607,26 +717,30 @@ void *bpf_attach_kprobe(int progfd, enum bpf_probe_attach_type attach_type,
if (!reader) if (!reader)
goto error; goto error;
snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events", event_type); pfd = bpf_try_perf_event_open_with_probe(fn_name, 0, -1, event_type,
kfd = open(buf, O_WRONLY | O_APPEND, 0); attach_type != BPF_PROBE_ENTRY);
if (kfd < 0) {
fprintf(stderr, "open(%s): %s\n", buf, strerror(errno));
goto error;
}
snprintf(event_alias, sizeof(event_alias), "%s_bcc_%d", ev_name, getpid()); if (pfd < 0) {
snprintf(buf, sizeof(buf), "%c:%ss/%s %s", attach_type==BPF_PROBE_ENTRY ? 'p' : 'r', snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events", event_type);
event_type, event_alias, fn_name); kfd = open(buf, O_WRONLY | O_APPEND, 0);
if (write(kfd, buf, strlen(buf)) < 0) { if (kfd < 0) {
if (errno == EINVAL) fprintf(stderr, "open(%s): %s\n", buf, strerror(errno));
fprintf(stderr, "check dmesg output for possible cause\n"); goto error;
}
snprintf(event_alias, sizeof(event_alias), "%s_bcc_%d", ev_name, getpid());
snprintf(buf, sizeof(buf), "%c:%ss/%s %s", attach_type==BPF_PROBE_ENTRY ? 'p' : 'r',
event_type, event_alias, fn_name);
if (write(kfd, buf, strlen(buf)) < 0) {
if (errno == EINVAL)
fprintf(stderr, "check dmesg output for possible cause\n");
close(kfd);
goto error;
}
close(kfd); close(kfd);
goto error; snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/events/%ss/%s", event_type, event_alias);
} }
close(kfd); if (bpf_attach_tracing_event(progfd, buf, reader, -1 /* PID */, pfd) < 0)
snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/events/%ss/%s", event_type, event_alias);
if (bpf_attach_tracing_event(progfd, buf, reader, -1 /* PID */) < 0)
goto error; goto error;
return reader; return reader;
...@@ -708,42 +822,47 @@ void *bpf_attach_uprobe(int progfd, enum bpf_probe_attach_type attach_type, ...@@ -708,42 +822,47 @@ void *bpf_attach_uprobe(int progfd, enum bpf_probe_attach_type attach_type,
struct perf_reader *reader = NULL; struct perf_reader *reader = NULL;
static char *event_type = "uprobe"; static char *event_type = "uprobe";
int res, kfd = -1, ns_fd = -1; int res, kfd = -1, ns_fd = -1;
int pfd;
reader = perf_reader_new(cb, NULL, NULL, cb_cookie, probe_perf_reader_page_cnt); reader = perf_reader_new(cb, NULL, NULL, cb_cookie, probe_perf_reader_page_cnt);
if (!reader) if (!reader)
goto error; goto error;
snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events", event_type); pfd = bpf_try_perf_event_open_with_probe(binary_path, offset, pid, event_type,
kfd = open(buf, O_WRONLY | O_APPEND, 0); attach_type != BPF_PROBE_ENTRY);
if (kfd < 0) { if (pfd < 0) {
fprintf(stderr, "open(%s): %s\n", buf, strerror(errno)); snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events", event_type);
goto error; kfd = open(buf, O_WRONLY | O_APPEND, 0);
} if (kfd < 0) {
fprintf(stderr, "open(%s): %s\n", buf, strerror(errno));
goto error;
}
res = snprintf(event_alias, sizeof(event_alias), "%s_bcc_%d", ev_name, getpid()); res = snprintf(event_alias, sizeof(event_alias), "%s_bcc_%d", ev_name, getpid());
if (res < 0 || res >= sizeof(event_alias)) { if (res < 0 || res >= sizeof(event_alias)) {
fprintf(stderr, "Event name (%s) is too long for buffer\n", ev_name); fprintf(stderr, "Event name (%s) is too long for buffer\n", ev_name);
goto error; goto error;
} }
res = snprintf(buf, sizeof(buf), "%c:%ss/%s %s:0x%lx", attach_type==BPF_PROBE_ENTRY ? 'p' : 'r', res = snprintf(buf, sizeof(buf), "%c:%ss/%s %s:0x%lx", attach_type==BPF_PROBE_ENTRY ? 'p' : 'r',
event_type, event_alias, binary_path, offset); event_type, event_alias, binary_path, offset);
if (res < 0 || res >= sizeof(buf)) { if (res < 0 || res >= sizeof(buf)) {
fprintf(stderr, "Event alias (%s) too long for buffer\n", event_alias); fprintf(stderr, "Event alias (%s) too long for buffer\n", event_alias);
goto error; goto error;
} }
ns_fd = enter_mount_ns(pid); ns_fd = enter_mount_ns(pid);
if (write(kfd, buf, strlen(buf)) < 0) { if (write(kfd, buf, strlen(buf)) < 0) {
if (errno == EINVAL) if (errno == EINVAL)
fprintf(stderr, "check dmesg output for possible cause\n"); fprintf(stderr, "check dmesg output for possible cause\n");
goto error; goto error;
} }
close(kfd); close(kfd);
exit_mount_ns(ns_fd); exit_mount_ns(ns_fd);
ns_fd = -1; ns_fd = -1;
snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/events/%ss/%s", event_type, event_alias); snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/events/%ss/%s", event_type, event_alias);
if (bpf_attach_tracing_event(progfd, buf, reader, pid) < 0) }
if (bpf_attach_tracing_event(progfd, buf, reader, pid, pfd) < 0)
goto error; goto error;
return reader; return reader;
...@@ -758,8 +877,43 @@ error: ...@@ -758,8 +877,43 @@ error:
static int bpf_detach_probe(const char *ev_name, const char *event_type) static int bpf_detach_probe(const char *ev_name, const char *event_type)
{ {
int kfd, res; int kfd = -1, res;
char buf[PATH_MAX]; char buf[PATH_MAX];
int found_event = 0;
size_t bufsize = 0;
char *cptr = NULL;
FILE *fp;
/*
* For [k,u]probe created with perf_event_open (on newer kernel), it is
* not necessary to clean it up in [k,u]probe_events. We first look up
* the %s_bcc_%d line in [k,u]probe_events. If the event is not found,
* it is safe to skip the cleaning up process (write -:... to the file).
*/
snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events", event_type);
fp = fopen(buf, "r");
if (!fp) {
fprintf(stderr, "open(%s): %s\n", buf, strerror(errno));
goto error;
}
res = snprintf(buf, sizeof(buf), "%ss/%s_bcc_%d", event_type, ev_name, getpid());
if (res < 0 || res >= sizeof(buf)) {
fprintf(stderr, "snprintf(%s): %d\n", ev_name, res);
goto error;
}
while (getline(&cptr, &bufsize, fp) != -1)
if (strstr(cptr, buf) != NULL) {
found_event = 1;
break;
}
fclose(fp);
fp = NULL;
if (!found_event)
return 0;
snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events", event_type); snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events", event_type);
kfd = open(buf, O_WRONLY | O_APPEND, 0); kfd = open(buf, O_WRONLY | O_APPEND, 0);
if (kfd < 0) { if (kfd < 0) {
...@@ -783,6 +937,8 @@ static int bpf_detach_probe(const char *ev_name, const char *event_type) ...@@ -783,6 +937,8 @@ static int bpf_detach_probe(const char *ev_name, const char *event_type)
error: error:
if (kfd >= 0) if (kfd >= 0)
close(kfd); close(kfd);
if (fp)
fclose(fp);
return -1; return -1;
} }
...@@ -809,7 +965,7 @@ void *bpf_attach_tracepoint(int progfd, const char *tp_category, ...@@ -809,7 +965,7 @@ void *bpf_attach_tracepoint(int progfd, const char *tp_category,
snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/events/%s/%s", snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/events/%s/%s",
tp_category, tp_name); tp_category, tp_name);
if (bpf_attach_tracing_event(progfd, buf, reader, -1 /* PID */) < 0) if (bpf_attach_tracing_event(progfd, buf, reader, -1 /* PID */, -1 /* pfd */) < 0)
goto error; goto error;
return reader; return reader;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment