Commit e822a818 authored by Omar Sandoval, committed by 4ast

Add new mountsnoop tool (#750)

Filesystem mounting and unmounting affects an entire system, so this is
a great candidate for system-wide tracing. mountsnoop.py watches all
mounts and unmounts and is also mount namespace-aware, which is a
requirement for working with containers.
Signed-off-by: Omar Sandoval <osandov@fb.com>
parent accd4cf5
.TH mountsnoop 8 "2016-10-14" "USER COMMANDS"
.SH NAME
mountsnoop \- Trace mount() and umount() syscalls. Uses Linux eBPF/bcc.
.SH SYNOPSIS
.B mountsnoop
.SH DESCRIPTION
mountsnoop traces the mount() and umount() syscalls, showing which processes
are mounting and unmounting filesystems in what mount namespaces. This can be
useful for troubleshooting system and container setup.
This works by tracing the kernel sys_mount() and sys_umount() functions using
dynamic tracing, and will need updating to match any changes to these functions.
This makes use of a Linux 4.4 feature (bpf_perf_event_output()) and a Linux 4.8
feature (bpf_get_current_task()).
Since this uses BPF, only the root user can use this tool.
.SH REQUIREMENTS
CONFIG_BPF and bcc.
.SH FIELDS
.TP
COMM
Process name
.TP
PID
Process ID
.TP
TID
Thread ID
.TP
MNT_NS
Mount namespace inode number
.TP
CALL
System call, arguments, and return value
.SH OVERHEAD
This traces the kernel mount and umount functions and prints output for each
event. As the rate of these calls is generally expected to be very low, the
overhead is also expected to be negligible. If your system calls mount() and
umount() at a high rate, then test and understand overhead before use.
.SH SOURCE
This is from bcc.
.IP
https://github.com/iovisor/bcc
.PP
Also look in the bcc distribution for a companion _examples.txt file containing
example usage, output, and commentary for this tool.
.SH OS
Linux
.SH STABILITY
Unstable - in development.
.SH AUTHOR
Omar Sandoval
.SH SEE ALSO
mount(2)
umount(2)
#!/usr/bin/env python
#
# mountsnoop Trace mount() and umount() syscalls.
# For Linux, uses BCC, eBPF. Embedded C.
#
# USAGE: mountsnoop [-h]
#
# Copyright (c) 2016 Facebook, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 14-Oct-2016 Omar Sandoval Created this.
from __future__ import print_function
import argparse
import bcc
import ctypes
import errno
import functools
import sys
# C source of the BPF program; main() hands this string to bcc.BPF(text=...).
# It defines a kprobe and a kretprobe for each of sys_mount and sys_umount,
# and every probe reports through the "events" perf output as struct data_t
# records. One syscall is split across several records (enter, one per string
# argument, then return), each tagged with an enum event_type value.
#
# Keep in sync with the Python side of this file: the enum event_type order
# must match EventType, MAX_STR_LEN must match the Python MAX_STR_LEN, and
# struct data_t must match the ctypes Event/DataUnion/EnterData layout.
bpf_text = r"""
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>
#include <linux/nsproxy.h>
#include <linux/ns_common.h>
/*
* XXX: struct mnt_namespace is defined in fs/mount.h, which is private to the
* VFS and not installed in any kernel-devel packages. So, let's duplicate the
* important part of the definition. There are actually more members in the
* real struct, but we don't need them, and they're more likely to change.
*/
struct mnt_namespace {
atomic_t count;
struct ns_common ns;
};
/*
* XXX: this could really use first-class string support in BPF. target is a
* NUL-terminated path up to PATH_MAX in length. source and type are
* NUL-terminated strings up to PAGE_SIZE in length. data is a weird case: it's
* almost always a NUL-terminated string, but for some filesystems (e.g., older
* NFS variants), it's a binary structure with plenty of NUL bytes, so the
* kernel always copies up to PAGE_SIZE bytes, stopping when it hits a fault.
*
* The best we can do with the existing BPF helpers is to copy as much of each
* argument as we can. Our stack space is limited, and we need to leave some
* headroom for the rest of the function, so this should be a decent value.
*/
#define MAX_STR_LEN 412
enum event_type {
EVENT_MOUNT,
EVENT_MOUNT_SOURCE,
EVENT_MOUNT_TARGET,
EVENT_MOUNT_TYPE,
EVENT_MOUNT_DATA,
EVENT_MOUNT_RET,
EVENT_UMOUNT,
EVENT_UMOUNT_TARGET,
EVENT_UMOUNT_RET,
};
struct data_t {
enum event_type type;
pid_t pid, tgid;
union {
/* EVENT_MOUNT, EVENT_UMOUNT */
struct {
/* current->nsproxy->mnt_ns->ns.inum */
unsigned int mnt_ns;
char comm[TASK_COMM_LEN];
unsigned long flags;
} enter;
/*
* EVENT_MOUNT_SOURCE, EVENT_MOUNT_TARGET, EVENT_MOUNT_TYPE,
* EVENT_MOUNT_DATA, EVENT_UMOUNT_TARGET
*/
char str[MAX_STR_LEN];
/* EVENT_MOUNT_RET, EVENT_UMOUNT_RET */
int retval;
};
};
BPF_PERF_OUTPUT(events);
int kprobe__sys_mount(struct pt_regs *ctx, char __user *source,
char __user *target, char __user *type,
unsigned long flags)
{
/* sys_mount takes too many arguments */
char __user *data = (char __user *)PT_REGS_PARM5(ctx);
struct data_t event = {};
struct task_struct *task;
struct nsproxy *nsproxy;
struct mnt_namespace *mnt_ns;
event.pid = bpf_get_current_pid_tgid() & 0xffffffff;
event.tgid = bpf_get_current_pid_tgid() >> 32;
event.type = EVENT_MOUNT;
bpf_get_current_comm(event.enter.comm, sizeof(event.enter.comm));
event.enter.flags = flags;
task = (struct task_struct *)bpf_get_current_task();
bpf_probe_read(&nsproxy, sizeof(nsproxy), &task->nsproxy);
bpf_probe_read(&mnt_ns, sizeof(mnt_ns), &nsproxy->mnt_ns);
bpf_probe_read(&event.enter.mnt_ns, sizeof(event.enter.mnt_ns),
&mnt_ns->ns.inum);
events.perf_submit(ctx, &event, sizeof(event));
event.type = EVENT_MOUNT_SOURCE;
memset(event.str, 0, sizeof(event.str));
bpf_probe_read(event.str, sizeof(event.str), source);
events.perf_submit(ctx, &event, sizeof(event));
event.type = EVENT_MOUNT_TARGET;
memset(event.str, 0, sizeof(event.str));
bpf_probe_read(event.str, sizeof(event.str), target);
events.perf_submit(ctx, &event, sizeof(event));
event.type = EVENT_MOUNT_TYPE;
memset(event.str, 0, sizeof(event.str));
bpf_probe_read(event.str, sizeof(event.str), type);
events.perf_submit(ctx, &event, sizeof(event));
event.type = EVENT_MOUNT_DATA;
memset(event.str, 0, sizeof(event.str));
bpf_probe_read(event.str, sizeof(event.str), data);
events.perf_submit(ctx, &event, sizeof(event));
return 0;
}
int kretprobe__sys_mount(struct pt_regs *ctx)
{
struct data_t event = {};
event.type = EVENT_MOUNT_RET;
event.pid = bpf_get_current_pid_tgid() & 0xffffffff;
event.tgid = bpf_get_current_pid_tgid() >> 32;
event.retval = PT_REGS_RC(ctx);
events.perf_submit(ctx, &event, sizeof(event));
return 0;
}
int kprobe__sys_umount(struct pt_regs *ctx, char __user *target, int flags)
{
struct data_t event = {};
struct task_struct *task;
struct nsproxy *nsproxy;
struct mnt_namespace *mnt_ns;
event.pid = bpf_get_current_pid_tgid() & 0xffffffff;
event.tgid = bpf_get_current_pid_tgid() >> 32;
event.type = EVENT_UMOUNT;
bpf_get_current_comm(event.enter.comm, sizeof(event.enter.comm));
event.enter.flags = flags;
task = (struct task_struct *)bpf_get_current_task();
bpf_probe_read(&nsproxy, sizeof(nsproxy), &task->nsproxy);
bpf_probe_read(&mnt_ns, sizeof(mnt_ns), &nsproxy->mnt_ns);
bpf_probe_read(&event.enter.mnt_ns, sizeof(event.enter.mnt_ns),
&mnt_ns->ns.inum);
events.perf_submit(ctx, &event, sizeof(event));
event.type = EVENT_UMOUNT_TARGET;
memset(event.str, 0, sizeof(event.str));
bpf_probe_read(event.str, sizeof(event.str), target);
events.perf_submit(ctx, &event, sizeof(event));
return 0;
}
int kretprobe__sys_umount(struct pt_regs *ctx)
{
struct data_t event = {};
event.type = EVENT_UMOUNT_RET;
event.pid = bpf_get_current_pid_tgid() & 0xffffffff;
event.tgid = bpf_get_current_pid_tgid() >> 32;
event.retval = PT_REGS_RC(ctx);
events.perf_submit(ctx, &event, sizeof(event));
return 0;
}
"""
# Values taken from sys/mount.h.
# MS_MGC_VAL is a magic number carried in the top 16 bits of the flags word;
# MS_MGC_MSK selects those bits.
MS_MGC_VAL = 0xc0ed0000
MS_MGC_MSK = 0xffff0000

# (name, bit) pairs in ascending bit order; the order is the order in which
# decoded flag names are printed.
MOUNT_FLAGS = [
    ('MS_RDONLY', 1 << 0),
    ('MS_NOSUID', 1 << 1),
    ('MS_NODEV', 1 << 2),
    ('MS_NOEXEC', 1 << 3),
    ('MS_SYNCHRONOUS', 1 << 4),
    ('MS_REMOUNT', 1 << 5),
    ('MS_MANDLOCK', 1 << 6),
    ('MS_DIRSYNC', 1 << 7),
    ('MS_NOATIME', 1 << 10),
    ('MS_NODIRATIME', 1 << 11),
    ('MS_BIND', 1 << 12),
    ('MS_MOVE', 1 << 13),
    ('MS_REC', 1 << 14),
    ('MS_SILENT', 1 << 15),
    ('MS_POSIXACL', 1 << 16),
    ('MS_UNBINDABLE', 1 << 17),
    ('MS_PRIVATE', 1 << 18),
    ('MS_SLAVE', 1 << 19),
    ('MS_SHARED', 1 << 20),
    ('MS_RELATIME', 1 << 21),
    ('MS_KERNMOUNT', 1 << 22),
    ('MS_I_VERSION', 1 << 23),
    ('MS_STRICTATIME', 1 << 24),
    ('MS_LAZYTIME', 1 << 25),
    ('MS_ACTIVE', 1 << 30),
    ('MS_NOUSER', 1 << 31),
]
# (name, bit) pairs used to decode the flags argument of umount, in
# ascending bit order.
UMOUNT_FLAGS = [
    ('MNT_FORCE', 1 << 0),
    ('MNT_DETACH', 1 << 1),
    ('MNT_EXPIRE', 1 << 2),
    ('UMOUNT_NOFOLLOW', 1 << 3),
]
# Array sizes used by the ctypes mirror of the C struct data_t.
# MAX_STR_LEN must equal the MAX_STR_LEN #define inside bpf_text, otherwise
# the Python-side string buffer no longer matches the C one.
TASK_COMM_LEN = 16 # linux/sched.h
MAX_STR_LEN = 412
class EventType(object):
    """Integer tags for perf events, numbered like the C enum event_type.

    The names are listed in the same order as the enum in bpf_text, so the
    values 0..8 line up with what the BPF program writes into the record.
    """

    (EVENT_MOUNT,
     EVENT_MOUNT_SOURCE,
     EVENT_MOUNT_TARGET,
     EVENT_MOUNT_TYPE,
     EVENT_MOUNT_DATA,
     EVENT_MOUNT_RET,
     EVENT_UMOUNT,
     EVENT_UMOUNT_TARGET,
     EVENT_UMOUNT_RET) = range(9)
class EnterData(ctypes.Structure):
    """ctypes mirror of the C `enter` struct sent on syscall entry.

    Field order and types follow the struct inside data_t in bpf_text and
    must not be reordered.
    """

    _fields_ = [
        # Mount namespace inode number (ns.inum of the task's mnt_ns).
        ('mnt_ns', ctypes.c_uint),
        # Task command name as filled in by bpf_get_current_comm().
        ('comm', ctypes.c_char * TASK_COMM_LEN),
        # Raw flags argument of the mount/umount call.
        ('flags', ctypes.c_ulong),
    ]
class DataUnion(ctypes.Union):
    """ctypes mirror of the anonymous union inside the C struct data_t.

    Which member is meaningful depends on the event type stored next to it.
    """

    _fields_ = [
        # EVENT_MOUNT, EVENT_UMOUNT
        ('enter', EnterData),
        # EVENT_MOUNT_SOURCE, EVENT_MOUNT_TARGET, EVENT_MOUNT_TYPE,
        # EVENT_MOUNT_DATA, EVENT_UMOUNT_TARGET
        ('str', ctypes.c_char * MAX_STR_LEN),
        # EVENT_MOUNT_RET, EVENT_UMOUNT_RET
        ('retval', ctypes.c_int),
    ]
class Event(ctypes.Structure):
    """ctypes mirror of the C struct data_t delivered via the perf buffer."""

    _fields_ = [
        # One of the EventType values.
        ('type', ctypes.c_uint),
        # Low 32 bits of bpf_get_current_pid_tgid() (printed in the TID column).
        ('pid', ctypes.c_uint),
        # High 32 bits of bpf_get_current_pid_tgid() (printed in the PID column).
        ('tgid', ctypes.c_uint),
        # Payload; interpret according to `type`.
        ('union', DataUnion),
    ]
def _decode_flags(flags, flag_list):
str_flags = []
for flag, bit in flag_list:
if flags & bit:
str_flags.append(flag)
flags &= ~bit
if flags or not str_flags:
str_flags.append('0x{:x}'.format(flags))
return str_flags
def decode_flags(flags, flag_list):
    """Return flags rendered as symbolic names joined with '|'."""
    names = _decode_flags(flags, flag_list)
    return '|'.join(names)
def decode_mount_flags(flags):
    """Return the mount() flags word as a '|'-joined string of flag names.

    If the top 16 bits hold the MS_MGC_VAL magic number, it is reported as
    'MS_MGC_VAL' and stripped before the remaining bits are decoded against
    MOUNT_FLAGS. Bits that match no known flag are shown as a hex literal.
    """
    str_flags = []
    if flags & MS_MGC_MSK == MS_MGC_VAL:
        flags &= ~MS_MGC_MSK
        str_flags.append('MS_MGC_VAL')
    # _decode_flags() returns ['0x0'] for an empty flags word. Only let it
    # run when there is something left to decode or nothing has been named
    # yet, so a bare magic value prints as 'MS_MGC_VAL', not 'MS_MGC_VAL|0x0'.
    if flags or not str_flags:
        str_flags.extend(_decode_flags(flags, MOUNT_FLAGS))
    return '|'.join(str_flags)
def decode_umount_flags(flags):
    """Return the umount() flags word as a '|'-joined string of flag names."""
    return '|'.join(_decode_flags(flags, UMOUNT_FLAGS))
def decode_errno(retval):
    """Render a syscall return value for display.

    A value whose negation is a known errno number is shown symbolically
    with a leading minus (e.g. -2 -> '-ENOENT'); anything else, including 0,
    is shown as its decimal string.
    """
    symbol = errno.errorcode.get(-retval)
    if symbol is None:
        return str(retval)
    return '-' + symbol
_escape_chars = {
ord('\a'): '\\a',
ord('\b'): '\\b',
ord('\t'): '\\t',
ord('\n'): '\\n',
ord('\v'): '\\v',
ord('\f'): '\\f',
ord('\r'): '\\r',
ord('"'): '\\"',
ord('\\'): '\\\\',
}
def escape_character(c):
try:
return _escape_chars[c]
except KeyError:
if 0x20 <= c <= 0x7e:
return chr(c)
else:
return '\\x{:02x}'.format(c)
# Iterating a byte string yields ints on Python 3 but one-character strings
# on Python 2, so the per-byte conversion differs between the two.
if sys.version_info.major >= 3:
    def decode_mount_string(s):
        """Return s as a double-quoted string with non-printable bytes escaped."""
        body = ''.join(map(escape_character, s))
        return '"{}"'.format(body)
else:
    def decode_mount_string(s):
        """Return s as a double-quoted string with non-printable bytes escaped."""
        body = ''.join(escape_character(ord(ch)) for ch in s)
        return '"{}"'.format(body)
def print_event(mounts, umounts, cpu, data, size):
    """Perf-buffer callback: reassemble one syscall from its events and print it.

    The BPF program reports a single mount()/umount() as several records (an
    enter record, one record per string argument, then a return record).
    The pieces are collected per thread in `mounts` / `umounts`, keyed by
    event.pid, and one output line is printed when the return record arrives.

    mounts, umounts -- dicts holding in-flight calls; mutated in place.
    cpu, size       -- supplied by the perf-buffer callback interface; unused.
    data            -- pointer to the raw record, read as an Event.
    """
    event = ctypes.cast(data, ctypes.POINTER(Event)).contents
    try:
        if event.type == EventType.EVENT_MOUNT:
            mounts[event.pid] = {
                'pid': event.pid,
                'tgid': event.tgid,
                'mnt_ns': event.union.enter.mnt_ns,
                'comm': event.union.enter.comm,
                'flags': event.union.enter.flags,
            }
        elif event.type == EventType.EVENT_MOUNT_SOURCE:
            mounts[event.pid]['source'] = event.union.str
        elif event.type == EventType.EVENT_MOUNT_TARGET:
            mounts[event.pid]['target'] = event.union.str
        elif event.type == EventType.EVENT_MOUNT_TYPE:
            mounts[event.pid]['type'] = event.union.str
        elif event.type == EventType.EVENT_MOUNT_DATA:
            # XXX: data is not always a NUL-terminated string
            mounts[event.pid]['data'] = event.union.str
        elif event.type == EventType.EVENT_UMOUNT:
            umounts[event.pid] = {
                'pid': event.pid,
                'tgid': event.tgid,
                'mnt_ns': event.union.enter.mnt_ns,
                'comm': event.union.enter.comm,
                'flags': event.union.enter.flags,
            }
        elif event.type == EventType.EVENT_UMOUNT_TARGET:
            umounts[event.pid]['target'] = event.union.str
        elif event.type in (EventType.EVENT_MOUNT_RET,
                            EventType.EVENT_UMOUNT_RET):
            if event.type == EventType.EVENT_MOUNT_RET:
                syscall = mounts.pop(event.pid)
                call = 'mount({source}, {target}, {type}, {flags}, {data}) = {retval}'.format(
                    source=decode_mount_string(syscall['source']),
                    target=decode_mount_string(syscall['target']),
                    type=decode_mount_string(syscall['type']),
                    flags=decode_mount_flags(syscall['flags']),
                    data=decode_mount_string(syscall['data']),
                    retval=decode_errno(event.union.retval))
            else:
                syscall = umounts.pop(event.pid)
                call = 'umount({target}, {flags}) = {retval}'.format(
                    target=decode_mount_string(syscall['target']),
                    flags=decode_umount_flags(syscall['flags']),
                    retval=decode_errno(event.union.retval))
            # comm holds arbitrary bytes. Decode leniently so that a task
            # name which is not valid UTF-8 cannot raise UnicodeDecodeError
            # out of this callback (only KeyError is handled below).
            print('{:16} {:<7} {:<7} {:<11} {}'.format(
                syscall['comm'].decode('utf-8', 'replace'),
                syscall['tgid'], syscall['pid'],
                syscall['mnt_ns'], call))
    except KeyError:
        # This might happen if we lost an event.
        pass
def main():
    """Load the BPF program and print mount()/umount() calls until interrupted."""
    parser = argparse.ArgumentParser(
        description='trace mount() and umount() syscalls'
    )
    # There are no options yet, but parsing still provides -h/--help and
    # rejects unexpected arguments.
    parser.parse_args()

    # In-flight syscalls, shared with the callback across invocations.
    mounts = {}
    umounts = {}

    b = bcc.BPF(text=bpf_text)
    b['events'].open_perf_buffer(
        functools.partial(print_event, mounts, umounts))
    print('{:16} {:<7} {:<7} {:<11} {}'.format(
        'COMM', 'PID', 'TID', 'MNT_NS', 'CALL'))
    try:
        while True:
            b.kprobe_poll()
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop tracing; exit without a traceback.
        pass


if __name__ == '__main__':
    main()
Demonstrations of mountsnoop.
mountsnoop traces the mount() and umount() syscalls system-wide. For example,
running the following series of commands produces this output:
# mount --bind /mnt /mnt
# umount /mnt
# unshare -m
# mount --bind /mnt /mnt
# umount /mnt
# ./mountsnoop.py
COMM PID TID MNT_NS CALL
mount 710 710 4026531840 mount("/mnt", "/mnt", "", MS_MGC_VAL|MS_BIND, "") = 0
umount 714 714 4026531840 umount("/mnt", 0x0) = 0
unshare 717 717 4026532160 mount("none", "/", "", MS_REC|MS_PRIVATE, "") = 0
mount 725 725 4026532160 mount("/mnt", "/mnt", "", MS_MGC_VAL|MS_BIND, "") = 0
umount 728 728 4026532160 umount("/mnt", 0x0) = 0
The output shows the calling command, its process ID and thread ID, the mount
namespace the call was made in, and the call itself.
The mount namespace number is an inode number that uniquely identifies the
namespace in the running system. This can also be obtained from readlink
/proc/$PID/ns/mnt.
Note that because of restrictions in BPF, the string arguments to either
syscall may be truncated.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment