blob: 576bb21162b941198fec7a33edf1b3852b9fde9d [file] [log] [blame]
/* Copyright (c) 2016-2017 Google Inc., All Rights Reserved.
* Barret Rhoden <brho@cs.berkeley.edu>
* Ron Minnich <rminnich@google.com>
* See LICENSE for details.
*
* TODO:
* - Do per-syscall output formatting, like interpreting arguments
*/
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <argp.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/param.h>
#include <parlib/parlib.h>
#include <parlib/bitmask.h>
/* Command-line configuration, filled in by parse_strace_opt(). */
struct strace_opts {
/* Stream trace output is printed to; opened by -o, falls back to stderr. */
FILE *outfile;
/* Raw argument of -e (comma-separated list); NULL when -e was not given. */
char *trace_set;
/* NULL-terminated argv of the program to launch, when one was given. */
char **cmd_argv;
/* Number of entries in cmd_argv, not counting the NULL terminator. */
int cmd_argc;
/* PID from -p; 0 when no PID was given. */
int pid;
/* -f: request "straceall" instead of "straceme" on the ctl file. */
bool follow_children;
/* -v: print extra info to stderr. */
bool verbose;
/* -r: print trace lines untouched. */
bool raw_output;
/* -t: keep the timestamps on each printed line. */
bool with_time;
/* -d: send "strace_drop on" to the ctl file. */
bool drop_overflow;
};
/* Global options instance; passed to argp_parse() as the parser input. */
static struct strace_opts opts;
/* Program description handed to argp. */
static const char *doc = "strace -- trace syscalls of a process";
/* Usage patterns handed to argp: attach with -p, or launch a program. */
static const char *args_doc = "-p PID\nPROGRAM [ARGS]\n";
/* Option table handed to argp in main(); each key is handled in
 * parse_strace_opt().  When adding a trace set, update the "Available sets"
 * help text below. */
static struct argp_option argp_opts[] = {
{"output", 'o', "FILE", 0, "Print output to file (default stderr)"},
{"pid", 'p', "PID", 0, "Process to attach to"},
{"follow", 'f', 0, 0, "Trace children"},
{"verbose", 'v', 0, 0, "Print extra info, e.g. syscalls we're tracing"},
/* NOTE(review): empty entry, presumably an argp group separator - confirm */
{0, 0, 0, 0, ""},
{"traceset", 'e', "TRACE_SET", 0,
"Comma-separated list of syscalls by name (e.g. openat) and sets to trace. Use '!' to negate (might need to escape the '!'), and traces are handled in order (e.g. -e path,\\!openat)\n"
},
/* Documentation-only entry listing the names accepted as trace sets. */
{" Available sets:", 0, 0, OPTION_DOC | OPTION_NO_USAGE,
"\n"
"- path: syscalls that take file paths\n"
"- fd: syscalls that take FDs\n"
"- file: path and fd sets\n"
"- mem: memory related (mmap, shared mem)\n"
"- life: process lifetime (create, fork)\n"
"- proc: anything process related (yields, pop_ctxs, life)\n"
"- sched: requests or yields to the kernel (all resources)\n"
"- vmm: syscalls mostly for VMs\n"
},
/* NOTE(review): empty entry, presumably an argp group separator - confirm */
{0, 0, 0, 0, ""},
{"drop", 'd', 0, 0, "Drop syscalls on overflow"},
{"raw", 'r', 0, 0, "Raw, untranslated output, with timestamps"},
{"time", 't', 0, 0, "Print timestamps"},
/* Hidden -h: parse_strace_opt() answers it with argp_usage(). */
{0, 'h', 0, OPTION_HIDDEN, 0},
/* Terminating entry. */
{ 0 }
};
/* A named group of syscalls that can be selected with -e. */
struct trace_set {
/* Name compared against each -e token. */
char *name;
/* Syscall numbers in the set; the list is terminated by a 0 entry. */
unsigned int syscs[];
};
/* To add a trace set, create one here, add it to all_trace_sets, and update the
* help field in argp_opts. */
/* If you change this, update 'file' below. */
static struct trace_set path_trace_set = { "path",
{SYS_proc_create,
SYS_exec,
SYS_openat,
SYS_stat,
SYS_lstat,
SYS_access,
SYS_link,
SYS_unlink,
SYS_symlink,
SYS_readlink,
SYS_chdir,
SYS_getcwd,
SYS_mkdir,
SYS_rmdir,
SYS_nbind,
SYS_nmount,
SYS_nunmount,
SYS_wstat,
SYS_rename,
0}
};
/* If you change this, update 'file' below.
*
* Technically tcgetattr/tcsetattr are FDs, but it's mostly noise. This also
* tracks openat, since that's the source for all FDs */
static struct trace_set fd_trace_set = { "fd",
{SYS_openat,
SYS_mmap,
SYS_read,
SYS_write,
SYS_openat,
SYS_close,
SYS_fstat,
SYS_fcntl,
SYS_llseek,
SYS_fchdir,
SYS_nmount,
SYS_fd2path,
SYS_fwstat,
SYS_dup_fds_to,
SYS_tap_fds,
SYS_abort_sysc_fd,
0}
};
/* This is the manually-created contents of 'path' and 'fd' */
static struct trace_set file_trace_set = { "file",
{/* From 'path' */
SYS_proc_create,
SYS_exec,
SYS_openat,
SYS_stat,
SYS_lstat,
SYS_access,
SYS_link,
SYS_unlink,
SYS_symlink,
SYS_readlink,
SYS_chdir,
SYS_getcwd,
SYS_mkdir,
SYS_rmdir,
SYS_nbind,
SYS_nmount,
SYS_nunmount,
SYS_wstat,
SYS_rename,
SYS_openat,
SYS_mmap,
SYS_read,
SYS_write,
/* From 'fd' */
SYS_openat,
SYS_close,
SYS_fstat,
SYS_fcntl,
SYS_llseek,
SYS_fchdir,
SYS_nmount,
SYS_fd2path,
SYS_fwstat,
SYS_dup_fds_to,
SYS_tap_fds,
SYS_abort_sysc_fd,
0}
};
static struct trace_set mem_trace_set = { "mem",
{SYS_mmap,
SYS_mprotect,
SYS_munmap,
SYS_shared_page_alloc,
SYS_shared_page_free,
SYS_populate_va,
0}
};
/* These are all in 'proc'; keep them in sync. */
static struct trace_set life_trace_set = { "life",
{SYS_proc_create,
SYS_proc_run,
SYS_proc_destroy,
SYS_fork,
SYS_exec,
SYS_waitpid,
SYS_change_to_m,
0}
};
/* There's some misc stuff lumped in here. block/nanosleep are signs of threads
* sleeping on something, which is usually useful. aborting syscs are also
* often signs of control flow. */
static struct trace_set proc_trace_set = { "proc",
{SYS_block,
SYS_nanosleep,
SYS_getpcoreid,
SYS_getvcoreid,
SYS_proc_yield,
SYS_change_vcore,
SYS_notify,
SYS_self_notify,
SYS_send_event,
SYS_vc_entry,
SYS_halt_core,
SYS_pop_ctx,
SYS_abort_sysc,
SYS_abort_sysc_fd,
/* From 'life' */
SYS_proc_create,
SYS_proc_run,
SYS_proc_destroy,
SYS_fork,
SYS_exec,
SYS_waitpid,
SYS_change_to_m,
0}
};
static struct trace_set sched_trace_set = { "sched",
{SYS_provision,
SYS_proc_yield,
SYS_poke_ksched,
0}
};
static struct trace_set vmm_trace_set = { "vmm",
{SYS_vmm_add_gpcs,
SYS_vmm_poke_guest,
SYS_vmm_ctl,
SYS_pop_ctx,
0}
};
/* Every named trace set; handle_trace_set() searches this array by name. */
static struct trace_set *all_trace_sets[] = {
&path_trace_set,
&fd_trace_set,
&file_trace_set,
&mem_trace_set,
&life_trace_set,
&proc_trace_set,
&sched_trace_set,
&vmm_trace_set,
};
/* Bitmask indexed by syscall number, built from the -e argument and written
 * to /proc/PID/strace_traceset by main(). */
static DECL_BITMASK(traceset_bm, MAX_SYSCALL_NR);
/* argp parser callback; handles one option key or argp pseudo-key per call.
 *
 * @key:   option character from argp_opts, or an ARGP_KEY_* value
 * @arg:   the option's argument, or the positional argument for ARGP_KEY_ARG
 * @state: argp parser state; state->input is the struct strace_opts to fill
 *
 * Returns 0 when the key was handled and ARGP_ERR_UNKNOWN otherwise.  Exits
 * the program if the -o file cannot be opened. */
static error_t parse_strace_opt(int key, char *arg, struct argp_state *state)
{
	struct strace_opts *s_opts = state->input;
	char *end;
	long pid_val;

	switch (key) {
	case 'o':
		s_opts->outfile = fopen(arg, "wb");
		if (!s_opts->outfile) {
			fprintf(stderr,
				"Unable to open file '%s' for writing: %s\n",
				arg, strerror(errno));
			exit(1);
		}
		break;
	case 'e':
		s_opts->trace_set = arg;
		break;
	case 'p':
		/* strtol rather than atoi: trailing garbage, out-of-range
		 * and negative values are rejected instead of being silently
		 * misparsed into some other PID. */
		errno = 0;
		pid_val = strtol(arg, &end, 10);
		if (errno || end == arg || *end != '\0' || pid_val < 0 ||
		    pid_val != (int)pid_val) {
			argp_error(state, "Invalid PID '%s'", arg);
			break;
		}
		s_opts->pid = (int)pid_val;
		if (!s_opts->pid)
			argp_error(state, "Cannot trace pid 0 (won't exist)");
		break;
	case 'f':
		s_opts->follow_children = TRUE;
		break;
	case 'v':
		s_opts->verbose = TRUE;
		break;
	case 'r':
		s_opts->raw_output = TRUE;
		break;
	case 't':
		s_opts->with_time = TRUE;
		break;
	case 'd':
		s_opts->drop_overflow = TRUE;
		break;
	case ARGP_KEY_ARG:
		if (s_opts->pid)
			argp_error(state,
				   "PID already set, can't launch a process too");
		/* The first positional argument is the program; it and
		 * everything after it become the child's argv. */
		s_opts->cmd_argc = state->argc - state->next + 1;
		s_opts->cmd_argv = malloc(sizeof(char*) *
					  (s_opts->cmd_argc + 1));
		assert(s_opts->cmd_argv);
		s_opts->cmd_argv[0] = arg;
		memcpy(&s_opts->cmd_argv[1], &state->argv[state->next],
		       sizeof(char*) * (s_opts->cmd_argc - 1));
		s_opts->cmd_argv[s_opts->cmd_argc] = NULL;
		/* Consume the remaining arguments so argp does not try to
		 * parse the child's options as ours. */
		state->next = state->argc;
		break;
	case ARGP_KEY_END:
		if (!(s_opts->cmd_argc || s_opts->pid))
			argp_error(state, "Need either -p or a command to run");
		/* Note we never fclose outfile.  It'll flush when we exit.
		 * o/w, we'll need to be careful whether we're closing stderr
		 * or not. */
		if (!s_opts->outfile)
			s_opts->outfile = stderr;
		break;
	case 'h':
		argp_usage(state);
		break;
	default:
		return ARGP_ERR_UNKNOWN;
	}
	return 0;
}
/* Applies the named trace set, if tok names one, to traceset_bm.
 *
 * @tok:   candidate set name
 * @clear: when set, the set's syscalls are cleared from the bitmask instead
 *         of being added to it
 *
 * Returns TRUE if tok matched a set in all_trace_sets, FALSE otherwise. */
static bool handle_trace_set(char *tok, bool clear)
{
	struct trace_set *match = NULL;

	for (int i = 0; i < COUNT_OF(all_trace_sets); i++) {
		if (strcmp(all_trace_sets[i]->name, tok) == 0) {
			match = all_trace_sets[i];
			break;
		}
	}
	if (!match)
		return FALSE;
	/* The list is 0-terminated; MAX_SYSCALL_NR only bounds the walk. */
	for (int j = 0; j < MAX_SYSCALL_NR && match->syscs[j]; j++) {
		unsigned int nr = match->syscs[j];

		if (clear) {
			CLR_BITMASK_BIT(traceset_bm, nr);
		} else {
			SET_BITMASK_BIT(traceset_bm, nr);
		}
	}
	return TRUE;
}
/* Maps user-friendly syscall aliases to the name to look up: "open" becomes
 * "openat".  Any other token is returned unchanged (same pointer). */
static char *resolve_syscall_alias(char *tok)
{
	return strcmp(tok, "open") == 0 ? "openat" : tok;
}
/* Applies a single syscall, looked up by name in __syscall_tbl, to
 * traceset_bm.
 *
 * @tok:   syscall name; aliases are resolved by resolve_syscall_alias()
 * @clear: when set, the syscall's bit is cleared instead of set
 *
 * Returns TRUE if a syscall with that name was found, FALSE otherwise. */
static bool handle_raw_syscall(char *tok, bool clear)
{
	/* The alias does not depend on the table entry; resolve it once. */
	const char *name = resolve_syscall_alias(tok);

	for (int i = 0; i < __syscall_tbl_sz; i++) {
		/* The table has unnamed (NULL) slots; skip them. */
		if (!__syscall_tbl[i] || strcmp(__syscall_tbl[i], name))
			continue;
		if (clear) {
			CLR_BITMASK_BIT(traceset_bm, i);
		} else {
			SET_BITMASK_BIT(traceset_bm, i);
		}
		return TRUE;
	}
	return FALSE;
}
/* Builds traceset_bm from the -e argument.
 *
 * @trace_set: comma-separated list of set names and syscall names, modified
 *             in place by strtok_r(); NULL means no -e was given and the
 *             bitmask is left untouched
 *
 * Tokens are applied in order.  A leading '!' negates only the token it is
 * attached to.  An unknown token prints an error and exits. */
static void build_ignore_list(char *trace_set)
{
	char *tok, *tok_save = 0;

	if (!trace_set) {
		if (opts.verbose)
			fprintf(stderr, "# Tracing all syscalls\n");
		return;
	}
	for (tok = strtok_r(trace_set, ",", &tok_save);
	     tok;
	     tok = strtok_r(NULL, ",", &tok_save)) {
		/* Recomputed for every token so that one negated entry does
		 * not turn all the following entries into negations too. */
		bool clear = (tok[0] == '!');

		if (clear)
			tok++;
		if (handle_trace_set(tok, clear))
			continue;
		if (handle_raw_syscall(tok, clear))
			continue;
		/* You could imagine continuing, but this error would probably
		 * be missed in the output stream and we'd be wondering why we
		 * weren't getting a syscall, due to a typo. */
		fprintf(stderr, "Unknown trace_set argument %s, aborting!\n",
			tok);
		exit(-1);
	}
	if (opts.verbose) {
		for (int i = 0; i < MAX_SYSCALL_NR; i++) {
			if (GET_BITMASK_BIT(traceset_bm, i))
				fprintf(stderr, "# Tracing syscall %s (%d)\n",
					__syscall_tbl[i], i);
		}
	}
}
/* Strips the timestamp prefix from a trace line.
 *
 * Expected format: E [ 13655.986589401]-[ 0.000000000] Syscall.
 * The seconds field varies in width, so the end of the prefix is located by
 * finding the second ']'.  That ']' is overwritten with the line's first
 * character (the E or X marker) and a pointer to it is returned; use the
 * returned string in place of the full line.  If the line does not contain
 * two ']' characters it is returned unmodified. */
static char *remove_timestamps(char *full_line)
{
	char *first = strchr(full_line, ']');
	char *second = first ? strchr(first + 1, ']') : NULL;

	if (!second)
		return full_line;
	/* Carry the leading E/X marker over to the new start of the line. */
	*second = *full_line;
	return second;
}
/* Reads trace records from fd until read() returns 0 or fails, printing each
 * one to opts.outfile.  Records are printed untouched for raw output or when
 * timestamps were requested; otherwise the timestamp prefix is stripped. */
static void parse_traces(int fd)
{
	char *buf = malloc(SYSTR_BUF_SZ);
	ssize_t nread;

	assert(buf);
	for (;;) {
		char *out = buf;

		nread = read(fd, buf, SYSTR_BUF_SZ);
		if (nread <= 0)
			break;
		/* make sure each line ends in \n\0. */
		buf[nread - 1] = '\n';
		buf[MIN(nread, SYSTR_BUF_SZ - 1)] = 0;
		if (!opts.raw_output && !opts.with_time)
			out = remove_timestamps(buf);
		fprintf(opts.outfile, "%s", out);
	}
	/* This is a little hokey.  If the process exited, then the qio hung
	 * up and we got a status message from the kernel.  This was the
	 * errstr of the last failed read.  However, if we're doing a -p and
	 * someone kills *us*, we'll never see this.  And catching the signal
	 * doesn't help either.  The process needs to exit (strace_shutdown).
	 * Either that, or change the kernel to set_errstr() on close(), and
	 * coordinate with a sighandler. */
	if (opts.verbose)
		fprintf(stderr, "%r\n");
	free(buf);
}
/* Writes one command string to the process's ctl file.  Exits on a failed or
 * short write.  The result is checked as a signed value first: comparing the
 * ssize_t return directly against an unsigned length would convert -1 to a
 * huge unsigned number and hide the failure. */
static void write_ctl_cmd(int ctl_fd, const char *cmd)
{
	size_t len = strlen(cmd);
	ssize_t ret = write(ctl_fd, cmd, len);

	if (ret < 0 || (size_t)ret < len) {
		fprintf(stderr, "write to ctl %s: %r\n", cmd);
		exit(1);
	}
}

/* Entry point: parses options, optionally creates the child, enables tracing
 * through /proc/PID/ctl, uploads the trace set if -e was given, then starts
 * the child (if any) and prints the contents of /proc/PID/strace.
 *
 * Returns 0 once the trace stream ends; exits non-zero on any setup error. */
int main(int argc, char **argv, char **envp)
{
	int fd;
	pid_t pid;
	ssize_t wret;
	size_t bm_len;
	static char path[2 * MAX_PATH_LEN];
	struct syscall sysc;
	struct argp argp = {argp_opts, parse_strace_opt, args_doc, doc};

	argp_parse(&argp, argc, argv, ARGP_IN_ORDER, 0, &opts);
	build_ignore_list(opts.trace_set);
	if (opts.cmd_argc) {
		pid = create_child_with_stdfds(opts.cmd_argv[0], opts.cmd_argc,
					       opts.cmd_argv, envp);
		if (pid < 0) {
			perror("Unable to spawn child");
			exit(-1);
		}
		/* We need to wait on the child asynchronously.  If we hold a
		 * ref (as the parent), the child won't proc_free and that
		 * won't hangup/wake us from a read. */
		syscall_async(&sysc, SYS_waitpid, pid, NULL, 0, 0, 0, 0);
	} else {
		pid = opts.pid;
	}

	snprintf(path, sizeof(path), "/proc/%d/ctl", pid);
	fd = open(path, O_WRITE);
	if (fd < 0) {
		fprintf(stderr, "open %s: %r\n", path);
		exit(1);
	}
	write_ctl_cmd(fd, opts.follow_children ? "straceall" : "straceme");
	if (opts.drop_overflow)
		write_ctl_cmd(fd, "strace_drop on");
	close(fd);

	if (opts.trace_set) {
		snprintf(path, sizeof(path), "/proc/%d/strace_traceset", pid);
		fd = open(path, O_RDWR);
		if (fd < 0) {
			fprintf(stderr, "open %s: %r\n", path);
			exit(1);
		}
		bm_len = COUNT_OF(traceset_bm);
		wret = write(fd, traceset_bm, bm_len);
		/* Signed check first, for the same reason as in
		 * write_ctl_cmd(). */
		if (wret < 0 || (size_t)wret < bm_len) {
			fprintf(stderr, "write to %s: %r\n", path);
			exit(1);
		}
		close(fd);
	}

	snprintf(path, sizeof(path), "/proc/%d/strace", pid);
	fd = open(path, O_READ);
	/* open() reports failure with a negative value; 0 is a valid FD. */
	if (fd < 0) {
		fprintf(stderr, "open %s: %r\n", path);
		exit(1);
	}
	if (opts.cmd_argc) {
		/* now that we've set up the tracing, we can run the process.
		 * isn't it great that the process doesn't immediately start
		 * when you make it? */
		sys_proc_run(pid);
	}
	parse_traces(fd);
	return 0;
}