| /* Copyright (c) 2016-2017 Google Inc., All Rights Reserved. |
| * Barret Rhoden <brho@cs.berkeley.edu> |
| * Ron Minnich <rminnich@google.com> |
| * See LICENSE for details. |
| * |
| * TODO: |
| * - Do per-syscall output formatting, like interpreting arguments |
| */ |
| |
| #include <stdlib.h> |
| #include <stdio.h> |
| #include <unistd.h> |
| #include <fcntl.h> |
| #include <argp.h> |
| #include <sys/types.h> |
| #include <sys/wait.h> |
| #include <sys/param.h> |
| #include <parlib/parlib.h> |
| #include <parlib/bitmask.h> |
| |
/* Command-line configuration.  Filled in by parse_strace_opt() and read by the
 * rest of the program through the file-scope 'opts' instance. */
struct strace_opts {
	/* -o: stream the trace is printed to; set to stderr at the end of
	 * parsing if no file was given. */
	FILE				*outfile;
	/* -e: the raw, comma-separated trace set string (NULL: trace all) */
	char				*trace_set;
	/* NULL-terminated argv of the program to launch, if any */
	char				**cmd_argv;
	/* Number of entries in cmd_argv; 0 when no command was given */
	int				cmd_argc;
	/* -p: pid of the process to attach to; 0 when not given */
	int				pid;
	/* -f: trace children too */
	bool				follow_children;
	/* -v: print extra info to stderr */
	bool				verbose;
	/* -r: print trace lines untouched */
	bool				raw_output;
	/* -t: keep the timestamps in each trace line */
	bool				with_time;
	/* -d: ask the kernel to drop syscalls on overflow */
	bool				drop_overflow;
};

/* The one instance; static storage, so every field starts out zero. */
static struct strace_opts opts;
| |
/* One-line program description handed to argp. */
static const char *doc = "strace -- trace syscalls of a process";
/* The two accepted invocation forms, one per line. */
static const char *args_doc = "-p PID\nPROGRAM [ARGS]\n";

/* Option table for argp_parse().  Entries that have neither a name nor a key
 * but carry an (empty) doc string act as group separators in the help output;
 * the OPTION_DOC entry is documentation only and lists the named trace sets
 * accepted by -e.  The table ends with an all-zero entry. */
static struct argp_option argp_opts[] = {
	{
		.name = "output", .key = 'o', .arg = "FILE",
		.doc = "Print output to file (default stderr)",
	},
	{
		.name = "pid", .key = 'p', .arg = "PID",
		.doc = "Process to attach to",
	},
	{
		.name = "follow", .key = 'f',
		.doc = "Trace children",
	},
	{
		.name = "verbose", .key = 'v',
		.doc = "Print extra info, e.g. syscalls we're tracing",
	},
	{ .doc = "" },
	{
		.name = "traceset", .key = 'e', .arg = "TRACE_SET",
		.doc =
		"Comma-separated list of syscalls by name (e.g. openat) and sets "
		"to trace. Use '!' to negate (might need to escape the '!'), and "
		"traces are handled in order (e.g. -e path,\\!openat)\n",
	},
	{
		.name = " Available sets:",
		.flags = OPTION_DOC | OPTION_NO_USAGE,
		.doc =
		"\n"
		"- path: syscalls that take file paths\n"
		"- fd: syscalls that take FDs\n"
		"- file: path and fd sets\n"
		"- mem: memory related (mmap, shared mem)\n"
		"- life: process lifetime (create, fork)\n"
		"- proc: anything process related (yields, pop_ctxs, life)\n"
		"- sched: requests or yields to the kernel (all resources)\n"
		"- vmm: syscalls mostly for VMs\n",
	},
	{ .doc = "" },
	{
		.name = "drop", .key = 'd',
		.doc = "Drop syscalls on overflow",
	},
	{
		.name = "raw", .key = 'r',
		.doc = "Raw, untranslated output, with timestamps",
	},
	{
		.name = "time", .key = 't',
		.doc = "Print timestamps",
	},
	/* Hidden short option that prints the usage message */
	{ .key = 'h', .flags = OPTION_HIDDEN },
	{ 0 }
};
| |
/* A named group of syscalls that can be selected with -e.  'syscs' is a list
 * of syscall numbers terminated by a 0 entry; handle_trace_set() stops at the
 * first 0 it finds. */
struct trace_set {
	char				*name;
	unsigned int			syscs[];
};
| |
| /* To add a trace set, create one here, add it to all_trace_sets, and update the |
| * help field in argp_opts. */ |
| |
| /* If you change this, update 'file' below. */ |
| static struct trace_set path_trace_set = { "path", |
| {SYS_proc_create, |
| SYS_exec, |
| SYS_openat, |
| SYS_stat, |
| SYS_lstat, |
| SYS_access, |
| SYS_link, |
| SYS_unlink, |
| SYS_symlink, |
| SYS_readlink, |
| SYS_chdir, |
| SYS_getcwd, |
| SYS_mkdir, |
| SYS_rmdir, |
| SYS_nbind, |
| SYS_nmount, |
| SYS_nunmount, |
| SYS_wstat, |
| SYS_rename, |
| 0} |
| }; |
| |
| /* If you change this, update 'file' below. |
| * |
| * Technically tcgetattr/tcsetattr are FDs, but it's mostly noise. This also |
| * tracks openat, since that's the source for all FDs */ |
| static struct trace_set fd_trace_set = { "fd", |
| {SYS_openat, |
| SYS_mmap, |
| SYS_read, |
| SYS_write, |
| SYS_openat, |
| SYS_close, |
| SYS_fstat, |
| SYS_fcntl, |
| SYS_llseek, |
| SYS_fchdir, |
| SYS_nmount, |
| SYS_fd2path, |
| SYS_fwstat, |
| SYS_dup_fds_to, |
| SYS_tap_fds, |
| SYS_abort_sysc_fd, |
| 0} |
| }; |
| |
| /* This is the manually-created contents of 'path' and 'fd' */ |
| static struct trace_set file_trace_set = { "file", |
| {/* From 'path' */ |
| SYS_proc_create, |
| SYS_exec, |
| SYS_openat, |
| SYS_stat, |
| SYS_lstat, |
| SYS_access, |
| SYS_link, |
| SYS_unlink, |
| SYS_symlink, |
| SYS_readlink, |
| SYS_chdir, |
| SYS_getcwd, |
| SYS_mkdir, |
| SYS_rmdir, |
| SYS_nbind, |
| SYS_nmount, |
| SYS_nunmount, |
| SYS_wstat, |
| SYS_rename, |
| SYS_openat, |
| SYS_mmap, |
| SYS_read, |
| SYS_write, |
| |
| /* From 'fd' */ |
| SYS_openat, |
| SYS_close, |
| SYS_fstat, |
| SYS_fcntl, |
| SYS_llseek, |
| SYS_fchdir, |
| SYS_nmount, |
| SYS_fd2path, |
| SYS_fwstat, |
| SYS_dup_fds_to, |
| SYS_tap_fds, |
| SYS_abort_sysc_fd, |
| 0} |
| }; |
| |
| static struct trace_set mem_trace_set = { "mem", |
| {SYS_mmap, |
| SYS_mprotect, |
| SYS_munmap, |
| SYS_shared_page_alloc, |
| SYS_shared_page_free, |
| SYS_populate_va, |
| 0} |
| }; |
| |
| /* These are all in 'proc'; keep them in sync. */ |
| static struct trace_set life_trace_set = { "life", |
| {SYS_proc_create, |
| SYS_proc_run, |
| SYS_proc_destroy, |
| SYS_fork, |
| SYS_exec, |
| SYS_waitpid, |
| SYS_change_to_m, |
| 0} |
| }; |
| |
| /* There's some misc stuff lumped in here. block/nanosleep are signs of threads |
| * sleeping on something, which is usually useful. aborting syscs are also |
| * often signs of control flow. */ |
| static struct trace_set proc_trace_set = { "proc", |
| {SYS_block, |
| SYS_nanosleep, |
| SYS_getpcoreid, |
| SYS_getvcoreid, |
| SYS_proc_yield, |
| SYS_change_vcore, |
| SYS_notify, |
| SYS_self_notify, |
| SYS_send_event, |
| SYS_vc_entry, |
| SYS_halt_core, |
| SYS_pop_ctx, |
| SYS_abort_sysc, |
| SYS_abort_sysc_fd, |
| |
| /* From 'life' */ |
| SYS_proc_create, |
| SYS_proc_run, |
| SYS_proc_destroy, |
| SYS_fork, |
| SYS_exec, |
| SYS_waitpid, |
| SYS_change_to_m, |
| 0} |
| }; |
| |
| static struct trace_set sched_trace_set = { "sched", |
| {SYS_provision, |
| SYS_proc_yield, |
| SYS_poke_ksched, |
| 0} |
| }; |
| |
| static struct trace_set vmm_trace_set = { "vmm", |
| {SYS_vmm_add_gpcs, |
| SYS_vmm_poke_guest, |
| SYS_vmm_ctl, |
| SYS_pop_ctx, |
| 0} |
| }; |
| |
| static struct trace_set *all_trace_sets[] = { |
| &path_trace_set, |
| &fd_trace_set, |
| &file_trace_set, |
| &mem_trace_set, |
| &life_trace_set, |
| &proc_trace_set, |
| &sched_trace_set, |
| &vmm_trace_set, |
| }; |
| |
| static DECL_BITMASK(traceset_bm, MAX_SYSCALL_NR); |
| |
| |
| static error_t parse_strace_opt(int key, char *arg, struct argp_state *state) |
| { |
| struct strace_opts *s_opts = state->input; |
| |
| switch (key) { |
| case 'o': |
| s_opts->outfile = fopen(arg, "wb"); |
| if (!s_opts->outfile) { |
| fprintf(stderr, |
| "Unable to open file '%s' for writing: %s\n", |
| arg, strerror(errno)); |
| exit(1); |
| } |
| break; |
| case 'e': |
| s_opts->trace_set = arg; |
| break; |
| case 'p': |
| s_opts->pid = atoi(arg); |
| if (!s_opts->pid) |
| argp_error(state, "Cannot trace pid 0 (won't exist)"); |
| break; |
| case 'f': |
| s_opts->follow_children = TRUE; |
| break; |
| case 'v': |
| s_opts->verbose = TRUE; |
| break; |
| case 'r': |
| s_opts->raw_output = TRUE; |
| break; |
| case 't': |
| s_opts->with_time = TRUE; |
| break; |
| case 'd': |
| s_opts->drop_overflow = TRUE; |
| break; |
| case ARGP_KEY_ARG: |
| if (s_opts->pid) |
| argp_error(state, "PID already set, can't launch a process too"); |
| s_opts->cmd_argc = state->argc - state->next + 1; |
| s_opts->cmd_argv = malloc(sizeof(char*) * (s_opts->cmd_argc |
| + 1)); |
| assert(s_opts->cmd_argv); |
| s_opts->cmd_argv[0] = arg; |
| memcpy(&s_opts->cmd_argv[1], &state->argv[state->next], |
| sizeof(char*) * (s_opts->cmd_argc - 1)); |
| s_opts->cmd_argv[s_opts->cmd_argc] = NULL; |
| state->next = state->argc; |
| break; |
| case ARGP_KEY_END: |
| if (!(s_opts->cmd_argc || s_opts->pid)) |
| argp_error(state, "Need either -p or a command to run"); |
| /* Note we never fclose outfile. It'll flush when we exit. |
| * o/w, we'll need to be careful whether we're closing stderr or |
| * not. */ |
| if (!s_opts->outfile) |
| s_opts->outfile = stderr; |
| break; |
| case 'h': |
| argp_usage(state); |
| break; |
| default: |
| return ARGP_ERR_UNKNOWN; |
| } |
| return 0; |
| } |
| |
| static bool handle_trace_set(char *tok, bool clear) |
| { |
| struct trace_set *ts; |
| unsigned int sysc_nr; |
| |
| for (int i = 0; i < COUNT_OF(all_trace_sets); i++) { |
| ts = all_trace_sets[i]; |
| if (!strcmp(ts->name, tok)) { |
| for (int j = 0; j < MAX_SYSCALL_NR; j++) { |
| sysc_nr = ts->syscs[j]; |
| /* 0-terminated list */ |
| if (!sysc_nr) |
| break; |
| if (clear) |
| CLR_BITMASK_BIT(traceset_bm, sysc_nr); |
| else |
| SET_BITMASK_BIT(traceset_bm, sysc_nr); |
| } |
| return TRUE; |
| } |
| } |
| return FALSE; |
| } |
| |
/* Maps a user-supplied syscall name onto the name used in the syscall table.
 * The only alias is "open", which resolves to "openat"; any other name is
 * returned unchanged (the same pointer that was passed in). */
static char *resolve_syscall_alias(char *tok)
{
	return strcmp(tok, "open") == 0 ? "openat" : tok;
}
| |
| static bool handle_raw_syscall(char *tok, bool clear) |
| { |
| for (int i = 0; i < __syscall_tbl_sz; i++) { |
| if (!__syscall_tbl[i]) |
| continue; |
| tok = resolve_syscall_alias(tok); |
| if (!strcmp(__syscall_tbl[i], tok)) { |
| if (clear) |
| CLR_BITMASK_BIT(traceset_bm, i); |
| else |
| SET_BITMASK_BIT(traceset_bm, i); |
| return TRUE; |
| } |
| } |
| return FALSE; |
| } |
| |
| static void build_ignore_list(char *trace_set) |
| { |
| char *tok, *tok_save = 0; |
| bool clear = FALSE; |
| |
| if (!trace_set) { |
| if (opts.verbose) |
| fprintf(stderr, "# Tracing all syscalls\n"); |
| return; |
| } |
| for (tok = strtok_r(trace_set, ",", &tok_save); |
| tok; |
| tok = strtok_r(NULL, ",", &tok_save)) { |
| |
| if (tok[0] == '!') { |
| clear = TRUE; |
| tok++; |
| } |
| if (handle_trace_set(tok, clear)) |
| continue; |
| if (handle_raw_syscall(tok, clear)) |
| continue; |
| /* You could imaging continuing, but this error would probably |
| * be missed in the output stream and we'd be wondering why we |
| * weren't getting a syscall, due to a typo. */ |
| fprintf(stderr, "Unknown trace_set argument %s, aborting!\n", |
| tok); |
| exit(-1); |
| } |
| if (opts.verbose) { |
| for (int i = 0; i < MAX_SYSCALL_NR; i++) { |
| if (GET_BITMASK_BIT(traceset_bm, i)) |
| fprintf(stderr, "# Tracing syscall %s (%d)\n", |
| __syscall_tbl[i], i); |
| } |
| } |
| } |
| |
/* Strips the timestamp portion from a trace line.
 *
 * Expected format: E [  13655.986589401]-[      0.000000000] Syscall.
 * The width of the seconds field varies, so the end of the timestamps is found
 * by locating the second ']'.  That ']' is overwritten with the line's first
 * character (the E or X marker), and a pointer to it is returned; use the
 * returned string in place of the full line.  A line with fewer than two ']'
 * is returned unmodified. */
static char *remove_timestamps(char *full_line)
{
	char *cursor = full_line;

	for (int found = 0; found < 2; found++) {
		/* The second search starts just past the first ']' */
		char *brace = strchr(found ? cursor + 1 : cursor, ']');

		if (!brace)
			return full_line;
		cursor = brace;
	}
	/* Move the starting E or X marker up to the new start of line */
	*cursor = full_line[0];
	return cursor;
}
| |
| static void parse_traces(int fd) |
| { |
| char *line, *_line; |
| ssize_t ret; |
| |
| line = malloc(SYSTR_BUF_SZ); |
| assert(line); |
| |
| while ((ret = read(fd, line, SYSTR_BUF_SZ)) > 0) { |
| /* make sure each line ends in \n\0. */ |
| line[ret - 1] = '\n'; |
| line[MIN(ret, SYSTR_BUF_SZ - 1)] = 0; |
| _line = line; |
| if (opts.raw_output) { |
| fprintf(opts.outfile, "%s", _line); |
| continue; |
| } |
| if (!opts.with_time) |
| _line = remove_timestamps(_line); |
| fprintf(opts.outfile, "%s", _line); |
| } |
| /* This is a little hokey. If the process exited, then the qio hung up |
| * and we got a status message from the kernel. This was the errstr of |
| * the last failed read. However, if we're doing a -p and someone kills |
| * *us*, we'll never see this. And catching the signal doesn't help |
| * either. The process needs to exit (strace_shutdown). Either that, |
| * or change the kernel to set_errstr() on close(), and coordinate with |
| * a sighandler. */ |
| if (opts.verbose) |
| fprintf(stderr, "%r\n"); |
| free(line); |
| } |
| |
| int main(int argc, char **argv, char **envp) |
| { |
| int fd; |
| pid_t pid; |
| static char path[2 * MAX_PATH_LEN]; |
| struct syscall sysc; |
| struct argp argp = {argp_opts, parse_strace_opt, args_doc, doc}; |
| |
| argp_parse(&argp, argc, argv, ARGP_IN_ORDER, 0, &opts); |
| |
| build_ignore_list(opts.trace_set); |
| |
| if (opts.cmd_argc) { |
| pid = create_child_with_stdfds(opts.cmd_argv[0], opts.cmd_argc, |
| opts.cmd_argv, envp); |
| if (pid < 0) { |
| perror("Unable to spawn child"); |
| exit(-1); |
| } |
| /* We need to wait on the child asynchronously. If we hold a |
| * ref (as the parent), the child won't proc_free and that won't |
| * hangup/wake us from a read. */ |
| syscall_async(&sysc, SYS_waitpid, pid, NULL, 0, 0, 0, 0); |
| } else { |
| pid = opts.pid; |
| } |
| |
| snprintf(path, sizeof(path), "/proc/%d/ctl", pid); |
| fd = open(path, O_WRITE); |
| if (fd < 0) { |
| fprintf(stderr, "open %s: %r\n", path); |
| exit(1); |
| } |
| if (opts.follow_children) |
| snprintf(path, sizeof(path), "straceall"); |
| else |
| snprintf(path, sizeof(path), "straceme"); |
| if (write(fd, path, strlen(path)) < strlen(path)) { |
| fprintf(stderr, "write to ctl %s: %r\n", path); |
| exit(1); |
| } |
| if (opts.drop_overflow) { |
| snprintf(path, sizeof(path), "strace_drop on"); |
| if (write(fd, path, strlen(path)) < strlen(path)) { |
| fprintf(stderr, "write to ctl %s: %r\n", path); |
| exit(1); |
| } |
| } |
| close(fd); |
| |
| if (opts.trace_set) { |
| snprintf(path, sizeof(path), "/proc/%d/strace_traceset", pid); |
| fd = open(path, O_RDWR); |
| if (fd < 0) { |
| fprintf(stderr, "open %s: %r\n", path); |
| exit(1); |
| } |
| if (write(fd, traceset_bm, COUNT_OF(traceset_bm)) |
| < COUNT_OF(traceset_bm)) { |
| fprintf(stderr, "write to strace_ignore: %r\n"); |
| exit(1); |
| } |
| close(fd); |
| } |
| |
| snprintf(path, sizeof(path), "/proc/%d/strace", pid); |
| fd = open(path, O_READ); |
| if (!fd) { |
| fprintf(stderr, "open %s: %r\n", path); |
| exit(1); |
| } |
| |
| if (opts.cmd_argc) { |
| /* now that we've set up the tracing, we can run the process. |
| * isn't it great that the process doesn't immediately start |
| * when you make it? */ |
| sys_proc_run(pid); |
| } |
| |
| parse_traces(fd); |
| return 0; |
| } |