/* Copyright (c) 2016-2017 Google Inc., All Rights Reserved.
 * Barret Rhoden <brho@cs.berkeley.edu>
 * Ron Minnich <rminnich@google.com>
 * See LICENSE for details.
 *
 * TODO:
 * - Do per-syscall output formatting, like interpreting arguments
 */

#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <argp.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/param.h>
#include <parlib/parlib.h>
#include <parlib/bitmask.h>

/* Command-line configuration.  Filled in by parse_strace_opt() and read by the
 * rest of the program through the single global 'opts' instance below. */
struct strace_opts {
	/* Where trace output is printed: the -o file, or stderr if -o is absent */
	FILE *outfile;
	/* Raw argument of -e; tokenized later when the trace bitmask is built */
	char *trace_set;
	/* NULL-terminated argv of the program to launch (unset when using -p) */
	char **cmd_argv;
	/* Number of entries in cmd_argv, not counting the NULL terminator */
	int cmd_argc;
	/* Target of -p; 0 means no PID was given */
	int pid;
	/* -f: trace children too */
	bool follow_children;
	/* -v: print extra info on stderr */
	bool verbose;
	/* -r: print records untranslated */
	bool raw_output;
	/* -t: keep the timestamp prefix of each record */
	bool with_time;
	/* -d: ask the kernel to drop syscalls on overflow */
	bool drop_overflow;
};
static struct strace_opts opts;

/* Program description and usage patterns handed to argp via the struct argp
 * built in main().  args_doc lists the two invocation forms, one per line:
 * attach to an existing PID, or launch a program. */
static const char *doc = "strace -- trace syscalls of a process";
static const char *args_doc = "-p PID\nPROGRAM [ARGS]\n";

/* Option table for argp; the keys here are the cases handled in
 * parse_strace_opt().  Entries that carry only an empty doc string are used as
 * separators between groups of options in the help output (argp treats an
 * entry without name and key as a group header -- see the argp docs). */
static struct argp_option argp_opts[] = {
	{ .name = "output", .key = 'o', .arg = "FILE",
	  .doc = "Print output to file (default stderr)" },
	{ .name = "pid", .key = 'p', .arg = "PID",
	  .doc = "Process to attach to" },
	{ .name = "follow", .key = 'f',
	  .doc = "Trace children" },
	{ .name = "verbose", .key = 'v',
	  .doc = "Print extra info, e.g. syscalls we're tracing" },

	{ .doc = "" },
	{ .name = "traceset", .key = 'e', .arg = "TRACE_SET",
	  .doc = "Comma-separated list of syscalls by name (e.g. openat) and "
	         "sets to trace.  Use '!' to negate (might need to escape the "
	         "'!'), and traces are handled in order "
	         "(e.g. -e path,\\!openat)\n" },
	/* Documentation-only entry: keep this list in sync with all_trace_sets */
	{ .name = "    Available sets:",
	  .flags = OPTION_DOC | OPTION_NO_USAGE,
	  .doc = "\n"
	         "- path: syscalls that take file paths\n"
	         "- fd: syscalls that take FDs\n"
	         "- file: path and fd sets\n"
	         "- mem: memory related (mmap, shared mem)\n"
	         "- life: process lifetime (create, fork)\n"
	         "- proc: anything process related (yields, pop_ctxs, life)\n"
	         "- sched: requests or yields to the kernel (all resources)\n"
	         "- vmm: syscalls mostly for VMs\n" },

	{ .doc = "" },
	{ .name = "drop", .key = 'd',
	  .doc = "Drop syscalls on overflow" },
	{ .name = "raw", .key = 'r',
	  .doc = "Raw, untranslated output, with timestamps" },
	{ .name = "time", .key = 't',
	  .doc = "Print timestamps" },
	/* Undocumented short -h; parse_strace_opt() prints the usage for it */
	{ .key = 'h', .flags = OPTION_HIDDEN },
	{ 0 }
};

/* A named group of syscalls that can be selected with -e. */
struct trace_set {
	/* Token the user passes to -e to select this set */
	char *name;
	/* Syscall numbers in the set; the list is terminated by a 0 entry */
	unsigned int syscs[];
};

/* To add a trace set, create one here, add it to all_trace_sets, and update the
 * help field in argp_opts. */

/* If you change this, update 'file' below. */
static struct trace_set path_trace_set = { "path",
	{SYS_proc_create,
	 SYS_exec,
	 SYS_openat,
	 SYS_stat,
	 SYS_lstat,
	 SYS_access,
	 SYS_link,
	 SYS_unlink,
	 SYS_symlink,
	 SYS_readlink,
	 SYS_chdir,
	 SYS_mkdir,
	 SYS_rmdir,
	 SYS_nbind,
	 SYS_nmount,
	 SYS_nunmount,
	 SYS_wstat,
	 SYS_rename,
	 0}
};

/* If you change this, update 'file' below.
 *
 * Technically tcgetattr/tcsetattr are FDs, but it's mostly noise.  This also
 * tracks openat, since that's the source for all FDs */
static struct trace_set fd_trace_set = { "fd",
	{SYS_openat,
	 SYS_mmap,
	 SYS_read,
	 SYS_write,
	 SYS_openat,
	 SYS_close,
	 SYS_fstat,
	 SYS_fcntl,
	 SYS_llseek,
	 SYS_fchdir,
	 SYS_nmount,
	 SYS_fd2path,
	 SYS_fwstat,
	 SYS_dup_fds_to,
	 SYS_tap_fds,
	 SYS_abort_sysc_fd,
	 0}
};

/* This is the manually-created contents of 'path' and 'fd' */
static struct trace_set file_trace_set = { "file",
	{/* From 'path' */
	 SYS_proc_create,
	 SYS_exec,
	 SYS_openat,
	 SYS_stat,
	 SYS_lstat,
	 SYS_access,
	 SYS_link,
	 SYS_unlink,
	 SYS_symlink,
	 SYS_readlink,
	 SYS_chdir,
	 SYS_mkdir,
	 SYS_rmdir,
	 SYS_nbind,
	 SYS_nmount,
	 SYS_nunmount,
	 SYS_wstat,
	 SYS_rename,
	 SYS_openat,
	 SYS_mmap,
	 SYS_read,
	 SYS_write,

	 /* From 'fd' */
	 SYS_openat,
	 SYS_close,
	 SYS_fstat,
	 SYS_fcntl,
	 SYS_llseek,
	 SYS_fchdir,
	 SYS_nmount,
	 SYS_fd2path,
	 SYS_fwstat,
	 SYS_dup_fds_to,
	 SYS_tap_fds,
	 SYS_abort_sysc_fd,
	 0}
};

static struct trace_set mem_trace_set = { "mem",
	{SYS_mmap,
	 SYS_mprotect,
	 SYS_munmap,
	 SYS_shared_page_alloc,
	 SYS_shared_page_free,
	 SYS_populate_va,
	 0}
};

/* These are all in 'proc'; keep them in sync. */
static struct trace_set life_trace_set = { "life",
	{SYS_proc_create,
	 SYS_proc_run,
	 SYS_proc_destroy,
	 SYS_fork,
	 SYS_exec,
	 SYS_waitpid,
	 SYS_change_to_m,
	 0}
};

/* There's some misc stuff lumped in here.  block/nanosleep are signs of threads
 * sleeping on something, which is usually useful.  aborting syscs are also
 * often signs of control flow. */
static struct trace_set proc_trace_set = { "proc",
	{SYS_block,
	 SYS_nanosleep,
	 SYS_getpcoreid,
	 SYS_getvcoreid,
	 SYS_proc_yield,
	 SYS_change_vcore,
	 SYS_notify,
	 SYS_self_notify,
	 SYS_send_event,
	 SYS_vc_entry,
	 SYS_halt_core,
	 SYS_pop_ctx,
	 SYS_abort_sysc,
	 SYS_abort_sysc_fd,

	 /* From 'life' */
	 SYS_proc_create,
	 SYS_proc_run,
	 SYS_proc_destroy,
	 SYS_fork,
	 SYS_exec,
	 SYS_waitpid,
	 SYS_change_to_m,
	 0}
};

static struct trace_set sched_trace_set = { "sched",
	{SYS_provision,
	 SYS_proc_yield,
	 SYS_poke_ksched,
	 0}
};

static struct trace_set vmm_trace_set = { "vmm",
	{SYS_vmm_setup,
	 SYS_vmm_poke_guest,
	 SYS_pop_ctx,
	 0}
};

/* Every trace set selectable by name with -e.  handle_trace_set() scans this
 * table in order and applies the first entry whose name matches the token. */
static struct trace_set *all_trace_sets[] = {
	&path_trace_set,
	&fd_trace_set,
	&file_trace_set,
	&mem_trace_set,
	&life_trace_set,
	&proc_trace_set,
	&sched_trace_set,
	&vmm_trace_set,
};

/* Bitmask indexed by syscall number, sized for MAX_SYSCALL_NR.  Bits are set or
 * cleared while the -e argument is processed, and main() writes the whole mask
 * to /proc/PID/strace_traceset when -e was given. */
static DECL_BITMASK(traceset_bm, MAX_SYSCALL_NR);


/* argp parser callback: records each option in the struct strace_opts passed
 * as the argp input (state->input).
 *
 * @key: option key from argp_opts, or one of the ARGP_KEY_* events
 * @arg: the option's argument, if it takes one
 * @state: argp parsing state
 *
 * Returns 0 when the key was handled, ARGP_ERR_UNKNOWN otherwise.  Invalid
 * input is reported through argp_error(); failing to open the -o file exits
 * the program directly. */
static error_t parse_strace_opt(int key, char *arg, struct argp_state *state)
{
	struct strace_opts *s_opts = state->input;
	char *end;
	long pid;

	switch (key) {
	case 'o':
		s_opts->outfile = fopen(arg, "wb");
		if (!s_opts->outfile) {
			fprintf(stderr, "Unable to open file '%s' for writing: %s\n",
					arg, strerror(errno));
			exit(1);
		}
		break;
	case 'e':
		s_opts->trace_set = arg;
		break;
	case 'p':
		/* strtol instead of atoi, so that trailing junk ("12abc"), negative
		 * values and out-of-range numbers are rejected instead of being
		 * silently truncated or mistaken for pid 0. */
		errno = 0;
		pid = strtol(arg, &end, 10);
		if (errno || end == arg || *end != '\0' || pid < 0 ||
		    pid != (int)pid) {
			argp_error(state, "Invalid PID '%s'", arg);
			break;
		}
		if (!pid) {
			argp_error(state, "Cannot trace pid 0 (won't exist)");
			break;
		}
		s_opts->pid = pid;
		break;
	case 'f':
		s_opts->follow_children = TRUE;
		break;
	case 'v':
		s_opts->verbose = TRUE;
		break;
	case 'r':
		s_opts->raw_output = TRUE;
		break;
	case 't':
		s_opts->with_time = TRUE;
		break;
	case 'd':
		s_opts->drop_overflow = TRUE;
		break;
	case ARGP_KEY_ARG:
		/* The first non-option argument is the program to run.  It and
		 * everything after it becomes the child's argv; none of it is parsed
		 * as an strace option. */
		if (s_opts->pid)
			argp_error(state, "PID already set, can't launch a process too");
		/* state->next already points past 'arg', hence the + 1 */
		s_opts->cmd_argc = state->argc - state->next + 1;
		s_opts->cmd_argv = malloc(sizeof(char*) * (s_opts->cmd_argc + 1));
		assert(s_opts->cmd_argv);
		s_opts->cmd_argv[0] = arg;
		memcpy(&s_opts->cmd_argv[1], &state->argv[state->next],
		       sizeof(char*) * (s_opts->cmd_argc - 1));
		s_opts->cmd_argv[s_opts->cmd_argc] = NULL;
		/* Tell argp that all remaining arguments have been consumed */
		state->next = state->argc;
		break;
	case ARGP_KEY_END:
		if (!(s_opts->cmd_argc || s_opts->pid))
			argp_error(state, "Need either -p or a command to run");
		/* Note we never fclose outfile.  It'll flush when we exit.  o/w, we'll
		 * need to be careful whether we're closing stderr or not. */
		if (!s_opts->outfile)
			s_opts->outfile = stderr;
		break;
	case 'h':
		argp_usage(state);
		break;
	default:
		return ARGP_ERR_UNKNOWN;
	}
	return 0;
}

/* Applies a named trace set to traceset_bm.
 *
 * @tok: candidate set name (a single -e token, without any leading '!')
 * @clear: TRUE to clear the set's syscalls from the bitmask, FALSE to set them
 *
 * Returns TRUE if tok names a set in all_trace_sets (the first match is
 * applied), FALSE if no set has that name. */
static bool handle_trace_set(char *tok, bool clear)
{
	struct trace_set *match = NULL;
	unsigned int nr;

	for (int i = 0; i < COUNT_OF(all_trace_sets); i++) {
		if (strcmp(all_trace_sets[i]->name, tok) == 0) {
			match = all_trace_sets[i];
			break;
		}
	}
	if (!match)
		return FALSE;

	/* The list ends at its first 0 entry; MAX_SYSCALL_NR caps the walk */
	for (int j = 0; j < MAX_SYSCALL_NR; j++) {
		nr = match->syscs[j];
		if (nr == 0)
			break;
		if (clear)
			CLR_BITMASK_BIT(traceset_bm, nr);
		else
			SET_BITMASK_BIT(traceset_bm, nr);
	}
	return TRUE;
}

/* Maps a user-friendly syscall alias to the name it is looked up under.
 * Currently only "open" is aliased (to "openat"); any other token is returned
 * unchanged, as the very same pointer. */
static char *resolve_syscall_alias(char *tok)
{
	return strcmp(tok, "open") == 0 ? "openat" : tok;
}

/* Applies a single syscall, given by name, to traceset_bm.
 *
 * @tok: syscall name or alias (a single -e token, without any leading '!')
 * @clear: TRUE to clear the syscall's bit, FALSE to set it
 *
 * Returns TRUE if the (alias-resolved) name was found in __syscall_tbl, FALSE
 * otherwise.  The bit index is the syscall's position in that table. */
static bool handle_raw_syscall(char *tok, bool clear)
{
	/* The alias does not depend on the table entry, so resolve it once up
	 * front rather than on every iteration. */
	tok = resolve_syscall_alias(tok);

	for (int i = 0; i < __syscall_tbl_sz; i++) {
		/* Unused syscall numbers have no name */
		if (!__syscall_tbl[i])
			continue;
		if (strcmp(__syscall_tbl[i], tok))
			continue;
		if (clear)
			CLR_BITMASK_BIT(traceset_bm, i);
		else
			SET_BITMASK_BIT(traceset_bm, i);
		return TRUE;
	}
	return FALSE;
}

/* Builds traceset_bm from the -e argument.
 *
 * @trace_set: comma-separated list of set names and syscall names, or NULL if
 *             -e was not given.  The string is modified in place by strtok_r.
 *
 * Tokens are applied in order.  A token prefixed with '!' clears the matching
 * syscalls from the bitmask; any other token sets them.  A token that is
 * neither a known set nor a known syscall aborts the program. */
static void build_ignore_list(char *trace_set)
{
	char *tok, *tok_save = 0;

	if (!trace_set) {
		if (opts.verbose)
			fprintf(stderr, "# Tracing all syscalls\n");
		return;
	}
	for (tok = strtok_r(trace_set, ",", &tok_save);
	     tok;
		 tok = strtok_r(NULL, ",", &tok_save)) {
		/* Negation applies to this token only, so it must start out FALSE
		 * for every token.  Otherwise one '!' would also negate everything
		 * listed after it (e.g. the 'fd' in "path,!openat,fd"). */
		bool clear = FALSE;

		if (tok[0] == '!') {
			clear = TRUE;
			tok++;
		}
		if (handle_trace_set(tok, clear))
			continue;
		if (handle_raw_syscall(tok, clear))
			continue;
		/* You could imagine continuing, but this error would probably be
		 * missed in the output stream and we'd be wondering why we weren't
		 * getting a syscall, due to a typo. */
		fprintf(stderr, "Unknown trace_set argument %s, aborting!\n",
		        tok);
		exit(-1);
	}
	if (opts.verbose) {
		for (int i = 0; i < MAX_SYSCALL_NR; i++) {
			if (GET_BITMASK_BIT(traceset_bm, i))
				fprintf(stderr, "# Tracing syscall %s (%d)\n",
				        __syscall_tbl[i], i);
		}
	}
}

/* Reads trace records from fd (main() passes the opened /proc/PID/strace) and
 * prints them to opts.outfile until read() returns 0 or fails.
 *
 * Each successful read() is treated as one record.  With -r the record is
 * printed as is; with -t the timestamp is kept; otherwise the leading
 * timestamp region is dropped and only the E/X marker from column 0 is kept in
 * front of the rest of the record. */
static void parse_traces(int fd)
{
	/* Offset at which the text after the timestamp starts.
	 * NOTE(review): this must match the kernel's record format -- confirm. */
	const ssize_t ts_skip = 40;
	char *line, *out;
	ssize_t ret;

	/* One spare byte, so that a read that fills the whole buffer can still be
	 * NUL-terminated without overwriting its final newline. */
	line = malloc(SYSTR_BUF_SZ + 1);
	assert(line);

	while ((ret = read(fd, line, SYSTR_BUF_SZ)) > 0) {
		/* make sure each line ends in \n\0. */
		line[ret - 1] = '\n';
		line[ret] = 0;
		out = line;
		/* Only strip the timestamp when the record actually extends past
		 * it.  Otherwise we would print bytes beyond what was just read:
		 * stale data from an earlier record or uninitialized memory. */
		if (!opts.raw_output && !opts.with_time && ret > ts_skip) {
			/* move starting E or X marker */
			line[ts_skip] = line[0];
			/* skip over the timestamp */
			out = &line[ts_skip];
		}
		fprintf(opts.outfile, "%s", out);
	}
	/* This is a little hokey.  If the process exited, then the qio hung up and
	 * we got a status message from the kernel.  This was the errstr of the last
	 * failed read.  However, if we're doing a -p and someone kills *us*, we'll
	 * never see this.  And catching the signal doesn't help either.  The
	 * process needs to exit (strace_shutdown).  Either that, or change the
	 * kernel to set_errstr() on close(), and coordinate with a sighandler. */
	if (opts.verbose)
		fprintf(stderr, "%r\n");
	free(line);
}

/* Entry point: parses the options, optionally creates the child process,
 * configures tracing through the target's /proc/PID files, then streams the
 * trace records to the output until the trace file stops producing data.
 *
 * Exits with a non-zero status if the child cannot be spawned or if any of the
 * /proc files cannot be opened or fully written. */
int main(int argc, char **argv, char **envp)
{
	int fd;
	pid_t pid;
	size_t len;
	static char path[2 * MAX_PATH_LEN];
	struct syscall sysc;
	struct argp argp = {argp_opts, parse_strace_opt, args_doc, doc};

	argp_parse(&argp, argc, argv, ARGP_IN_ORDER, 0, &opts);

	build_ignore_list(opts.trace_set);

	if (opts.cmd_argc) {
		pid = create_child_with_stdfds(opts.cmd_argv[0], opts.cmd_argc,
		                               opts.cmd_argv, envp);
		if (pid < 0) {
			perror("Unable to spawn child");
			exit(-1);
		}
		/* We need to wait on the child asynchronously.  If we hold a ref (as
		 * the parent), the child won't proc_free and that won't hangup/wake us
		 * from a read. */
		syscall_async(&sysc, SYS_waitpid, pid, NULL, 0, 0, 0, 0);
	} else {
		pid = opts.pid;
	}

	/* Turn tracing on via the ctl file.  'path' is reused below as the buffer
	 * for the command strings written to it. */
	snprintf(path, sizeof(path), "/proc/%d/ctl", pid);
	fd = open(path, O_WRITE);
	if (fd < 0) {
		fprintf(stderr, "open %s: %r\n", path);
		exit(1);
	}
	if (opts.follow_children)
		snprintf(path, sizeof(path), "straceall");
	else
		snprintf(path, sizeof(path), "straceme");
	len = strlen(path);
	/* write() returns a signed count and -1 on error.  Comparing it with '<'
	 * against an unsigned length would convert -1 to a huge value and miss
	 * the failure, so compare for equality in the signed type instead. */
	if (write(fd, path, len) != (ssize_t)len) {
		fprintf(stderr, "write to ctl %s: %r\n", path);
		exit(1);
	}
	if (opts.drop_overflow) {
		snprintf(path, sizeof(path), "strace_drop on");
		len = strlen(path);
		if (write(fd, path, len) != (ssize_t)len) {
			fprintf(stderr, "write to ctl %s: %r\n", path);
			exit(1);
		}
	}
	close(fd);

	/* Hand over the syscall filter, but only if -e was given */
	if (opts.trace_set) {
		snprintf(path, sizeof(path), "/proc/%d/strace_traceset", pid);
		fd = open(path, O_RDWR);
		if (fd < 0) {
			fprintf(stderr, "open %s: %r\n", path);
			exit(1);
		}
		len = COUNT_OF(traceset_bm);
		if (write(fd, traceset_bm, len) != (ssize_t)len) {
			fprintf(stderr, "write to %s: %r\n", path);
			exit(1);
		}
		close(fd);
	}

	snprintf(path, sizeof(path), "/proc/%d/strace", pid);
	fd = open(path, O_READ);
	/* open() reports failure with a negative value; 0 is a valid FD */
	if (fd < 0) {
		fprintf(stderr, "open %s: %r\n", path);
		exit(1);
	}

	if (opts.cmd_argc) {
		/* now that we've set up the tracing, we can run the process.  isn't it
		 * great that the process doesn't immediately start when you make it? */
		sys_proc_run(pid);
	}

	parse_traces(fd);
	return 0;
}
