David Brazdil | 6c63a26 | 2019-12-23 13:23:46 +0000 | [diff] [blame] | 1 | #!/usr/bin/env python3 |
| 2 | # |
David Brazdil | 5e0484e | 2019-08-07 15:06:57 +0100 | [diff] [blame] | 3 | # Copyright 2019 The Hafnium Authors. |
| 4 | # |
Andrew Walbran | e959ec1 | 2020-06-17 15:01:09 +0100 | [diff] [blame^] | 5 | # Use of this source code is governed by a BSD-style |
| 6 | # license that can be found in the LICENSE file or at |
| 7 | # https://opensource.org/licenses/BSD-3-Clause. |
David Brazdil | 5e0484e | 2019-08-07 15:06:57 +0100 | [diff] [blame] | 8 | |
"""Script which parses the output of `strace` and dumps a list of files
that were touched by the traced processes outside of whitelisted folders.
It assumes that strace was invoked with the following arguments:
    -e trace=%file,chdir,%process   record required syscalls
    -qq                             silence 'exit code' records
    -o <file>                       output format is different when writing
                                    to a file from printing to the console
"""
| 17 | |
| 18 | import argparse |
| 19 | import os |
| 20 | import sys |
| 21 | |
# Syscalls that spawn a new process; their result is parsed as the child's PID.
FORK_SYSCALLS = ["clone", "fork", "vfork"]

# Syscalls whose path argument is recorded as a touched file.
OPEN_SYSCALLS = [
    "access", "creat", "lstat", "mkdir",
    "open", "openat", "readlink", "stat",
]
| 37 | |
def get_unfinished(line):
    """Return the part of `line` that precedes the `<unfinished ...>` marker.

    Returns None when `line` does not contain the marker.
    """
    head, marker, _ = line.partition("<unfinished ...>")
    return head if marker else None
| 44 | |
def get_resumed(line):
    """Return the part of `line` that follows the first ` resumed>` marker.

    Returns None when `line` does not contain the marker.
    """
    _, marker, tail = line.partition(" resumed>")
    return tail if marker else None
| 51 | |
def merge_unfinished_lines(lines):
    """Stitch together records that strace split around an interruption.

    A record ending in `<unfinished ...>` is kept open until the same PID
    emits a `... resumed>` line, whose remainder is appended to it. The
    returned list keeps records in the order in which they were started.
    """
    merged = []

    # PID -> index in `merged` of the record that PID still has open.
    pending = {}

    for line in lines:
        pid = int(line.split()[0])
        tail = get_resumed(line)

        if tail is None:
            # A fresh record: this PID must not already have one open.
            assert(pid not in pending)
            head = get_unfinished(line)
            if head is None:
                merged.append(line)
            else:
                # Remember where the open record lives so it can be extended.
                pending[pid] = len(merged)
                merged.append(head)
            continue

        # Continuation of a record this PID started earlier.
        assert(pid in pending)
        head = get_unfinished(tail)
        if head is None:
            merged[pending[pid]] += tail
            del pending[pid]
        else:
            # Interrupted again: append this piece and keep the record open.
            merged[pending[pid]] += head
    return merged
| 82 | |
def abs_path(cwd, path):
    """Return `path` as a normalized absolute path.

    A relative `path` (including the empty string) is resolved against
    `cwd`. The result is first normalized lexically and then passed through
    `os.path.realpath`, so symlinks that exist on the machine running this
    script are resolved.
    """
    # `isabs` also copes with an empty path, which a traced program may
    # legitimately pass to a syscall; indexing `path[0]` would raise.
    if not os.path.isabs(path):
        path = os.path.join(cwd, path)
    return os.path.realpath(os.path.abspath(path))
| 93 | |
def _split_syscall_args(text):
    """Split an argument list on commas that lie outside double-quoted strings."""
    args = []
    current = []
    in_quotes = False
    escaped = False
    for char in text:
        if escaped:
            # Character following a backslash inside a string: keep verbatim.
            escaped = False
        elif char == "\\":
            escaped = in_quotes
        elif char == '"':
            in_quotes = not in_quotes
        elif char == "," and not in_quotes:
            args.append("".join(current).strip())
            current = []
            continue
        current.append(char)
    args.append("".join(current).strip())
    return args


def get_touched_files(lines, orig_cwd):
    """Parse strace output and return all files that an open()-like syscall was
    called on.

    `lines` are complete records of the form `<pid> <syscall info>`;
    `orig_cwd` is the working directory assigned to the first PID seen.
    Returns a set of `(absolute_path, executable)` tuples, where `executable`
    is the first argument of the last `execve` recorded for the calling
    process (inherited across forks).

    Raises ValueError for an `openat` of a relative path against a directory
    descriptor other than AT_FDCWD, which cannot be resolved from the trace.
    """
    files = set()

    # Map from PID to the current working directory.
    cwd = {}

    # Map from PID to executable name.
    executable = {}

    # Map from PID to the PID of the process which forked it.
    fork_of = {}

    first_pid = True
    for line in lines:
        # Split line: <pid> <syscall info>. Only the first run of whitespace
        # is consumed so that spaces inside quoted paths are preserved.
        fields = line.split(None, 1)
        if not fields:
            # Blank line: nothing to parse.
            continue
        pid = int(fields[0])
        syscall = fields[1].strip() if len(fields) > 1 else ""

        # If seeing a PID for the first time, derive its working directory
        # from its parent.
        if pid not in cwd:
            if first_pid:
                # Very first line of strace output. Set working directory from
                # command line arguments (should match cwd of strace).
                first_pid = False
                cwd[pid] = orig_cwd
            else:
                # There should have been a fork/clone syscall which spawned
                # this process. Inherit its working directory.
                cwd[pid] = cwd[fork_of[pid]]

        # We are looking for lines which match:
        #     name(arg1, arg2, ..., argN) = result
        # The last ')' must come before the last '='. A result followed by a
        # parenthesised description, e.g.
        # `= -1 ENOENT (No such file or directory)`, fails this check, so
        # such lines are skipped.
        left_bracket = syscall.find("(")
        right_bracket = syscall.rfind(")")
        assign_sign = syscall.rfind("=")
        if left_bracket < 0 or right_bracket < 0 or assign_sign < right_bracket:
            continue

        syscall_name = syscall[:left_bracket]
        syscall_result = syscall[assign_sign + 2:]

        # Quote-aware split: a comma inside a quoted path must not start a
        # new argument.
        syscall_args = _split_syscall_args(
            syscall[left_bracket + 1:right_bracket])

        if syscall_name in FORK_SYSCALLS:
            # If this is a fork, keep track of the parent-child relationship.
            # The child's PID is the syscall's return code.
            new_pid = int(syscall_result)
            fork_of[new_pid] = pid
            executable[new_pid] = executable[pid]
        elif syscall_name == "chdir":
            # If this is a change of working directory, keep track of it.
            # It is in the first argument in quotes.
            new_dir = syscall_args[0][1:-1]
            cwd[pid] = abs_path(cwd[pid], new_dir)
        elif syscall_name == "execve":
            # If this is executing a new program, record its name.
            # It is in the first argument in quotes.
            binary_name = syscall_args[0][1:-1]
            executable[pid] = binary_name
        elif syscall_name in OPEN_SYSCALLS:
            # If this is a syscall touching a file, record the path. The
            # result code is not inspected here.
            # NOTE(review): the filter above already drops records whose
            # result carries a parenthesised error description, so failed
            # calls normally never reach this branch - confirm whether that
            # is intended.
            is_openat = syscall_name == "openat"
            # openat() takes the path as its second argument, resolved
            # relative to the directory given by the first.
            raw_path = syscall_args[1 if is_openat else 0][1:-1]
            if (is_openat and syscall_args[0] != "AT_FDCWD"
                    and not os.path.isabs(raw_path)):
                # The directory argument only matters for relative paths, and
                # for those only AT_FDCWD (the current working directory) can
                # be resolved from the trace.
                raise ValueError(
                    "openat() relative to a directory other than AT_FDCWD "
                    "is not supported: " + syscall)
            fname = abs_path(cwd[pid], raw_path)
            # Record the file and the name of the program which touched it.
            files.add((fname, executable[pid]))
    return files
| 173 | |
def filter_results(files, root_dir):
    """Remove paths which are whitelisted from the results.

    `files` is an iterable of `(path, executable)` tuples whose paths are
    absolute and canonical. `root_dir` is canonicalised the same way before
    comparison, so a trailing slash, a relative path or a symlinked root
    still matches. Returns the remaining tuples as a list.
    """
    # `join(..., "")` appends exactly one trailing separator, so a sibling
    # such as "<root>-old/" is not mistaken for a path inside the root.
    root_prefix = os.path.join(
        os.path.realpath(os.path.abspath(root_dir)), "")
    whitelisted = (
        # Anything in the Hafnium directory is allowed.
        root_prefix,
        # Clang puts intermediate files in /tmp.
        "/tmp/",
    )
    return [entry for entry in files if not entry[0].startswith(whitelisted)]
| 181 | |
def main(args):
    """Read strace output from stdin and print the non-whitelisted files.

    `args` is the full argument vector including the program name (as in
    `sys.argv`); `sys.argv` is used when it is None. The single positional
    argument is the Hafnium root directory; unrecognised arguments are
    accepted and ignored. One line per result is printed to stdout in the
    form `<path> (<executable>)`, sorted.
    """
    argv = sys.argv if args is None else args

    parser = argparse.ArgumentParser()
    parser.add_argument("root_dir",
                        help="Root directory of Hafnium, cwd of strace")
    options, _ = parser.parse_known_args(argv[1:])

    # Drop blank lines up front: the parsers expect every record to start
    # with a PID.
    stripped = (line.strip() for line in sys.stdin)
    records = [line for line in stripped if line]

    records = merge_unfinished_lines(records)
    files = get_touched_files(records, options.root_dir)
    files = sorted(filter_results(files, options.root_dir))

    print("\n".join("{} ({})".format(path, exe) for path, exe in files))
| 195 | |
# Entry point when the file is executed as a script.
if __name__ == "__main__":
    main(sys.argv)