David Brazdil | 6c63a26 | 2019-12-23 13:23:46 +0000 | [diff] [blame] | 1 | #!/usr/bin/env python3 |
| 2 | # |
David Brazdil | 5e0484e | 2019-08-07 15:06:57 +0100 | [diff] [blame] | 3 | # Copyright 2019 The Hafnium Authors. |
| 4 | # |
| 5 | # Licensed under the Apache License, Version 2.0 (the "License"); |
| 6 | # you may not use this file except in compliance with the License. |
| 7 | # You may obtain a copy of the License at |
| 8 | # |
| 9 | # https://www.apache.org/licenses/LICENSE-2.0 |
| 10 | # |
| 11 | # Unless required by applicable law or agreed to in writing, software |
| 12 | # distributed under the License is distributed on an "AS IS" BASIS, |
| 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | # See the License for the specific language governing permissions and |
| 15 | # limitations under the License. |
| 16 | |
| 17 | """Script which parses the output of `strace` and dumps a list of files |
| 18 | that were touched by the traced processes outside of whitelisted folders. |
| 19 | It assumes that strace was invoked with the following arguments: |
| 20 | -e trace=%file,chdir,%process record required syscalls |
| 21 | -qq silence 'exit code' records |
| 22 | -o <file> output format is different when writing |
| 23 | to a file from printing to the console |
| 24 | """ |
| 25 | |
| 26 | import argparse |
| 27 | import os |
| 28 | import sys |
| 29 | |
# Syscalls that spawn a new process; their return value is read as the
# child's PID when tracking parent/child relationships.
FORK_SYSCALLS = ["clone", "fork", "vfork"]

# Syscalls whose path argument is recorded as a touched file. The path is the
# first argument, except for `openat` where it is the second.
OPEN_SYSCALLS = [
    "access", "creat", "lstat", "mkdir",
    "open", "openat", "readlink", "stat",
]
| 45 | |
def get_unfinished(line):
    """Return the text preceding the first "<unfinished ...>" marker.

    Returns None when `line` does not contain the marker at all. The result
    may be an empty string if the marker sits at the very start of the line.
    """
    head, marker, _ = line.partition("<unfinished ...>")
    # `marker` is empty exactly when the separator was not found.
    return head if marker else None
| 52 | |
def get_resumed(line):
    """Return the text following the first " resumed>" marker.

    Returns None when `line` does not contain the marker at all. The result
    may be an empty string if the marker ends the line.
    """
    _, marker, tail = line.partition(" resumed>")
    # `marker` is empty exactly when the separator was not found.
    return tail if marker else None
| 59 | |
def merge_unfinished_lines(lines):
    """Stitch together records that strace split around an interrupting syscall.

    A record cut short by "<unfinished ...>" is joined with the matching
    "... resumed>" continuation(s) of the same PID. The merged record keeps
    the position at which it was *started*, so the returned list is ordered
    by the start of each record.

    Every line must begin with an integer PID.
    """
    merged = []

    # PID -> index in `merged` of that PID's still-incomplete record.
    pending = {}

    for line in lines:
        pid = int(line.split()[0])
        tail = get_resumed(line)

        if tail is None:
            # A brand new record; this PID must not have one in flight.
            assert pid not in pending
            head = get_unfinished(line)
            if head is None:
                merged.append(line)
            else:
                # Remember where the partial record lives so that the
                # continuation can be appended to it later.
                pending[pid] = len(merged)
                merged.append(head)
            continue

        # A continuation; there must be a partial record to attach it to.
        assert pid in pending
        slot = pending[pid]
        head = get_unfinished(tail)
        if head is None:
            # The record is now complete.
            merged[slot] += tail
            del pending[pid]
        else:
            # Interrupted again: keep the record pending.
            merged[slot] += head
    return merged
| 90 | |
def abs_path(cwd, path):
    """Return `path` as a normalized absolute path with symlinks resolved.

    Args:
        cwd: Directory against which a relative `path` is resolved.
        path: Absolute or relative path. May be empty, in which case the
            result is `cwd` itself (normalized).

    Returns:
        The path after lexical normalization (`os.path.abspath`) followed by
        symlink resolution (`os.path.realpath`).
    """
    # Use `startswith` rather than indexing `path[0]` so that an empty path
    # (e.g. a syscall invoked with "") does not raise IndexError.
    if not path.startswith('/'):
        path = os.path.join(cwd, path)
    return os.path.realpath(os.path.abspath(path))
| 101 | |
def get_touched_files(lines, orig_cwd):
    """Collect the files named in open()-like syscalls of an strace log.

    `lines` are complete strace records of the form `<pid> <syscall info>`.
    `orig_cwd` is used as the working directory of the first PID seen.

    Returns a set of `(absolute_path, executable)` tuples, where `executable`
    is the first argument of the most recent execve() recorded for the
    process (inherited from the parent across fork-like syscalls).
    """
    touched = set()

    # Per-PID bookkeeping: working directory, executable name, forking parent.
    cwd_of = {}
    exe_of = {}
    parent_of = {}

    seen_any_pid = False
    for raw in lines:
        # Record layout: <pid> <syscall info>. Runs of whitespace in the
        # syscall info collapse to single spaces here.
        fields = raw.split()
        pid = int(fields[0])
        call = " ".join(fields[1:])

        # A PID we have not met yet needs an initial working directory.
        if pid not in cwd_of:
            if seen_any_pid:
                # Must have been spawned by a fork-like syscall recorded
                # earlier; start in the parent's working directory.
                cwd_of[pid] = cwd_of[parent_of[pid]]
            else:
                # Very first record of the log: use the directory supplied
                # by the caller (should match the cwd of strace).
                seen_any_pid = True
                cwd_of[pid] = orig_cwd

        # Only records shaped like `name(arg1, ..., argN) = result` are of
        # interest. The closing bracket is the *last* ')' of the record, so a
        # result which itself ends in a bracketed note, for example
        # `-1 ENOENT (No such file or directory)`, fails this check and the
        # record is skipped.
        open_paren = call.find("(")
        close_paren = call.rfind(")")
        equals = call.rfind("=")
        well_formed = (open_paren >= 0 and close_paren >= 0
                       and equals >= close_paren)
        if not well_formed:
            continue

        name = call[:open_paren]
        result = call[equals + 2:]
        args = [arg.strip()
                for arg in call[open_paren + 1:close_paren].split(",")]

        if name in FORK_SYSCALLS:
            # The return value is the child's PID. Remember who spawned it
            # and let it inherit the parent's executable name.
            child = int(result)
            parent_of[child] = pid
            exe_of[child] = exe_of[pid]
        elif name == "chdir":
            # New working directory: first argument, surrounding quotes
            # stripped.
            cwd_of[pid] = abs_path(cwd_of[pid], args[0][1:-1])
        elif name == "execve":
            # New program image: first argument, surrounding quotes stripped.
            exe_of[pid] = args[0][1:-1]
        elif name in OPEN_SYSCALLS:
            # A syscall touching a file. The result value is not inspected.
            if name == "openat":
                # openat() takes a directory as its first argument and the
                # path as its second. Only AT_FDCWD (resolve against the
                # current working directory) is supported.
                assert args[0] == "AT_FDCWD"
                path_arg = args[1]
            else:
                path_arg = args[0]
            # Record the file together with the program which touched it.
            touched.add((abs_path(cwd_of[pid], path_arg[1:-1]), exe_of[pid]))
    return touched
| 181 | |
def filter_results(files, root_dir):
    """Remove whitelisted paths from the results.

    Args:
        files: Iterable of `(path, executable)` tuples.
        root_dir: Root directory of Hafnium. A trailing slash or a symlinked
            spelling of the directory is accepted.

    Returns:
        A list of the entries whose path lies neither under `root_dir` nor
        under /tmp, in the iteration order of `files`.
    """
    # The recorded paths were canonicalized with os.path.realpath(), so give
    # the root the same treatment before comparing prefixes. Joining with ""
    # appends exactly one trailing separator.
    root_prefix = os.path.join(os.path.realpath(root_dir), "")
    allowed_prefixes = (
        # Anything in the Hafnium directory is allowed.
        root_prefix,
        # Clang puts intermediate files in /tmp.
        "/tmp/",
    )
    return [entry for entry in files
            if not entry[0].startswith(allowed_prefixes)]
| 189 | |
def main(args):
    """Read strace output from stdin and print the non-whitelisted files.

    One line is printed per `(path, executable)` pair, sorted, in the form
    `path (executable)`.

    Args:
        args: Full argument vector, program name at index 0 (the script
            entry point passes `sys.argv`). If None, `sys.argv` is used.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("root_dir",
                        help="Root directory of Hafnium, cwd of strace")
    # Parse the vector we were handed instead of silently falling back to
    # sys.argv. Unrecognized extra arguments are tolerated and ignored.
    argv = sys.argv if args is None else args
    options, _ = parser.parse_known_args(argv[1:])

    lines = [line.strip() for line in sys.stdin]
    lines = merge_unfinished_lines(lines)
    files = get_touched_files(lines, options.root_dir)
    files = filter_results(files, options.root_dir)

    print("\n".join("{} ({})".format(path, exe)
                    for path, exe in sorted(files)))
| 203 | |
# Script entry point: hand the full command line to main() when this file is
# executed directly rather than imported.
if __name__ == "__main__":
    main(sys.argv)