blob: 9fb942c68eb9f6432ab8d3dc87890aef44622c73 [file] [log] [blame]
David Brazdil6c63a262019-12-23 13:23:46 +00001#!/usr/bin/env python3
2#
David Brazdil5e0484e2019-08-07 15:06:57 +01003# Copyright 2019 The Hafnium Authors.
4#
Andrew Walbrane959ec12020-06-17 15:01:09 +01005# Use of this source code is governed by a BSD-style
6# license that can be found in the LICENSE file or at
7# https://opensource.org/licenses/BSD-3-Clause.
David Brazdil5e0484e2019-08-07 15:06:57 +01008
"""Script which parses the output of `strace` and dumps a list of files
that were touched by the traced processes outside of whitelisted folders.
It assumes that strace was invoked with the following arguments:
    -e trace=%file,chdir,%process    record required syscalls
    -qq                              silence 'exit code' records
    -o <file>                        output format is different when writing
                                     to a file from printing to the console
"""
17
18import argparse
19import os
20import sys
21
# Syscalls which spawn a new process or thread. On success they return the
# PID of the child, which is used to link the child to its parent.
FORK_SYSCALLS = [
    "clone",
    # Newer variant of clone(); it also returns the child's PID on success.
    "clone3",
    "fork",
    "vfork",
]

# Syscalls whose path argument is recorded as a touched file.
OPEN_SYSCALLS = [
    "access",
    "creat",
    "lstat",
    "mkdir",
    "open",
    "openat",
    "readlink",
    "stat",
]
37
def get_unfinished(line):
    """Return the part of `line` preceding the first "<unfinished ...>" marker.

    Returns None if `line` does not contain the marker.
    """
    head, marker, _ = line.partition("<unfinished ...>")
    # partition() leaves the separator slot empty when nothing was found.
    return head if marker else None
44
def get_resumed(line):
    """Return the part of `line` following the first " resumed>" marker.

    Returns None if `line` does not contain the marker.
    """
    _, marker, tail = line.partition(" resumed>")
    # partition() leaves the separator slot empty when nothing was found.
    return tail if marker else None
51
def merge_unfinished_lines(lines):
    """Join records which strace split because another syscall interrupted them.

    Each item of `lines` must start with a PID. A record cut short by an
    "<unfinished ...>" marker is stitched together with the later
    "... resumed>" record(s) of the same PID. The returned list keeps the
    records in the order in which they were started.
    """
    # Output records, ordered by when each one was started.
    merged = []

    # PID -> index in `merged` of the record that PID has yet to complete.
    pending = {}

    for line in lines:
        pid = int(line.split()[0])
        tail = get_resumed(line)

        if tail is None:
            # A brand new record: the PID must not have one in flight.
            assert pid not in pending
            head = get_unfinished(line)
            if head is None:
                merged.append(line)
            else:
                # Remember where the record lives so it can be completed.
                pending[pid] = len(merged)
                merged.append(head)
            continue

        # Continuation of a record this PID started earlier.
        assert pid in pending
        slot = pending[pid]
        head = get_unfinished(tail)
        if head is None:
            # The record is complete now; the PID has nothing pending.
            merged[slot] += tail
            del pending[pid]
        else:
            # Interrupted again: append this piece and keep waiting.
            merged[slot] += head
    return merged
82
def abs_path(cwd, path):
    """If `path` is relative, resolve it against the current working directory.
    Also normalize the resulting path and resolve symbolic links in it.

    Args:
        cwd: Directory which a relative `path` is resolved against.
        path: Absolute or relative path. An empty path resolves to `cwd`.

    Returns:
        The absolute, normalized path with symbolic links resolved.
    """
    # startswith() rather than indexing `path[0]`, so that an empty path is
    # treated as relative instead of raising IndexError.
    if not path.startswith('/'):
        path = os.path.join(cwd, path)
    # Collapse "." and ".." lexically first, then resolve symbolic links.
    path = os.path.abspath(path)
    path = os.path.realpath(path)
    return path
93
def _split_syscall_args(arg_string):
    """Split a syscall's argument text on the commas outside quoted strings."""
    args = []
    current = []
    in_string = False
    escaped = False
    for char in arg_string:
        if in_string:
            current.append(char)
            if escaped:
                # Character following a backslash never terminates the string.
                escaped = False
            elif char == "\\":
                escaped = True
            elif char == '"':
                in_string = False
        elif char == ",":
            args.append("".join(current).strip())
            current = []
        else:
            if char == '"':
                in_string = True
            current.append(char)
    args.append("".join(current).strip())
    return args


def get_touched_files(lines, orig_cwd):
    """Parse strace output and return all files that an open()-like syscall was
    called on.

    Args:
        lines: Complete strace records, each of the form
            "<pid> <syscall info>". Blank records are ignored.
        orig_cwd: Working directory assigned to the first PID encountered.

    Returns:
        A set of (absolute path, executable name) tuples.

    Raises:
        KeyError: if a PID other than the first one appears without an
            earlier fork-like syscall that returned it, or if a process
            forks or touches a file while no executable name is known
            for it.
        AssertionError: if openat() is given a relative path together with a
            directory other than AT_FDCWD.
    """
    files = set()

    # Map from PID to the current working directory.
    cwd = {}

    # Map from PID to executable name
    executable = {}

    # Map from PID to the PID of the process which forked it.
    fork_of = {}

    first_pid = True
    for line in lines:
        # Split line: <pid> <syscall info>. Only the first run of whitespace
        # is split on, so whitespace inside quoted paths is left untouched.
        fields = line.split(None, 1)
        if not fields:
            continue
        pid = int(fields[0])
        syscall = fields[1].strip() if len(fields) > 1 else ""

        # If seeing a PID for the first time, derive its working directory
        # from its parent.
        if pid not in cwd:
            if first_pid:
                # Very first line of strace output. Set working directory from
                # command line arguments (should match cwd of strace).
                first_pid = False
                cwd[pid] = orig_cwd
            else:
                # There should have been a fork/clone syscall which spawned
                # this process. Inherit its working directory.
                cwd[pid] = cwd[fork_of[pid]]

        # We are looking for lines which match:
        #     name(arg1, arg2, ..., argN) = result
        # i.e. the last '=' must come after the last ')'.
        left_bracket = syscall.find("(")
        right_bracket = syscall.rfind(")")
        assign_sign = syscall.rfind("=")
        if left_bracket < 0 or right_bracket < 0 or assign_sign < right_bracket:
            continue

        syscall_name = syscall[:left_bracket]
        syscall_result = syscall[assign_sign + 2:]

        # Quoted arguments are kept in one piece, so a path which contains a
        # comma is not cut short.
        syscall_args = _split_syscall_args(
            syscall[left_bracket + 1:right_bracket])

        if syscall_name in FORK_SYSCALLS:
            # If this is a fork, keep track of the parent-child relationship.
            # The child's PID is the syscall's return code.
            new_pid = int(syscall_result)
            fork_of[new_pid] = pid
            executable[new_pid] = executable[pid]
        elif syscall_name == "chdir":
            # If this is a change of working directory, keep track of it.
            # It is in the first argument in quotes.
            new_dir = syscall_args[0][1:-1]
            cwd[pid] = abs_path(cwd[pid], new_dir)
        elif syscall_name == "execve":
            # If this is executing a new program, record its name.
            # It is in the first argument in quotes.
            binary_name = syscall_args[0][1:-1]
            executable[pid] = binary_name
        elif syscall_name in OPEN_SYSCALLS:
            # If this is a syscall touching a file, record the path. The
            # result value itself is not inspected here.
            # NOTE(review): a result followed by a parenthesised description,
            # e.g. "-1 ENOENT (No such file or directory)", does not pass the
            # shape check above, so such calls are never recorded. Confirm
            # whether that is intended.
            arg_idx = 0
            if syscall_name == "openat":
                # openat() can open a file (second arg) relative to a given
                # folder (first arg).
                arg_idx = 1
            path = syscall_args[arg_idx][1:-1]
            if arg_idx == 1 and not path.startswith("/"):
                # The folder only matters for relative paths, and then only
                # AT_FDCWD (the current working directory) is supported.
                assert syscall_args[0] == "AT_FDCWD", (
                    "unsupported openat() directory: {}".format(syscall))
            fname = abs_path(cwd[pid], path)
            # Record the file and the name of the program which touched it.
            files.add((fname, executable[pid]))
    return files
173
def filter_results(files, root_dir):
    """Remove paths which are whitelisted from the results.

    Args:
        files: Iterable of (path, executable name) tuples.
        root_dir: Root directory of the source tree. It is canonicalized
            before comparing, so a trailing slash or a symbolic link in it
            does not defeat the whitelist.

    Returns:
        A list of the tuples whose path is neither inside `root_dir` nor
        inside /tmp.
    """
    # os.path.join(..., "") appends exactly one trailing separator, which
    # restricts the match to paths strictly inside the directory.
    root_prefix = os.path.join(os.path.realpath(root_dir), "")
    allowed_prefixes = (
        # Anything in the Hafnium directory is allowed.
        root_prefix,
        # Clang puts intermediate files in /tmp.
        "/tmp/",
    )
    return [
        entry for entry in files
        if not entry[0].startswith(allowed_prefixes)
    ]
181
def main(args):
    """Read strace output from stdin and print the files touched outside of
    the whitelisted folders, one "<path> (<executable>)" entry per line.

    Args:
        args: Command line in `sys.argv` form, i.e. the program name followed
            by its arguments. The first positional argument is the root
            directory; unrecognized arguments are ignored.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("root_dir",
                        help="Root directory of Hafnium, cwd of strace")
    # Parse the arguments which were passed in, not an implicit sys.argv.
    options, _ = parser.parse_known_args(args[1:])

    # Strip line endings and drop blank lines, which carry no PID.
    stripped = (line.strip() for line in sys.stdin)
    lines = [line for line in stripped if line]

    lines = merge_unfinished_lines(lines)
    files = get_touched_files(lines, options.root_dir)
    files = filter_results(files, options.root_dir)

    print("\n".join(
        "{} ({})".format(path, binary) for path, binary in sorted(files)))
195
# Run only when executed as a script; main() receives the full argument
# vector, including the program name.
if __name__ == "__main__":
    main(sys.argv)