blob: a9712205bc463db7dd7606395de74e2dfe66c72d [file] [log] [blame]
#!/usr/bin/env python3
#
# Copyright 2019 The Hafnium Authors.
4#
5# Licensed under the Apache License, Version 2.0 (the "License");
6# you may not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# https://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16
"""Script which parses the output of `strace` and dumps a list of files
that were touched by the traced processes outside of whitelisted folders.
It assumes that strace was invoked with the following arguments:
    -e trace=%file,chdir,%process   record required syscalls
    -qq                             silence 'exit code' records
    -o <file>                       output format is different when writing
                                    to a file from printing to the console
"""
25
26import argparse
27import os
28import sys
29
# Syscalls that create a new process. get_touched_files() reads the return
# value of these calls as the PID of the newly created child.
FORK_SYSCALLS = ["clone", "fork", "vfork"]

# Syscalls whose path argument is recorded as a "touched" file by
# get_touched_files().
OPEN_SYSCALLS = [
    "access", "creat", "lstat", "mkdir",
    "open", "openat", "readlink", "stat",
]
45
def get_unfinished(line):
    """Return the text of `line` preceding the first "<unfinished ...>".

    Returns None when the marker does not occur in `line`.
    """
    head, marker, _ = line.partition("<unfinished ...>")
    # `marker` is empty exactly when the separator was not found.
    return head if marker else None
52
def get_resumed(line):
    """Return the text of `line` following the first " resumed>".

    Returns None when the marker does not occur in `line`.
    """
    _, marker, tail = line.partition(" resumed>")
    if not marker:
        return None
    return tail
59
def merge_unfinished_lines(lines):
    """Join strace records that were split by an interrupting syscall.

    Each input line must start with an integer PID. A line containing
    "<unfinished ...>" opens a pending record for its PID; a later line for
    the same PID containing " resumed>" has its remainder appended to that
    record. The merged record stays at the position where it was started.

    Returns a list of merged lines, ordered by when each record began.
    """
    merged = []

    # PID -> index in `merged` of that PID's still-open record.
    pending = {}

    for entry in lines:
        pid = int(entry.split()[0])
        tail = get_resumed(entry)

        if tail is None:
            # A fresh record; the PID must not have another one still open.
            assert pid not in pending
            head = get_unfinished(entry)
            if head is None:
                merged.append(entry)
            else:
                # Remember where the continuation has to be appended.
                pending[pid] = len(merged)
                merged.append(head)
            continue

        # A continuation; the PID must have an open record.
        assert pid in pending
        head = get_unfinished(tail)
        if head is None:
            # The record is now complete, so it is no longer pending.
            slot = pending.pop(pid)
            merged[slot] += tail
        else:
            # The continuation was itself interrupted; keep the record open.
            merged[pending[pid]] += head
    return merged
90
def abs_path(cwd, path):
    """Return `path` as a normalized absolute path.

    Args:
        cwd: Directory against which a relative `path` is resolved.
        path: Absolute or relative path. An empty string is treated as
            relative and therefore resolves to `cwd` itself.

    Returns:
        The path after `os.path.abspath` normalization followed by
        `os.path.realpath`, i.e. symlinks are resolved using the filesystem
        of the machine running this script.
    """
    # startswith() rather than path[0] so that an empty path does not raise
    # IndexError.
    if not path.startswith("/"):
        path = os.path.join(cwd, path)
    # Collapse "..", "." and duplicate slashes lexically before resolving
    # symlinks.
    return os.path.realpath(os.path.abspath(path))
101
def get_touched_files(lines, orig_cwd):
    """Collect the files that open()-like syscalls were called on.

    Args:
        lines: Iterable of strace records of the form `<pid> <syscall info>`.
            Only records shaped like `name(arg1, ..., argN) = result` are
            processed; everything else is skipped.
        orig_cwd: Working directory assigned to the first PID encountered.

    Returns:
        A set of `(absolute_path, executable)` tuples, where `executable` is
        the first argument of the last execve record seen for the process
        (or inherited from the process that forked it).

    Raises:
        AssertionError: If an openat record's first argument is not
            AT_FDCWD.
        KeyError: If a PID other than the first appears without a preceding
            fork record, or touches a file before any executable is known
            for it.
    """
    touched = set()

    # PID -> current working directory.
    workdir = {}

    # PID -> name of the program it is executing.
    program = {}

    # PID -> PID of the process that forked it.
    parent_of = {}

    seen_any_pid = False
    for record in lines:
        # Record layout: <pid> <syscall info>. Note that re-joining with
        # single spaces collapses any runs of whitespace in the record.
        fields = record.split()
        pid = int(fields[0])
        call = " ".join(fields[1:])

        # A PID seen for the first time needs a working directory.
        if pid not in workdir:
            if seen_any_pid:
                # Spawned by an earlier fork/clone; inherit the parent's
                # working directory as it is at this point.
                workdir[pid] = workdir[parent_of[pid]]
            else:
                # Very first record: use the directory given by the caller
                # (expected to match the cwd of strace).
                seen_any_pid = True
                workdir[pid] = orig_cwd

        # Only handle records shaped like: name(arg1, ..., argN) = result
        # NOTE(review): because the *last* ')' is used, a result that itself
        # contains parentheses -- presumably strace's failure form such as
        # "-1 ENOENT (No such file or directory)" -- places ')' after '='
        # and the record is skipped here. Confirm whether that is intended.
        open_paren = call.find("(")
        close_paren = call.rfind(")")
        equals = call.rfind("=")
        if open_paren < 0 or close_paren < 0 or equals < close_paren:
            continue

        name = call[:open_paren]
        # Skip over "= " to reach the result text.
        result = call[equals + 2:]
        args = [arg.strip()
                for arg in call[open_paren + 1:close_paren].split(",")]

        if name in FORK_SYSCALLS:
            # The return value of a fork is the child's PID; remember who
            # its parent is and let it inherit the parent's program name.
            child = int(result)
            parent_of[child] = pid
            program[child] = program[pid]
        elif name == "chdir":
            # New directory is the first argument, wrapped in quotes.
            workdir[pid] = abs_path(workdir[pid], args[0][1:-1])
        elif name == "execve":
            # Program name is the first argument, wrapped in quotes.
            program[pid] = args[0][1:-1]
        elif name in OPEN_SYSCALLS:
            # The result text is not inspected for these syscalls.
            if name == "openat":
                # openat() takes the directory first and the path second.
                # Only AT_FDCWD (resolve against the working directory) is
                # supported.
                assert args[0] == "AT_FDCWD"
                quoted_path = args[1]
            else:
                quoted_path = args[0]
            # Record the file together with the program that touched it.
            touched.add((abs_path(workdir[pid], quoted_path[1:-1]),
                         program[pid]))
    return touched
181
def filter_results(files, root_dir):
    """Remove whitelisted paths from the results.

    Args:
        files: Iterable of `(path, executable)` tuples.
        root_dir: Root directory of Hafnium. It is normalized with
            `os.path.realpath`, so a trailing slash or a symlinked root still
            matches paths that were normalized the same way.

    Returns:
        A list of the tuples whose path is neither inside `root_dir` nor
        inside `/tmp/`, in the iteration order of `files`.
    """
    root = os.path.realpath(root_dir)
    # Avoid producing "//" when the root is the filesystem root itself.
    root_prefix = root if root.endswith("/") else root + "/"
    allowed_prefixes = (
        # Anything in the Hafnium directory is allowed.
        root_prefix,
        # Clang puts intermediate files in /tmp.
        "/tmp/",
    )
    return [entry for entry in files
            if not entry[0].startswith(allowed_prefixes)]
189
def main(args):
    """Read strace output from stdin and print the non-whitelisted files.

    Args:
        args: Full argument vector including the program name, as in
            `sys.argv`. The first positional argument is the Hafnium root
            directory (also used as the initial working directory of the
            traced process); unrecognised arguments are ignored.

    Prints one `<path> (<executable>)` line per touched file, sorted.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("root_dir",
                        help="Root directory of Hafnium, cwd of strace")
    # Parse the vector that was passed in rather than implicitly re-reading
    # sys.argv; args[0] is the program name.
    options, _ = parser.parse_known_args(args[1:])

    # Strip surrounding whitespace and skip blank lines, which carry no PID
    # and would otherwise fail PID parsing.
    stripped = (line.strip() for line in sys.stdin)
    records = merge_unfinished_lines([line for line in stripped if line])

    files = get_touched_files(records, options.root_dir)
    files = sorted(filter_results(files, options.root_dir))

    print("\n".join("{} ({})".format(path, exe) for path, exe in files))
203
# Script entry point: run main() with the full argument vector only when the
# file is executed directly, not when it is imported.
if __name__ == "__main__":
    main(sys.argv)