blob: 5dd878bc194e7ca05481e24d3b632e472792758c [file] [log] [blame]
David Brazdil5e0484e2019-08-07 15:06:57 +01001#!/usr/bin/env python
2# Copyright 2019 The Hafnium Authors.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8# https://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15
"""Script which parses the output of `strace` and dumps a list of files
that were touched by the traced processes outside of whitelisted folders.
It assumes that strace was invoked with the following arguments:
    -e trace=%file,chdir,%process   record required syscalls
    -qq                             silence 'exit code' records
    -o <file>                       output format is different when writing
                                    to a file from printing to the console
"""
24
25import argparse
26import os
27import sys
28
# Syscalls which spawn a new process; the parser treats their return value
# as the PID of the child.
FORK_SYSCALLS = ["clone", "fork", "vfork"]

# Syscalls which the parser treats as touching the file named by their path
# argument.
OPEN_SYSCALLS = [
    "access",
    "creat",
    "lstat",
    "mkdir",
    "open",
    "openat",
    "readlink",
    "stat",
]
44
def get_unfinished(line):
    """Return the part of `line` preceding the "<unfinished ...>" marker.

    Returns None if the marker does not occur in `line`.
    """
    head, marker, _ = line.partition("<unfinished ...>")
    return head if marker else None
51
def get_resumed(line):
    """Return the part of `line` following the first " resumed>" marker.

    Returns None if the marker does not occur in `line`.
    """
    marker = " resumed>"
    if marker not in line:
        return None
    return line.split(marker, 1)[1]
58
def merge_unfinished_lines(lines):
    """Process input lines and merge those split by an interrupting syscall.

    A record which strace marks "<unfinished ...>" is kept pending for its
    PID and extended by the matching "<... resumed>" line(s). Merged records
    are returned at the position where they started being written.

    Args:
        lines: iterable of strace output lines, each starting with a PID.
            Blank lines are skipped.

    Returns:
        List of complete records.

    Raises:
        ValueError: if a line resumes a record although none is pending for
            its PID, or starts a new record while one is still pending for
            its PID.
    """
    # Records in the order they were started being written.
    finished = []

    # Pending unfinished records. Map from PID to index in `finished`.
    cursor = {}

    for line in lines:
        fields = line.split()
        if not fields:
            # Blank line, there is no PID to parse.
            continue
        pid = int(fields[0])

        resumed = get_resumed(line)
        if resumed is not None:
            # Explicit check rather than `assert`, which is removed when
            # Python runs with -O.
            if pid not in cursor:
                raise ValueError(
                    "Resumed syscall without unfinished line: " + line)
            unfinished = get_unfinished(resumed)
            if unfinished is not None:
                # Interrupted again, the record stays pending.
                finished[cursor[pid]] += unfinished
            else:
                # The record is complete now.
                finished[cursor[pid]] += resumed
                del cursor[pid]
        else:
            if pid in cursor:
                raise ValueError(
                    "New syscall while another is unfinished: " + line)
            unfinished = get_unfinished(line)
            if unfinished is not None:
                # Line is unfinished. Store its location to `cursor`.
                cursor[pid] = len(finished)
                finished.append(unfinished)
            else:
                finished.append(line)
    return finished
89
def abs_path(cwd, path):
    """If `path` is relative, resolve it against the current working directory.
    Also normalize the resulting path.

    Args:
        cwd: directory which a relative `path` is resolved against.
        path: absolute or relative path. May be empty, in which case the
            result refers to `cwd`.

    Returns:
        The normalized absolute path, passed through os.path.realpath.
    """
    # Use startswith() rather than indexing so that an empty path (e.g. from
    # a syscall invoked with "") is treated as relative instead of raising
    # IndexError.
    if not path.startswith('/'):
        path = os.path.join(cwd, path)
    path = os.path.abspath(path)
    path = os.path.realpath(path)
    return path
100
def get_touched_files(lines, orig_cwd):
    """Parse strace output and collect the files named by open()-like syscalls.

    `lines` are complete strace records of the form `<pid> <syscall info>`.
    `orig_cwd` is the working directory assigned to the first PID seen.
    Returns a set of `(absolute path, executable name)` tuples.
    """
    touched = set()

    # Per-PID state: working directory, name of the exec'd program and the
    # PID of the process which forked it.
    cwd_of = {}
    binary_of = {}
    parent_of = {}

    for record in lines:
        # Split record: <pid> <syscall info>
        fields = record.split()
        pid = int(fields[0])
        call = " ".join(fields[1:])

        if pid not in cwd_of:
            if cwd_of:
                # Not the first process. A fork/clone syscall should have
                # announced it; inherit the parent's working directory.
                cwd_of[pid] = cwd_of[parent_of[pid]]
            else:
                # Very first record of the strace output. Use the directory
                # given by the caller (should match cwd of strace).
                cwd_of[pid] = orig_cwd

        # Only records shaped like `name(arg1, ..., argN) = result` are of
        # interest. Note that a result with a trailing parenthesised
        # description, e.g. `= -1 ENOENT (No such file or directory)`, puts
        # the last ')' after the '=' and so such a record is skipped here.
        open_paren = call.find("(")
        close_paren = call.rfind(")")
        equals = call.rfind("=")
        well_formed = (open_paren >= 0 and close_paren >= 0
                       and equals >= close_paren)
        if not well_formed:
            continue

        name = call[:open_paren]
        result = call[equals + 2:]
        args = [arg.strip()
                for arg in call[open_paren + 1:close_paren].split(",")]

        if name in FORK_SYSCALLS:
            # The return value is the child's PID. Remember who spawned it
            # and that it starts out running the same program.
            child = int(result)
            parent_of[child] = pid
            binary_of[child] = binary_of[pid]
        elif name == "chdir":
            # New working directory is the first argument, in quotes.
            cwd_of[pid] = abs_path(cwd_of[pid], args[0][1:-1])
        elif name == "execve":
            # Name of the new program is the first argument, in quotes.
            binary_of[pid] = args[0][1:-1]
        elif name in OPEN_SYSCALLS:
            if name == "openat":
                # openat() resolves its second argument against the folder
                # given as first argument. Only AT_FDCWD, i.e. the current
                # working directory, is supported.
                assert args[0] == "AT_FDCWD"
                quoted_path = args[1]
            else:
                quoted_path = args[0]
            # Record the file together with the program which touched it.
            touched.add(
                (abs_path(cwd_of[pid], quoted_path[1:-1]), binary_of[pid]))
    return touched
180
def filter_results(files, root_dir):
    """Remove paths which are whitelisted from the results.

    Args:
        files: iterable of `(path, executable)` tuples.
        root_dir: root directory of Hafnium. Trailing slashes are ignored.

    Returns:
        List of the tuples whose path is not whitelisted, in input order.
    """
    # Strip trailing slashes so that "/src/hafnium/" does not produce the
    # prefix "/src/hafnium//", which would match nothing.
    root = root_dir.rstrip("/")
    root_prefix = root + "/"

    def is_whitelisted(path):
        # Anything in the Hafnium directory is allowed, and so is the
        # directory itself.
        if path == root or path.startswith(root_prefix):
            return True
        # Clang puts intermediate files in /tmp.
        return path.startswith("/tmp/")

    return [entry for entry in files if not is_whitelisted(entry[0])]
188
def main(args):
    """Read strace output from stdin and print the non-whitelisted files.

    Each result is printed as `<path> (<executable>)`, sorted, one per line.

    Args:
        args: full command line including the program name as the first
            element, i.e. the layout of sys.argv.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("root_dir",
                        help="Root directory of Hafnium, cwd of strace")
    # Parse the command line we were handed instead of implicitly re-reading
    # sys.argv. Unrecognized arguments are tolerated and ignored.
    options, _ = parser.parse_known_args(args[1:])

    records = merge_unfinished_lines(line.strip() for line in sys.stdin)
    files = get_touched_files(records, options.root_dir)
    files = filter_results(files, options.root_dir)

    print("\n".join(
        "{} ({})".format(path, binary) for path, binary in sorted(files)))
202
# Script entry point: hand the full command line to main().
if __name__ == "__main__":
    main(sys.argv)