Add script to dump system files used by build

In order to move towards builds which only use resources in the Hafnium
repo, this patch adds a script which runs the build with strace and
dumps all files touched in the process. Files in the Hafnium directory
and in /tmp are automatically filtered out.

Bug: 132428451
Test: ./build/strace_open.sh opened_files.txt
Change-Id: I03a2df4eedf40c456b65920ec8bf98ad08e747c6
diff --git a/build/parse_strace_open.py b/build/parse_strace_open.py
new file mode 100755
index 0000000..5dd878b
--- /dev/null
+++ b/build/parse_strace_open.py
@@ -0,0 +1,204 @@
+#!/usr/bin/env python
+# Copyright 2019 The Hafnium Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Script which parses the output of `strace` and dumps a list of files
+that were touched by the traced processes outside of whitelisted folders.
+It assumes that strace was invoked with the following arguments:
+    -e trace=%file,chdir,%process   record required syscalls
+    -qq                             silence 'exit code' records
+    -o <file>                       output format is different when writing
+                                    to a file from printing to the console
+"""
+
+import argparse
+import os
+import sys
+
+FORK_SYSCALLS = [  # Syscalls whose result is taken to be the PID of a child.
+    "clone",
+    "fork",
+    "vfork",
+    ]
+OPEN_SYSCALLS = [  # Syscalls whose path argument is recorded as touched.
+    "access",
+    "creat",
+    "lstat",
+    "mkdir",
+    "open",
+    "openat",
+    "readlink",
+    "stat",
+    ]
+
+def get_unfinished(line):
+    """Return the text preceding the first "<unfinished ...>" in `line`.
+
+    Returns None when `line` does not contain that marker."""
+    before, marker, _ = line.partition("<unfinished ...>")
+    return before if marker else None
+
+def get_resumed(line):
+    """Return the text following the first " resumed>" in `line`.
+
+    Returns None when `line` does not contain that marker."""
+    _, marker, after = line.partition(" resumed>")
+    return after if marker else None
+
+def merge_unfinished_lines(lines):
+    """Merge strace records that were split by an interrupting syscall.
+
+    A record cut short by another process ends in "<unfinished ...>" and is
+    continued by a later "<... name resumed>" record with the same PID. Each
+    merged record is placed where its first fragment appeared. Blank lines
+    are skipped. Raises ValueError if the fragments do not pair up.
+    """
+    # Records in the order they were started being written.
+    finished = []
+    # Pending unfinished records. Map from PID to index in `finished`.
+    cursor = {}
+    for line in lines:
+        if not line.strip():
+            continue
+        pid = int(line.split()[0])
+        resumed = get_resumed(line)
+        if (resumed is not None) != (pid in cursor):
+            raise ValueError("Unexpected strace record: " + line)
+        if resumed is None:
+            tail, index = line, len(finished)
+            finished.append("")
+        else:
+            tail, index = resumed, cursor.pop(pid)
+        unfinished = get_unfinished(tail)
+        if unfinished is not None:
+            tail = unfinished
+            cursor[pid] = index  # Still pending; wait for the continuation.
+        finished[index] += tail
+    return finished
+
+def abs_path(cwd, path):
+    """Resolve `path` against the working directory `cwd` and normalize it.
+
+    An absolute `path` is not affected by `cwd`. An empty `path` resolves to
+    `cwd` itself instead of raising IndexError. The result is normalized
+    lexically first and then has symbolic links resolved."""
+    # os.path.join() discards `cwd` when `path` is absolute.
+    path = os.path.join(cwd, path)
+    path = os.path.abspath(path)
+    return os.path.realpath(path)
+
+def get_touched_files(lines, orig_cwd):
+    """Parse strace output and return all files that an open()-like syscall
+    was called on.
+
+    `lines` are complete strace records ("<pid> <syscall info>"); blank lines
+    are ignored. `orig_cwd` is the working directory of the first traced
+    process. Returns a set of (absolute path, executable name) tuples. Paths
+    are recorded even if the syscall failed. Raises ValueError for an openat()
+    of a relative path against anything but AT_FDCWD."""
+    files = set()
+    # Map from PID to the current working directory.
+    cwd = {}
+    # Map from PID to executable name.
+    executable = {}
+    # Map from PID to the PID of the process which forked it.
+    fork_of = {}
+
+    for line in lines:
+        # Split line: <pid>  <syscall info>. Runs of whitespace in the syscall
+        # info collapse into single spaces, which the ") = " search relies on.
+        fields = line.split()
+        if not fields:
+            continue
+        pid = int(fields[0])
+        syscall = " ".join(fields[1:])
+
+        # A PID seen for the first time inherits a working directory.
+        if pid not in cwd:
+            if not cwd:
+                # Very first record of the strace output: use `orig_cwd`.
+                cwd[pid] = orig_cwd
+            else:
+                # There should have been a fork/clone syscall which spawned this
+                # process. Inherit its working directory.
+                cwd[pid] = cwd[fork_of[pid]]
+
+        # We are looking for lines which match:
+        #   name(arg1, arg2, ..., argN) = result
+        # A failed call appends "ERRNO (description)" to the result, so the
+        # arguments end at the last ") = " rather than at the last ")".
+        left_bracket = syscall.find("(")
+        right_bracket = syscall.rfind(") = ")
+        if left_bracket < 0 or right_bracket < left_bracket:
+            continue
+
+        syscall_name = syscall[:left_bracket]
+        syscall_result = syscall[right_bracket + len(") = "):]
+        # Failures are reported as "-1 ERRNO (...)", unknown results as "?".
+        succeeded = not syscall_result.startswith(("-", "?"))
+        syscall_args = syscall[left_bracket + 1:right_bracket].split(",")
+        syscall_args = [arg.strip() for arg in syscall_args]
+
+        if syscall_name in FORK_SYSCALLS:
+            # Keep track of the parent-child relationship. The child's PID is
+            # the syscall's return code.
+            if succeeded:
+                new_pid = int(syscall_result.split()[0])
+                fork_of[new_pid] = pid
+                executable[new_pid] = executable[pid]
+        elif syscall_name == "chdir":
+            # Keep track of the new working directory (first argument, quoted).
+            if succeeded:
+                cwd[pid] = abs_path(cwd[pid], syscall_args[0][1:-1])
+        elif syscall_name == "execve":
+            # Record the name of the new program (first argument, quoted).
+            if succeeded:
+                executable[pid] = syscall_args[0][1:-1]
+        elif syscall_name in OPEN_SYSCALLS:
+            # Record the path and the name of the program which touched it.
+            # For openat() the path is the second argument and, unless it is
+            # absolute, relative to the first; of those only AT_FDCWD (the
+            # current working directory) is supported.
+            arg_idx = 1 if syscall_name == "openat" else 0
+            fname = syscall_args[arg_idx][1:-1]
+            if (arg_idx and syscall_args[0] != "AT_FDCWD"
+                    and not fname.startswith("/")):
+                raise ValueError("Unsupported openat(): " + syscall)
+            files.add((abs_path(cwd[pid], fname), executable[pid]))
+    return files
+
+def filter_results(files, root_dir):
+    """Return the (path, executable) entries outside whitelisted folders.
+
+    Whitelisted are `root_dir` (canonicalized first, so a trailing slash or
+    symlink still matches) and /tmp/, where Clang puts intermediate files."""
+    allowed = (os.path.realpath(root_dir).rstrip("/") + "/", "/tmp/")
+    return [entry for entry in files if not entry[0].startswith(allowed)]
+
+def main(args):
+    """Read strace output from stdin and print the files outside the whitelist.
+
+    `args` is the full argv (program name first) and is parsed explicitly
+    instead of implicitly reading sys.argv. Unknown arguments are ignored."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("root_dir",
+                        help="Root directory of Hafnium, cwd of strace")
+    options, _ = parser.parse_known_args(args[1:])
+    lines = merge_unfinished_lines(line.strip() for line in sys.stdin)
+    files = get_touched_files(lines, options.root_dir)
+    files = sorted(filter_results(files, options.root_dir))
+    print("\n".join("{} ({})".format(path, exe) for path, exe in files))
+
+if __name__ == "__main__":
+    main(sys.argv)  # Hand over the full argv, program name included.