Merge pull request #8470 from gilles-peskine-arm/check-files-license

Check copyright statements and SPDX license statements
diff --git a/tests/scripts/check_files.py b/tests/scripts/check_files.py
index 68871ef..a2a9dfa 100755
--- a/tests/scripts/check_files.py
+++ b/tests/scripts/check_files.py
@@ -10,10 +10,11 @@
 Note: requires python 3, must be run from Mbed TLS root.
 """
 
-import os
 import argparse
-import logging
 import codecs
+import inspect
+import logging
+import os
 import re
 import subprocess
 import sys
@@ -345,6 +346,100 @@
         return False
 
 
+def this_location():
+    frame = inspect.currentframe()
+    assert frame is not None
+    info = inspect.getframeinfo(frame)
+    return os.path.basename(info.filename), info.lineno
+THIS_FILE_BASE_NAME, LINE_NUMBER_BEFORE_LICENSE_ISSUE_TRACKER = this_location()
+
+class LicenseIssueTracker(LineIssueTracker):
+    """Check copyright statements and license indications.
+
+    This class only checks that statements are correct if present. It does
+    not enforce the presence of statements in each file.
+    """
+
+    heading = "License issue:"
+
+    LICENSE_EXEMPTION_RE_LIST = [
+        # Third-party code, other than whitelisted third-party modules,
+        # may be under a different license.
+        r'3rdparty/(?!(p256-m)/.*)',
+        # Documentation explaining the license may have accidental
+        # false positives.
+        r'(ChangeLog|LICENSE|[-0-9A-Z_a-z]+\.md)\Z',
+        # Files imported from TF-M, and not used except in test builds,
+        # may be under a different license.
+        r'configs/crypto_config_profile_medium\.h\Z',
+        r'configs/tfm_mbedcrypto_config_profile_medium\.h\Z',
+        # Third-party file.
+        r'dco\.txt\Z',
+    ]
+    path_exemptions = re.compile('|'.join(BINARY_FILE_PATH_RE_LIST +
+                                          LICENSE_EXEMPTION_RE_LIST))
+
+    COPYRIGHT_HOLDER = rb'The Mbed TLS Contributors'
+    # Catch "Copyright foo", "Copyright (C) foo", "Copyright © foo", etc.
+    COPYRIGHT_RE = re.compile(rb'.*\bcopyright\s+((?:\w|\s|[()]|[^ -~])*\w)', re.I)
+
+    SPDX_HEADER_KEY = b'SPDX-License-Identifier'
+    LICENSE_IDENTIFIER = b'Apache-2.0 OR GPL-2.0-or-later'
+    SPDX_RE = re.compile(br'.*?(' +
+                         re.escape(SPDX_HEADER_KEY) +
+                         br')(:\s*(.*?)\W*\Z|.*)', re.I)
+
+    LICENSE_MENTION_RE = re.compile(rb'.*(?:' + rb'|'.join([
+        rb'Apache License',
+        rb'General Public License',
+    ]) + rb')', re.I)
+
+    def __init__(self):
+        super().__init__()
+        # Record what problem was caused. We can't easily report it due to
+        # the structure of the script. To be fixed after
+        # https://github.com/Mbed-TLS/mbedtls/pull/2506
+        self.problem = None
+
+    def issue_with_line(self, line, filepath, line_number):
+        #pylint: disable=too-many-return-statements
+
+        # Use endswith() rather than the more correct os.path.basename()
+        # because experimentally, it makes a significant difference to
+        # the running time.
+        if filepath.endswith(THIS_FILE_BASE_NAME) and \
+           line_number > LINE_NUMBER_BEFORE_LICENSE_ISSUE_TRACKER:
+            # Avoid false positives from the code in this class.
+            # Also skip the rest of this file, which is highly unlikely to
+            # contain any problematic statements since we put those near the
+            # top of files.
+            return False
+
+        m = self.COPYRIGHT_RE.match(line)
+        if m and m.group(1) != self.COPYRIGHT_HOLDER:
+            self.problem = 'Invalid copyright line'
+            return True
+
+        m = self.SPDX_RE.match(line)
+        if m:
+            if m.group(1) != self.SPDX_HEADER_KEY:
+                self.problem = 'Misspelled ' + self.SPDX_HEADER_KEY.decode()
+                return True
+            if not m.group(3):
+                self.problem = 'Improperly formatted SPDX license identifier'
+                return True
+            if m.group(3) != self.LICENSE_IDENTIFIER:
+                self.problem = 'Wrong SPDX license identifier'
+                return True
+
+        m = self.LICENSE_MENTION_RE.match(line)
+        if m:
+            self.problem = 'Suspicious license mention'
+            return True
+
+        return False
+
+
 class IntegrityChecker:
     """Sanity-check files under the current directory."""
 
@@ -365,6 +460,7 @@
             TrailingWhitespaceIssueTracker(),
             TabIssueTracker(),
             MergeArtifactIssueTracker(),
+            LicenseIssueTracker(),
         ]
 
     def setup_logger(self, log_file, level=logging.INFO):