blob: d3a61c1d6548bbc327cc389feaef5bb714278e8f [file] [log] [blame]
Minos Galanakis2c824b42025-03-20 09:28:45 +00001#!/usr/bin/env python3
2
3# Copyright The Mbed TLS Contributors
4# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
5
6"""
7This script checks the current state of the source code for minor issues,
8including incorrect file permissions, presence of tabs, non-Unix line endings,
9trailing whitespace, and presence of UTF-8 BOM.
10Note: requires python 3, must be run from Mbed TLS root.
11"""
12
13import argparse
14import codecs
15import inspect
16import logging
17import os
18import re
19import subprocess
20import sys
21try:
22 from typing import FrozenSet, Optional, Pattern # pylint: disable=unused-import
23except ImportError:
24 pass
25
26from mbedtls_framework import build_tree
27
28
class FileIssueTracker:
    """Base class for file-wide issue tracking.

    To implement a checker that processes a file as a whole, inherit from
    this class, implement ``check_file_for_issue`` and define ``heading``.

    ``suffix_exemptions``: files whose name ends with a string in this set
    will not be checked.

    ``path_exemptions``: files whose path (relative to the root of the source
    tree) matches this regular expression will not be checked. This can be
    ``None`` to match no path. Paths are normalized and converted to ``/``
    separators before matching.

    ``heading``: human-readable description of the issue

    ``files_with_issues`` (instance attribute): dictionary mapping the path
    of each file with an issue to the list of line numbers where the issue
    was found, or to a false value (``None``) when the issue is reported
    for the file as a whole.
    """

    suffix_exemptions = frozenset() #type: FrozenSet[str]
    path_exemptions = None #type: Optional[Pattern[str]]
    # heading must be defined in derived classes.
    # pylint: disable=no-member

    def __init__(self):
        self.files_with_issues = {}

    @staticmethod
    def normalize_path(filepath):
        """Normalize ``filepath`` with / as the directory separator."""
        filepath = os.path.normpath(filepath)
        # On Windows, we may have backslashes to separate directories.
        # We need slashes to match exemption lists.
        # Replace each separator individually: str.split() called with a
        # multi-character string looks for that whole string, not for any
        # one of its characters, so it cannot be used with sep + altsep.
        for separator in (os.path.sep, os.path.altsep):
            if separator is not None and separator != '/':
                filepath = filepath.replace(separator, '/')
        return filepath

    def should_check_file(self, filepath):
        """Whether the given file name should be checked.

        Files whose name ends with a string listed in ``self.suffix_exemptions``
        or whose path matches ``self.path_exemptions`` will not be checked.
        """
        # str.endswith() accepts a tuple of candidates (an empty tuple
        # never matches).
        if filepath.endswith(tuple(self.suffix_exemptions)):
            return False
        if self.path_exemptions and \
           re.match(self.path_exemptions, self.normalize_path(filepath)):
            return False
        return True

    def check_file_for_issue(self, filepath):
        """Check the specified file for the issue that this class is for.

        Subclasses must implement this method.
        """
        raise NotImplementedError

    def record_issue(self, filepath, line_number):
        """Record that an issue was found at the specified location."""
        self.files_with_issues.setdefault(filepath, []).append(line_number)

    def output_file_issues(self, logger):
        """Log all the locations where the issue was found.

        Nothing is logged if no issue was recorded. Otherwise log the
        heading, then one line per file (with the line numbers if there
        are any), then an empty line.
        """
        if not self.files_with_issues:
            return
        logger.info(self.heading)
        for filename, lines in sorted(self.files_with_issues.items()):
            if lines:
                logger.info("{}: {}".format(
                    filename, ", ".join(str(x) for x in lines)
                ))
            else:
                logger.info(filename)
        logger.info("")
104
# Regular expressions for the paths of files to treat as binary.
# Trackers that use these patterns as ``path_exemptions`` apply them with
# ``re.match`` to the normalized path, so each pattern is anchored at the
# start of the path; ``\Z`` anchors it at the end.
BINARY_FILE_PATH_RE_LIST = [
    r'docs/.*\.pdf\Z',
    r'docs/.*\.png\Z',
    r'tf-psa-crypto/docs/.*\.pdf\Z',
    r'tf-psa-crypto/docs/.*\.png\Z',
    r'programs/fuzz/corpuses/[^.]+\Z',
    r'framework/data_files/[^.]+\Z',
    r'framework/data_files/.*\.(crt|csr|db|der|key|pubkey)\Z',
    r'framework/data_files/.*\.req\.[^/]+\Z',
    r'framework/data_files/.*malformed[^/]+\Z',
    r'framework/data_files/format_pkcs12\.fmt\Z',
    r'framework/data_files/.*\.bin\Z',
]
# Single pattern matching a path if any of the patterns above matches it.
BINARY_FILE_PATH_RE = re.compile('|'.join(BINARY_FILE_PATH_RE_LIST))
119
class LineIssueTracker(FileIssueTracker):
    """Base class for line-by-line issue tracking.

    To implement a checker that processes files line by line, inherit from
    this class and implement ``issue_with_line``.
    """

    # Exclude binary files.
    path_exemptions = BINARY_FILE_PATH_RE

    def issue_with_line(self, line, filepath, line_number):
        """Tell whether the given line has the issue that this class is for.

        ``line`` is the content of the line as bytes, including its line
        terminator if there is one. ``line_number`` is counted from 1.

        Subclasses must implement this method.
        """
        raise NotImplementedError

    def check_file_line(self, filepath, line, line_number):
        """Check one line and record its location if it has the issue."""
        has_issue = self.issue_with_line(line, filepath, line_number)
        if has_issue:
            self.record_issue(filepath, line_number)

    def check_file_for_issue(self, filepath):
        """Check every line of the specified file.

        The file is read in binary mode, so lines are delimited by LF only
        and are passed to ``issue_with_line`` as bytes.
        """
        with open(filepath, "rb") as stream:
            for line_number, line in enumerate(stream, start=1):
                self.check_file_line(filepath, line, line_number)
149
150
def is_windows_file(filepath):
    """Tell whether ``filepath`` has one of the Windows-specific extensions.

    The comparison is case-sensitive.
    """
    extension = os.path.splitext(filepath)[1]
    return extension in ('.bat', '.dsp', '.dsw', '.sln', '.vcxproj')
154
155
class ShebangIssueTracker(FileIssueTracker):
    """Track files with a bad, missing or extraneous shebang line.

    Executable scripts must start with a valid shebang (#!) line.
    Files that are not executable must not have one.
    """

    heading = "Invalid shebang line:"

    # Accepted interpreters: /bin/sh, /bin/bash, or anything through
    # /usr/bin/env.
    # At most one argument is accepted (this is a Linux limitation).
    # With sh and bash, the argument, if any, must be options.
    # With env, the argument must be the base name of the interpreter.
    _shebang_re = re.compile(rb'^#! ?(?:/bin/(bash|sh)(?: -[^\n ]*)?'
                             rb'|/usr/bin/env ([^\n /]+))$')
    # Known interpreters and the file name extension that their scripts
    # must have.
    _extensions = {
        b'bash': 'sh',
        b'perl': 'pl',
        b'python3': 'py',
        b'sh': 'sh',
    }

    path_exemptions = re.compile(r'framework/scripts/quiet/.*')

    def is_valid_shebang(self, first_line, filepath):
        """Tell whether ``first_line`` is an acceptable shebang for ``filepath``.

        The line must have the expected syntax, name a known interpreter,
        and the file name must have the extension associated with that
        interpreter.
        """
        match = self._shebang_re.match(first_line)
        if match is None:
            return False
        interpreter = match.group(1) or match.group(2)
        expected_extension = self._extensions.get(interpreter)
        if expected_extension is None:
            return False
        return filepath.endswith('.' + expected_extension)

    def check_file_for_issue(self, filepath):
        """Check the consistency of the shebang line and the executable bit.

        A file with exactly one of the two is recorded as a whole; an
        executable file with an invalid shebang is recorded at line 1.
        """
        is_executable = os.access(filepath, os.X_OK)
        with open(filepath, "rb") as stream:
            first_line = stream.readline()
        has_shebang = first_line.startswith(b'#!')
        if has_shebang and is_executable:
            if not self.is_valid_shebang(first_line, filepath):
                self.files_with_issues[filepath] = [1]
        elif has_shebang or is_executable:
            # Either a shebang on a non-executable file, or an executable
            # without a shebang.
            self.files_with_issues[filepath] = None
203
204
class EndOfFileNewlineIssueTracker(FileIssueTracker):
    """Track files whose last line is incomplete
    (i.e. the file does not end with a newline character)."""

    heading = "Missing newline at end of file:"

    path_exemptions = BINARY_FILE_PATH_RE

    def check_file_for_issue(self, filepath):
        """Record ``filepath`` if it is non-empty and its last byte is not LF."""
        with open(filepath, "rb") as stream:
            try:
                stream.seek(-1, os.SEEK_END)
            except OSError:
                # This script only works on regular files. Failing to seek
                # to 1 before the end means that this position would be
                # before the beginning of the file, i.e. the file is empty.
                return
            last_byte = stream.read(1)
        if last_byte != b"\n":
            self.files_with_issues[filepath] = None
224
225
class Utf8BomIssueTracker(FileIssueTracker):
    """Track files that start with a UTF-8 BOM.
    Files should be ASCII or UTF-8. Valid UTF-8 does not start with a BOM."""

    heading = "UTF-8 BOM present:"

    suffix_exemptions = frozenset([".vcxproj", ".sln"])
    path_exemptions = BINARY_FILE_PATH_RE

    def check_file_for_issue(self, filepath):
        """Record ``filepath`` if its content starts with a UTF-8 BOM."""
        with open(filepath, "rb") as f:
            # Only the first few bytes matter, so don't load the whole file.
            # A file shorter than the BOM yields a shorter string, which
            # cannot start with the BOM.
            if f.read(len(codecs.BOM_UTF8)).startswith(codecs.BOM_UTF8):
                self.files_with_issues[filepath] = None
239
240
class UnicodeIssueTracker(LineIssueTracker):
    """Track lines with invalid characters or invalid text encoding."""

    heading = "Invalid UTF-8 or forbidden character:"

    # Only valid UTF-8 is accepted, and only explicitly allowed characters.
    # Everything that isn't a simple non-blank, non-zero-width glyph is
    # deliberately excluded, apart from a very small set (tab, ordinary
    # space, line breaks, "basic" no-break space and soft hyphen).
    # In particular, non-ASCII control characters, combining characters,
    # and Unicode state changes (e.g. right-to-left text) are forbidden.
    # Note that some characters with a risk of visual confusion are allowed,
    # for example U+002D HYPHEN-MINUS vs U+00AD SOFT HYPHEN vs U+2010 HYPHEN,
    # or U+0041 LATIN CAPITAL LETTER A vs U+0391 GREEK CAPITAL LETTER ALPHA.
    #
    # The pieces below are the content of a regular expression character
    # class, so "X-Y" denotes a range.
    GOOD_CHARACTERS = (
        # ASCII (tabs and line endings are checked separately)
        '\t\n\r -~'
        # Latin-1 Supplement (for NO-BREAK SPACE and punctuation)
        '\u00A0-\u00FF'
        # General Punctuation (printable)
        '\u2010-\u2027\u2030-\u205E'
        # Superscripts and Subscripts
        '\u2070\u2071\u2074-\u208E\u2090-\u209C'
        # Arrows
        '\u2190-\u21FF'
        # Mathematical Symbols
        '\u2200-\u22FF'
        # Box Drawings characters used in markdown trees
        '\u2500-\u257F'
    )
    # A good line consists solely of the characters and ranges above and
    # of anything classified as a word constituent.
    GOOD_CHARACTERS_RE = re.compile(r'[\w' + GOOD_CHARACTERS + r']+\Z')

    def issue_with_line(self, line, _filepath, line_number):
        """Tell whether ``line`` is not valid UTF-8 or has a forbidden character."""
        try:
            decoded = line.decode('utf-8')
        except UnicodeDecodeError:
            return True
        if line_number == 1 and decoded.startswith('\uFEFF'):
            # Ignore a BOM (U+FEFF ZERO WIDTH NO-BREAK SPACE) at the very
            # beginning of the file. Utf8BomIssueTracker is in charge of
            # deciding which files may have one.
            decoded = decoded[1:]
        return self.GOOD_CHARACTERS_RE.match(decoded) is None
280
class UnixLineEndingIssueTracker(LineIssueTracker):
    """Track files with non-Unix line endings (i.e. files with CR)."""

    heading = "Non-Unix line endings:"

    def should_check_file(self, filepath):
        """Check all non-exempt files except Windows-specific ones."""
        if is_windows_file(filepath):
            return False
        return super().should_check_file(filepath)

    def issue_with_line(self, line, _filepath, _line_number):
        """Tell whether ``line`` contains a CR character anywhere."""
        return line.find(b"\r") != -1
293
294
class WindowsLineEndingIssueTracker(LineIssueTracker):
    """Track files with non-Windows line endings (i.e. CR or LF not in CRLF)."""

    heading = "Non-Windows line endings:"

    def should_check_file(self, filepath):
        """Check only non-exempt Windows-specific files."""
        return super().should_check_file(filepath) and \
            is_windows_file(filepath)

    def issue_with_line(self, line, _filepath, _line_number):
        """Tell whether ``line`` does not end with CRLF or has a stray CR."""
        if not line.endswith(b"\r\n"):
            return True
        # The only CR allowed is the one in the final CRLF.
        return b"\r" in line[:-2]
307
308
class TrailingWhitespaceIssueTracker(LineIssueTracker):
    """Track lines with trailing whitespace."""

    heading = "Trailing whitespace:"
    suffix_exemptions = frozenset((".dsp", ".md"))

    def issue_with_line(self, line, _filepath, _line_number):
        """Tell whether ``line`` has whitespace before its line terminator."""
        without_terminator = line.rstrip(b"\r\n")
        # Stripping further only changes the line if whitespace is left
        # at the end once the CR/LF characters are removed.
        return without_terminator != without_terminator.rstrip()
317
318
class TabIssueTracker(LineIssueTracker):
    """Track lines with tabs."""

    heading = "Tabs present:"
    # Files in which tab characters are accepted.
    suffix_exemptions = frozenset((
        ".make",
        ".pem", # some openssl dumps have tabs
        ".sln",
        "/.gitmodules",
        "/Makefile",
        "/Makefile.inc",
        "/generate_visualc_files.pl",
    ))

    def issue_with_line(self, line, _filepath, _line_number):
        """Tell whether ``line`` contains a tab character anywhere."""
        return line.find(b"\t") != -1
335
336
class MergeArtifactIssueTracker(LineIssueTracker):
    """Track lines with merge artifacts.
    These are leftovers from a ``git merge`` that wasn't fully edited."""

    heading = "Merge artifact:"

    def issue_with_line(self, line, _filepath, _line_number):
        """Tell whether ``line`` looks like a leftover git conflict marker."""
        # Markers that open or close a conflict, plus the one introducing
        # the common ancestor with merge.conflictStyle=diff3.
        if line.startswith((b'<<<<<<< ', b'>>>>>>> ', b'||||||| ')):
            return True
        # The separator between the two sides is a line consisting of
        # exactly seven equal signs. It is not reported in .md files.
        is_separator = line.rstrip(b'\r\n') == b'======='
        return is_separator and not _filepath.endswith('.md')
353
354
def this_location():
    """Return the base name of this file and a line number inside this function.

    The line number is the one reported by ``inspect`` for this function's
    own frame.
    """
    current_frame = inspect.currentframe()
    assert current_frame is not None
    frame_info = inspect.getframeinfo(current_frame)
    base_name = os.path.basename(frame_info.filename)
    return base_name, frame_info.lineno
# Computed here, i.e. before the definition of LicenseIssueTracker, which
# uses these values to skip the part of this file that follows.
THIS_FILE_BASE_NAME, LINE_NUMBER_BEFORE_LICENSE_ISSUE_TRACKER = this_location()
361
class LicenseIssueTracker(LineIssueTracker):
    """Check copyright statements and license indications.

    This class only checks that statements are correct if present. It does
    not enforce the presence of statements in each file.
    """

    heading = "License issue:"

    LICENSE_EXEMPTION_RE_LIST = []

    # Third-party drivers may be under a different license, so exempt them.
    # The pattern depends on the kind of tree the script is run from.
    if build_tree.looks_like_tf_psa_crypto_root(os.getcwd()):
        LICENSE_EXEMPTION_RE_LIST.append(r'drivers/(?=(everest)/.*)')
    elif build_tree.is_mbedtls_3_6():
        LICENSE_EXEMPTION_RE_LIST.append(r'3rdparty/(?!(p256-m)/.*)')

    LICENSE_EXEMPTION_RE_LIST.extend([
        # Documentation explaining the license may have accidental
        # false positives.
        r'(ChangeLog|LICENSE|framework\/LICENSE|[-0-9A-Z_a-z]+\.md)\Z',
        # Files imported from TF-M, and not used except in test builds,
        # may be under a different license.
        r'configs/ext/crypto_config_profile_medium\.h\Z',
        r'configs/ext/tfm_mbedcrypto_config_profile_medium\.h\Z',
        r'configs/ext/README\.md\Z',
        # Third-party file.
        r'dco\.txt\Z',
        r'framework\/dco\.txt\Z',
    ])
    # Skip binary files as well as the files exempted above.
    path_exemptions = re.compile(
        '|'.join(BINARY_FILE_PATH_RE_LIST + LICENSE_EXEMPTION_RE_LIST)
    )

    COPYRIGHT_HOLDER = rb'The Mbed TLS Contributors'
    # Catch "Copyright foo", "Copyright (C) foo", "Copyright © foo", etc.
    COPYRIGHT_RE = re.compile(rb'.*\bcopyright\s+((?:\w|\s|[()]|[^ -~])*\w)', re.I)

    SPDX_HEADER_KEY = b'SPDX-License-Identifier'
    LICENSE_IDENTIFIER = b'Apache-2.0 OR GPL-2.0-or-later'
    # Group 1 is the header key as spelled in the line (matched
    # case-insensitively). Group 3 is the identifier and is only set when
    # the key is followed by a colon.
    SPDX_RE = re.compile(br'.*?(' +
                         re.escape(SPDX_HEADER_KEY) +
                         br')(:\s*(.*?)\W*\Z|.*)', re.I)

    LICENSE_MENTION_RE = re.compile(rb'.*(?:' + rb'|'.join([
        rb'Apache License',
        rb'General Public License',
    ]) + rb')', re.I)

    def __init__(self):
        super().__init__()
        # Record what problem was caused. We can't easily report it due to
        # the structure of the script. To be fixed after
        # https://github.com/Mbed-TLS/mbedtls/pull/2506
        self.problem = None

    def _find_problem(self, line):
        """Return a description of the problem with ``line``, or ``None``.

        The checks are made in this order: copyright holder, SPDX header,
        then mentions of a license by name.
        """
        #pylint: disable=too-many-return-statements
        copyright_match = self.COPYRIGHT_RE.match(line)
        if copyright_match and \
           copyright_match.group(1) != self.COPYRIGHT_HOLDER:
            return 'Invalid copyright line'

        spdx_match = self.SPDX_RE.match(line)
        if spdx_match:
            header_key = spdx_match.group(1)
            identifier = spdx_match.group(3)
            if header_key != self.SPDX_HEADER_KEY:
                return 'Misspelled ' + self.SPDX_HEADER_KEY.decode()
            if not identifier:
                return 'Improperly formatted SPDX license identifier'
            if identifier != self.LICENSE_IDENTIFIER:
                return 'Wrong SPDX license identifier'

        if self.LICENSE_MENTION_RE.match(line):
            return 'Suspicious license mention'

        return None

    def issue_with_line(self, line, filepath, line_number):
        """Tell whether ``line`` has an incorrect copyright or license statement.

        When it does, a description of the problem is stored in
        ``self.problem``.
        """
        # Use endswith() rather than the more correct os.path.basename()
        # because experimentally, it makes a significant difference to
        # the running time.
        if filepath.endswith(THIS_FILE_BASE_NAME) and \
           line_number > LINE_NUMBER_BEFORE_LICENSE_ISSUE_TRACKER:
            # Avoid false positives from the code in this class.
            # Also skip the rest of this file, which is highly unlikely to
            # contain any problematic statements since we put those near the
            # top of files.
            return False

        problem = self._find_problem(line)
        if problem is None:
            return False
        self.problem = problem
        return True
454
455
class ErrorAddIssueTracker(LineIssueTracker):
    """Signal direct additions of error codes.

    Adding a low-level error code with a high-level error code is deprecated
    and should use MBEDTLS_ERROR_ADD.
    """

    heading = "Direct addition of error codes"

    # An error code constant on either side of a plus sign.
    _ERR_PLUS_RE = re.compile(br'MBEDTLS_ERR_\w+ *\+|'
                              br'\+ *MBEDTLS_ERR_')
    # Lines starting with a "case" label are not reported.
    _EXCLUDE_RE = re.compile(br' *case ')

    def issue_with_line(self, line, filepath, line_number):
        """Tell whether ``line`` adds to an error code outside a case label."""
        if self._EXCLUDE_RE.match(line):
            return False
        return self._ERR_PLUS_RE.search(line) is not None
473
474
class IntegrityChecker:
    """Sanity-check files under the current directory."""

    def __init__(self, log_file):
        """Instantiate the sanity checker.
        Check files under the current directory.
        Write a report of issues to log_file.

        Raise an exception if the current directory does not look like
        the root of a supported source tree."""
        if not build_tree.looks_like_root(os.getcwd()):
            raise Exception("This script must be run from Mbed TLS or TF-PSA-Crypto root")
        self.logger = None
        self.setup_logger(log_file)
        self.issues_to_check = [
            ShebangIssueTracker(),
            EndOfFileNewlineIssueTracker(),
            Utf8BomIssueTracker(),
            UnicodeIssueTracker(),
            UnixLineEndingIssueTracker(),
            WindowsLineEndingIssueTracker(),
            TrailingWhitespaceIssueTracker(),
            TabIssueTracker(),
            MergeArtifactIssueTracker(),
            LicenseIssueTracker(),
        ]

        if not build_tree.is_mbedtls_3_6():
            self.issues_to_check.append(ErrorAddIssueTracker())

    def setup_logger(self, log_file, level=logging.INFO):
        """Log to log_file if provided, or to stderr if None."""
        self.logger = logging.getLogger()
        self.logger.setLevel(level)
        if log_file:
            handler = logging.FileHandler(log_file)
            self.logger.addHandler(handler)
        else:
            console = logging.StreamHandler()
            self.logger.addHandler(console)

    @staticmethod
    def collect_files():
        """Return the list of files to check.

        These are the regular files committed into Git, both in the
        ``framework`` repository and in the repository in the current
        directory.
        """
        bytes_output = subprocess.check_output(['git', '-C', 'framework',
                                                'ls-files', '-z'])
        # With -z, each name is followed by a null byte, so the last
        # element after splitting is empty.
        bytes_framework_filepaths = [b"framework/" + filepath
                                     for filepath in
                                     bytes_output.split(b'\0')[:-1]]

        bytes_output = subprocess.check_output(['git', 'ls-files', '-z'])
        bytes_filepaths = bytes_output.split(b'\0')[:-1] + \
                          bytes_framework_filepaths
        ascii_filepaths = [fp.decode('ascii') for fp in bytes_filepaths]

        # Filter out directories. Normally Git doesn't list directories
        # (it only knows about the files inside them), but there is
        # at least one case where 'git ls-files' includes a directory:
        # submodules. Just skip submodules (and any other directories).
        ascii_filepaths = [fp for fp in ascii_filepaths
                           if os.path.isfile(fp)]
        # Prepend './' to files in the top-level directory so that
        # something like `'/Makefile' in fp` matches in the top-level
        # directory as well as in subdirectories.
        return [fp if os.path.dirname(fp) else os.path.join(os.curdir, fp)
                for fp in ascii_filepaths]

    def check_files(self):
        """Check all files for all issues."""
        # Build the list of files only once: it requires running git and
        # it is the same for every issue tracker.
        filepaths = self.collect_files()
        for issue_to_check in self.issues_to_check:
            for filepath in filepaths:
                if issue_to_check.should_check_file(filepath):
                    issue_to_check.check_file_for_issue(filepath)

    def output_issues(self):
        """Log the issues found and their locations.

        Return 1 if there were issues, 0 otherwise.
        """
        integrity_return_code = 0
        for issue_to_check in self.issues_to_check:
            if issue_to_check.files_with_issues:
                integrity_return_code = 1
            issue_to_check.output_file_issues(self.logger)
        return integrity_return_code
560
561
def run_main():
    """Parse the command line, run all the checks and exit.

    The exit status is the value returned by
    ``IntegrityChecker.output_issues``.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "-l", "--log_file", type=str, help="path to optional output log",
    )
    options = parser.parse_args()
    checker = IntegrityChecker(options.log_file)
    checker.check_files()
    sys.exit(checker.output_issues())


if __name__ == "__main__":
    run_main()