Blame - framework/scripts/check_files.py - mirror/mbed-tls

blob: d3a61c1d6548bbc327cc389feaef5bb714278e8f [file] [log] [blame]

Minos Galanakis	2c824b4	2025-03-20 09:28:45 +0000	[diff] [blame^]	1	#!/usr/bin/env python3
				2
				3	# Copyright The Mbed TLS Contributors
				4	# SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
				5
				6	"""
				7	This script checks the current state of the source code for minor issues,
				8	including incorrect file permissions, presence of tabs, non-Unix line endings,
				9	trailing whitespace, and presence of UTF-8 BOM.
				10	Note: requires python 3, must be run from Mbed TLS root.
				11	"""
				12
				13	import argparse
				14	import codecs
				15	import inspect
				16	import logging
				17	import os
				18	import re
				19	import subprocess
				20	import sys
				21	try:
				22	from typing import FrozenSet, Optional, Pattern # pylint: disable=unused-import
				23	except ImportError:
				24	pass
				25
				26	from mbedtls_framework import build_tree
				27
				28
				29	class FileIssueTracker:
				30	"""Base class for file-wide issue tracking.
				31
				32	To implement a checker that processes a file as a whole, inherit from
				33	this class and implement `check_file_for_issue` and define ``heading``.
				34
				35	``suffix_exemptions``: files whose name ends with a string in this set
				36	will not be checked.
				37
				38	``path_exemptions``: files whose path (relative to the root of the source
				39	tree) matches this regular expression will not be checked. This can be
				40	``None`` to match no path. Paths are normalized and converted to ``/``
				41	separators before matching.
				42
				43	``heading``: human-readable description of the issue
				44	"""
				45
				46	suffix_exemptions = frozenset() #type: FrozenSet[str]
				47	path_exemptions = None #type: Optional[Pattern[str]]
				48	# heading must be defined in derived classes.
				49	# pylint: disable=no-member
				50
				51	def __init__(self):
				52	self.files_with_issues = {}
				53
				54	@staticmethod
				55	def normalize_path(filepath):
				56	"""Normalize ``filepath`` with / as the directory separator."""
				57	filepath = os.path.normpath(filepath)
				58	# On Windows, we may have backslashes to separate directories.
				59	# We need slashes to match exemption lists.
				60	seps = os.path.sep
				61	if os.path.altsep is not None:
				62	seps += os.path.altsep
				63	return '/'.join(filepath.split(seps))
				64
				65	def should_check_file(self, filepath):
				66	"""Whether the given file name should be checked.
				67
				68	Files whose name ends with a string listed in ``self.suffix_exemptions``
				69	or whose path matches ``self.path_exemptions`` will not be checked.
				70	"""
				71	for files_exemption in self.suffix_exemptions:
				72	if filepath.endswith(files_exemption):
				73	return False
				74	if self.path_exemptions and \
				75	re.match(self.path_exemptions, self.normalize_path(filepath)):
				76	return False
				77	return True
				78
				79	def check_file_for_issue(self, filepath):
				80	"""Check the specified file for the issue that this class is for.
				81
				82	Subclasses must implement this method.
				83	"""
				84	raise NotImplementedError
				85
				86	def record_issue(self, filepath, line_number):
				87	"""Record that an issue was found at the specified location."""
				88	if filepath not in self.files_with_issues.keys():
				89	self.files_with_issues[filepath] = []
				90	self.files_with_issues[filepath].append(line_number)
				91
				92	def output_file_issues(self, logger):
				93	"""Log all the locations where the issue was found."""
				94	if self.files_with_issues.values():
				95	logger.info(self.heading)
				96	for filename, lines in sorted(self.files_with_issues.items()):
				97	if lines:
				98	logger.info("{}: {}".format(
				99	filename, ", ".join(str(x) for x in lines)
				100	))
				101	else:
				102	logger.info(filename)
				103	logger.info("")
				104
				105	BINARY_FILE_PATH_RE_LIST = [
				106	r'docs/.*\.pdf\Z',
				107	r'docs/.*\.png\Z',
				108	r'tf-psa-crypto/docs/.*\.pdf\Z',
				109	r'tf-psa-crypto/docs/.*\.png\Z',
				110	r'programs/fuzz/corpuses/[^.]+\Z',
				111	r'framework/data_files/[^.]+\Z',
				112	r'framework/data_files/.*\.(crt\|csr\|db\|der\|key\|pubkey)\Z',
				113	r'framework/data_files/.*\.req\.[^/]+\Z',
				114	r'framework/data_files/.*malformed[^/]+\Z',
				115	r'framework/data_files/format_pkcs12\.fmt\Z',
				116	r'framework/data_files/.*\.bin\Z',
				117	]
				118	BINARY_FILE_PATH_RE = re.compile('\|'.join(BINARY_FILE_PATH_RE_LIST))
				119
				120	class LineIssueTracker(FileIssueTracker):
				121	"""Base class for line-by-line issue tracking.
				122
				123	To implement a checker that processes files line by line, inherit from
				124	this class and implement `line_with_issue`.
				125	"""
				126
				127	# Exclude binary files.
				128	path_exemptions = BINARY_FILE_PATH_RE
				129
				130	def issue_with_line(self, line, filepath, line_number):
				131	"""Check the specified line for the issue that this class is for.
				132
				133	Subclasses must implement this method.
				134	"""
				135	raise NotImplementedError
				136
				137	def check_file_line(self, filepath, line, line_number):
				138	if self.issue_with_line(line, filepath, line_number):
				139	self.record_issue(filepath, line_number)
				140
				141	def check_file_for_issue(self, filepath):
				142	"""Check the lines of the specified file.
				143
				144	Subclasses must implement the ``issue_with_line`` method.
				145	"""
				146	with open(filepath, "rb") as f:
				147	for i, line in enumerate(iter(f.readline, b"")):
				148	self.check_file_line(filepath, line, i + 1)
				149
				150
				151	def is_windows_file(filepath):
				152	_root, ext = os.path.splitext(filepath)
				153	return ext in ('.bat', '.dsp', '.dsw', '.sln', '.vcxproj')
				154
				155
				156	class ShebangIssueTracker(FileIssueTracker):
				157	"""Track files with a bad, missing or extraneous shebang line.
				158
				159	Executable scripts must start with a valid shebang (#!) line.
				160	"""
				161
				162	heading = "Invalid shebang line:"
				163
				164	# Allow either /bin/sh, /bin/bash, or /usr/bin/env.
				165	# Allow at most one argument (this is a Linux limitation).
				166	# For sh and bash, the argument if present must be options.
				167	# For env, the argument must be the base name of the interpreter.
				168	_shebang_re = re.compile(rb'^#! ?(?:/bin/(bash\|sh)(?: -[^\n ]*)?'
				169	rb'\|/usr/bin/env ([^\n /]+))$')
				170	_extensions = {
				171	b'bash': 'sh',
				172	b'perl': 'pl',
				173	b'python3': 'py',
				174	b'sh': 'sh',
				175	}
				176
				177	path_exemptions = re.compile(r'framework/scripts/quiet/.*')
				178
				179	def is_valid_shebang(self, first_line, filepath):
				180	m = re.match(self._shebang_re, first_line)
				181	if not m:
				182	return False
				183	interpreter = m.group(1) or m.group(2)
				184	if interpreter not in self._extensions:
				185	return False
				186	if not filepath.endswith('.' + self._extensions[interpreter]):
				187	return False
				188	return True
				189
				190	def check_file_for_issue(self, filepath):
				191	is_executable = os.access(filepath, os.X_OK)
				192	with open(filepath, "rb") as f:
				193	first_line = f.readline()
				194	if first_line.startswith(b'#!'):
				195	if not is_executable:
				196	# Shebang on a non-executable file
				197	self.files_with_issues[filepath] = None
				198	elif not self.is_valid_shebang(first_line, filepath):
				199	self.files_with_issues[filepath] = [1]
				200	elif is_executable:
				201	# Executable without a shebang
				202	self.files_with_issues[filepath] = None
				203
				204
				205	class EndOfFileNewlineIssueTracker(FileIssueTracker):
				206	"""Track files that end with an incomplete line
				207	(no newline character at the end of the last line)."""
				208
				209	heading = "Missing newline at end of file:"
				210
				211	path_exemptions = BINARY_FILE_PATH_RE
				212
				213	def check_file_for_issue(self, filepath):
				214	with open(filepath, "rb") as f:
				215	try:
				216	f.seek(-1, 2)
				217	except OSError:
				218	# This script only works on regular files. If we can't seek
				219	# 1 before the end, it means that this position is before
				220	# the beginning of the file, i.e. that the file is empty.
				221	return
				222	if f.read(1) != b"\n":
				223	self.files_with_issues[filepath] = None
				224
				225
				226	class Utf8BomIssueTracker(FileIssueTracker):
				227	"""Track files that start with a UTF-8 BOM.
				228	Files should be ASCII or UTF-8. Valid UTF-8 does not start with a BOM."""
				229
				230	heading = "UTF-8 BOM present:"
				231
				232	suffix_exemptions = frozenset([".vcxproj", ".sln"])
				233	path_exemptions = BINARY_FILE_PATH_RE
				234
				235	def check_file_for_issue(self, filepath):
				236	with open(filepath, "rb") as f:
				237	if f.read().startswith(codecs.BOM_UTF8):
				238	self.files_with_issues[filepath] = None
				239
				240
				241	class UnicodeIssueTracker(LineIssueTracker):
				242	"""Track lines with invalid characters or invalid text encoding."""
				243
				244	heading = "Invalid UTF-8 or forbidden character:"
				245
				246	# Only allow valid UTF-8, and only other explicitly allowed characters.
				247	# We deliberately exclude all characters that aren't a simple non-blank,
				248	# non-zero-width glyph, apart from a very small set (tab, ordinary space,
				249	# line breaks, "basic" no-break space and soft hyphen). In particular,
				250	# non-ASCII control characters, combinig characters, and Unicode state
				251	# changes (e.g. right-to-left text) are forbidden.
				252	# Note that we do allow some characters with a risk of visual confusion,
				253	# for example '-' (U+002D HYPHEN-MINUS) vs '' (U+00AD SOFT HYPHEN) vs
				254	# '‐' (U+2010 HYPHEN), or 'A' (U+0041 LATIN CAPITAL LETTER A) vs
				255	# 'Α' (U+0391 GREEK CAPITAL LETTER ALPHA).
				256	GOOD_CHARACTERS = ''.join([
				257	'\t\n\r -~', # ASCII (tabs and line endings are checked separately)
				258	'\u00A0-\u00FF', # Latin-1 Supplement (for NO-BREAK SPACE and punctuation)
				259	'\u2010-\u2027\u2030-\u205E', # General Punctuation (printable)
				260	'\u2070\u2071\u2074-\u208E\u2090-\u209C', # Superscripts and Subscripts
				261	'\u2190-\u21FF', # Arrows
				262	'\u2200-\u22FF', # Mathematical Symbols
				263	'\u2500-\u257F' # Box Drawings characters used in markdown trees
				264	])
				265	# Allow any of the characters and ranges above, and anything classified
				266	# as a word constituent.
				267	GOOD_CHARACTERS_RE = re.compile(r'[\w{}]+\Z'.format(GOOD_CHARACTERS))
				268
				269	def issue_with_line(self, line, _filepath, line_number):
				270	try:
				271	text = line.decode('utf-8')
				272	except UnicodeDecodeError:
				273	return True
				274	if line_number == 1 and text.startswith('\uFEFF'):
				275	# Strip BOM (U+FEFF ZERO WIDTH NO-BREAK SPACE) at the beginning.
				276	# Which files are allowed to have a BOM is handled in
				277	# Utf8BomIssueTracker.
				278	text = text[1:]
				279	return not self.GOOD_CHARACTERS_RE.match(text)
				280
				281	class UnixLineEndingIssueTracker(LineIssueTracker):
				282	"""Track files with non-Unix line endings (i.e. files with CR)."""
				283
				284	heading = "Non-Unix line endings:"
				285
				286	def should_check_file(self, filepath):
				287	if not super().should_check_file(filepath):
				288	return False
				289	return not is_windows_file(filepath)
				290
				291	def issue_with_line(self, line, _filepath, _line_number):
				292	return b"\r" in line
				293
				294
				295	class WindowsLineEndingIssueTracker(LineIssueTracker):
				296	"""Track files with non-Windows line endings (i.e. CR or LF not in CRLF)."""
				297
				298	heading = "Non-Windows line endings:"
				299
				300	def should_check_file(self, filepath):
				301	if not super().should_check_file(filepath):
				302	return False
				303	return is_windows_file(filepath)
				304
				305	def issue_with_line(self, line, _filepath, _line_number):
				306	return not line.endswith(b"\r\n") or b"\r" in line[:-2]
				307
				308
				309	class TrailingWhitespaceIssueTracker(LineIssueTracker):
				310	"""Track lines with trailing whitespace."""
				311
				312	heading = "Trailing whitespace:"
				313	suffix_exemptions = frozenset([".dsp", ".md"])
				314
				315	def issue_with_line(self, line, _filepath, _line_number):
				316	return line.rstrip(b"\r\n") != line.rstrip()
				317
				318
				319	class TabIssueTracker(LineIssueTracker):
				320	"""Track lines with tabs."""
				321
				322	heading = "Tabs present:"
				323	suffix_exemptions = frozenset([
				324	".make",
				325	".pem", # some openssl dumps have tabs
				326	".sln",
				327	"/.gitmodules",
				328	"/Makefile",
				329	"/Makefile.inc",
				330	"/generate_visualc_files.pl",
				331	])
				332
				333	def issue_with_line(self, line, _filepath, _line_number):
				334	return b"\t" in line
				335
				336
				337	class MergeArtifactIssueTracker(LineIssueTracker):
				338	"""Track lines with merge artifacts.
				339	These are leftovers from a ``git merge`` that wasn't fully edited."""
				340
				341	heading = "Merge artifact:"
				342
				343	def issue_with_line(self, line, _filepath, _line_number):
				344	# Detect leftover git conflict markers.
				345	if line.startswith(b'<<<<<<< ') or line.startswith(b'>>>>>>> '):
				346	return True
				347	if line.startswith(b'\|\|\|\|\|\|\| '): # from merge.conflictStyle=diff3
				348	return True
				349	if line.rstrip(b'\r\n') == b'=======' and \
				350	not _filepath.endswith('.md'):
				351	return True
				352	return False
				353
				354
				355	def this_location():
				356	frame = inspect.currentframe()
				357	assert frame is not None
				358	info = inspect.getframeinfo(frame)
				359	return os.path.basename(info.filename), info.lineno
				360	THIS_FILE_BASE_NAME, LINE_NUMBER_BEFORE_LICENSE_ISSUE_TRACKER = this_location()
				361
				362	class LicenseIssueTracker(LineIssueTracker):
				363	"""Check copyright statements and license indications.
				364
				365	This class only checks that statements are correct if present. It does
				366	not enforce the presence of statements in each file.
				367	"""
				368
				369	heading = "License issue:"
				370
				371	LICENSE_EXEMPTION_RE_LIST = []
				372
				373	# Exempt third-party drivers which may be under a different license
				374	if build_tree.looks_like_tf_psa_crypto_root(os.getcwd()):
				375	LICENSE_EXEMPTION_RE_LIST.append(r'drivers/(?=(everest)/.*)')
				376	elif build_tree.is_mbedtls_3_6():
				377	LICENSE_EXEMPTION_RE_LIST.append(r'3rdparty/(?!(p256-m)/.*)')
				378
				379	LICENSE_EXEMPTION_RE_LIST += [
				380	# Documentation explaining the license may have accidental
				381	# false positives.
				382	r'(ChangeLog\|LICENSE\|framework\/LICENSE\|[-0-9A-Z_a-z]+\.md)\Z',
				383	# Files imported from TF-M, and not used except in test builds,
				384	# may be under a different license.
				385	r'configs/ext/crypto_config_profile_medium\.h\Z',
				386	r'configs/ext/tfm_mbedcrypto_config_profile_medium\.h\Z',
				387	r'configs/ext/README\.md\Z',
				388	# Third-party file.
				389	r'dco\.txt\Z',
				390	r'framework\/dco\.txt\Z',
				391	]
				392	path_exemptions = re.compile('\|'.join(BINARY_FILE_PATH_RE_LIST +
				393	LICENSE_EXEMPTION_RE_LIST))
				394
				395	COPYRIGHT_HOLDER = rb'The Mbed TLS Contributors'
				396	# Catch "Copyright foo", "Copyright (C) foo", "Copyright © foo", etc.
				397	COPYRIGHT_RE = re.compile(rb'.\bcopyright\s+((?:\w\|\s\|[()]\|[^ -~])\w)', re.I)
				398
				399	SPDX_HEADER_KEY = b'SPDX-License-Identifier'
				400	LICENSE_IDENTIFIER = b'Apache-2.0 OR GPL-2.0-or-later'
				401	SPDX_RE = re.compile(br'.*?(' +
				402	re.escape(SPDX_HEADER_KEY) +
				403	br')(:\s(.?)\W\Z\|.)', re.I)
				404
				405	LICENSE_MENTION_RE = re.compile(rb'.*(?:' + rb'\|'.join([
				406	rb'Apache License',
				407	rb'General Public License',
				408	]) + rb')', re.I)
				409
				410	def __init__(self):
				411	super().__init__()
				412	# Record what problem was caused. We can't easily report it due to
				413	# the structure of the script. To be fixed after
				414	# https://github.com/Mbed-TLS/mbedtls/pull/2506
				415	self.problem = None
				416
				417	def issue_with_line(self, line, filepath, line_number):
				418	#pylint: disable=too-many-return-statements
				419
				420	# Use endswith() rather than the more correct os.path.basename()
				421	# because experimentally, it makes a significant difference to
				422	# the running time.
				423	if filepath.endswith(THIS_FILE_BASE_NAME) and \
				424	line_number > LINE_NUMBER_BEFORE_LICENSE_ISSUE_TRACKER:
				425	# Avoid false positives from the code in this class.
				426	# Also skip the rest of this file, which is highly unlikely to
				427	# contain any problematic statements since we put those near the
				428	# top of files.
				429	return False
				430
				431	m = self.COPYRIGHT_RE.match(line)
				432	if m and m.group(1) != self.COPYRIGHT_HOLDER:
				433	self.problem = 'Invalid copyright line'
				434	return True
				435
				436	m = self.SPDX_RE.match(line)
				437	if m:
				438	if m.group(1) != self.SPDX_HEADER_KEY:
				439	self.problem = 'Misspelled ' + self.SPDX_HEADER_KEY.decode()
				440	return True
				441	if not m.group(3):
				442	self.problem = 'Improperly formatted SPDX license identifier'
				443	return True
				444	if m.group(3) != self.LICENSE_IDENTIFIER:
				445	self.problem = 'Wrong SPDX license identifier'
				446	return True
				447
				448	m = self.LICENSE_MENTION_RE.match(line)
				449	if m:
				450	self.problem = 'Suspicious license mention'
				451	return True
				452
				453	return False
				454
				455
				456	class ErrorAddIssueTracker(LineIssueTracker):
				457	"""Signal direct additions of error codes.
				458
				459	Adding a low-level error code with a high-level error code is deprecated
				460	and should use MBEDTLS_ERROR_ADD.
				461	"""
				462
				463	heading = "Direct addition of error codes"
				464
				465	_ERR_PLUS_RE = re.compile(br'MBEDTLS_ERR_\w+ *\+\|'
				466	br'\+ *MBEDTLS_ERR_')
				467	_EXCLUDE_RE = re.compile(br' *case ')
				468
				469	def issue_with_line(self, line, filepath, line_number):
				470	if self._ERR_PLUS_RE.search(line) and not self._EXCLUDE_RE.match(line):
				471	return True
				472	return False
				473
				474
				475	class IntegrityChecker:
				476	"""Sanity-check files under the current directory."""
				477
				478	def __init__(self, log_file):
				479	"""Instantiate the sanity checker.
				480	Check files under the current directory.
				481	Write a report of issues to log_file."""
				482	if not build_tree.looks_like_root(os.getcwd()):
				483	raise Exception("This script must be run from Mbed TLS or TF-PSA-Crypto root")
				484	self.logger = None
				485	self.setup_logger(log_file)
				486	self.issues_to_check = [
				487	ShebangIssueTracker(),
				488	EndOfFileNewlineIssueTracker(),
				489	Utf8BomIssueTracker(),
				490	UnicodeIssueTracker(),
				491	UnixLineEndingIssueTracker(),
				492	WindowsLineEndingIssueTracker(),
				493	TrailingWhitespaceIssueTracker(),
				494	TabIssueTracker(),
				495	MergeArtifactIssueTracker(),
				496	LicenseIssueTracker(),
				497	]
				498
				499	if not build_tree.is_mbedtls_3_6():
				500	self.issues_to_check.append(ErrorAddIssueTracker())
				501
				502	def setup_logger(self, log_file, level=logging.INFO):
				503	"""Log to log_file if provided, or to stderr if None."""
				504	self.logger = logging.getLogger()
				505	self.logger.setLevel(level)
				506	if log_file:
				507	handler = logging.FileHandler(log_file)
				508	self.logger.addHandler(handler)
				509	else:
				510	console = logging.StreamHandler()
				511	self.logger.addHandler(console)
				512
				513	@staticmethod
				514	def collect_files():
				515	"""Return the list of files to check.
				516
				517	These are the regular files commited into Git.
				518	"""
				519	bytes_output = subprocess.check_output(['git', '-C', 'framework',
				520	'ls-files', '-z'])
				521	bytes_framework_filepaths = bytes_output.split(b'\0')[:-1]
				522	bytes_framework_filepaths = ["framework/".encode() + filepath
				523	for filepath in bytes_framework_filepaths]
				524
				525	bytes_output = subprocess.check_output(['git', 'ls-files', '-z'])
				526	bytes_filepaths = bytes_output.split(b'\0')[:-1] + \
				527	bytes_framework_filepaths
				528	ascii_filepaths = map(lambda fp: fp.decode('ascii'), bytes_filepaths)
				529
				530	# Filter out directories. Normally Git doesn't list directories
				531	# (it only knows about the files inside them), but there is
				532	# at least one case where 'git ls-files' includes a directory:
				533	# submodules. Just skip submodules (and any other directories).
				534	ascii_filepaths = [fp for fp in ascii_filepaths
				535	if os.path.isfile(fp)]
				536	# Prepend './' to files in the top-level directory so that
				537	# something like `'/Makefile' in fp` matches in the top-level
				538	# directory as well as in subdirectories.
				539	return [fp if os.path.dirname(fp) else os.path.join(os.curdir, fp)
				540	for fp in ascii_filepaths]
				541
				542	def check_files(self):
				543	"""Check all files for all issues."""
				544	for issue_to_check in self.issues_to_check:
				545	for filepath in self.collect_files():
				546	if issue_to_check.should_check_file(filepath):
				547	issue_to_check.check_file_for_issue(filepath)
				548
				549	def output_issues(self):
				550	"""Log the issues found and their locations.
				551
				552	Return 1 if there were issues, 0 otherwise.
				553	"""
				554	integrity_return_code = 0
				555	for issue_to_check in self.issues_to_check:
				556	if issue_to_check.files_with_issues:
				557	integrity_return_code = 1
				558	issue_to_check.output_file_issues(self.logger)
				559	return integrity_return_code
				560
				561
				562	def run_main():
				563	parser = argparse.ArgumentParser(description=__doc__)
				564	parser.add_argument(
				565	"-l", "--log_file", type=str, help="path to optional output log",
				566	)
				567	check_args = parser.parse_args()
				568	integrity_check = IntegrityChecker(check_args.log_file)
				569	integrity_check.check_files()
				570	return_code = integrity_check.output_issues()
				571	sys.exit(return_code)
				572
				573
				574	if __name__ == "__main__":
				575	run_main()