From 3e2e84e77991581ed8b4cfd5db6d19c05d48e811 Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Tue, 16 Jun 2026 10:38:12 -0400 Subject: [PATCH 01/15] feat(version-scanner): support target list inputs via --targets --- .../tests/unit/test_version_scanner.py | 87 +++++++++++ scripts/version_scanner/version_scanner.py | 142 +++++++++++++++--- 2 files changed, 208 insertions(+), 21 deletions(-) diff --git a/scripts/version_scanner/tests/unit/test_version_scanner.py b/scripts/version_scanner/tests/unit/test_version_scanner.py index f76b66d4d55d..af88816fa1c8 100644 --- a/scripts/version_scanner/tests/unit/test_version_scanner.py +++ b/scripts/version_scanner/tests/unit/test_version_scanner.py @@ -782,3 +782,90 @@ def test_format_for_console(): assert "3.7" in log_str assert "python_requires = " not in log_str # Slim format doesn't print context line + +def test_parse_targets_inline_json(): + from version_scanner import parse_targets + json_str = '{"python": ["3.7", "3.8"], "protobuf": "4.25.8"}' + targets = parse_targets(json_str) + assert targets == [("python", "3.7"), ("python", "3.8"), ("protobuf", "4.25.8")] + +def test_parse_targets_inline_yaml(): + from version_scanner import parse_targets + yaml_str = """ +python: + - "3.7" + - "3.8" +protobuf: "4.25.8" +""" + targets = parse_targets(yaml_str) + assert targets == [("python", "3.7"), ("python", "3.8"), ("protobuf", "4.25.8")] + +def test_parse_targets_from_file(tmp_path): + from version_scanner import parse_targets + yaml_file = tmp_path / "targets.yaml" + yaml_file.write_text(""" +python: + - "3.7" + - "3.8" +protobuf: "4.25.8" +""") + targets = parse_targets(str(yaml_file)) + assert targets == [("python", "3.7"), ("python", "3.8"), ("protobuf", "4.25.8")] + +def test_parse_targets_invalid_syntax(): + from version_scanner import parse_targets + with pytest.raises(SystemExit) as excinfo: + parse_targets('{"invalid"') + assert excinfo.value.code == 1 + +def test_scan_repository_multi_targets(tmp_path): + # Setup files in tmp repository + file1 = tmp_path / "packages" / "pkg1" / "setup.py" + file1.parent.mkdir(parents=True) + file1.write_text("python_requires = '>=3.7'\n") + + file2 = tmp_path / "packages" / "pkg2" / "requirements.txt" + file2.parent.mkdir(parents=True) + file2.write_text("protobuf==4.25.8\n") + + # Let's mock a config file with rules for both python and protobuf + config_file = tmp_path / "regex_config.yaml" + config_file.write_text(""" +rules: + - name: python_requires_check + applies_to: + - python + rules: + - python_requires\\s*=\\s*['\"]>={version}['\"] + - name: protobuf_check + applies_to: + - protobuf + rules: + - protobuf=={version} +""") + + from version_scanner import ConfigManager, scan_repository + + targets = [("python", "3.7"), ("protobuf", "4.25.8")] + rules = [] + for dep, ver in targets: + cm = ConfigManager(str(config_file), dep, ver) + rules.extend(cm.load_config()) + + results = scan_repository(str(tmp_path), rules, targets=targets) + + # We should have 2 matches + assert len(results) == 2 + + # Match for python + python_match = [r for r in results if r["dependency"] == "python"] + assert len(python_match) == 1 + assert python_match[0]["version"] == "3.7" + assert python_match[0]["rule_name"] == "python_requires_check" + + # Match for protobuf + protobuf_match = [r for r in results if r["dependency"] == "protobuf"] + assert len(protobuf_match) == 1 + assert protobuf_match[0]["version"] == "4.25.8" + assert protobuf_match[0]["rule_name"] == "protobuf_check" + diff --git a/scripts/version_scanner/version_scanner.py b/scripts/version_scanner/version_scanner.py index 484a6eacacae..00508cdb59da 100644 --- a/scripts/version_scanner/version_scanner.py +++ b/scripts/version_scanner/version_scanner.py @@ -137,7 +137,9 @@ def load_config(self) -> List[Dict[str, str]]: resolved_pattern = template.strip().format(**self.variables) resolved_rules.append({ "name": name, - "pattern": resolved_pattern + "pattern": resolved_pattern, + "dependency": self.dependency, + "version": self.version }) except KeyError as e: print(f"Warning: Missing variable for interpolation in rule {name}: {e}", file=sys.stderr) @@ -178,7 +180,9 @@ def scan_file(file_path: str, compiled_rules: List[Dict[str, re.Pattern]]) -> Li "rule_name": rule["name"], "line_number": line_num, "matched_string": match.group(0).strip(), - "context_line": line.strip() + "context_line": line.strip(), + "dependency": rule.get("dependency", ""), + "version": rule.get("version", "") }) except IOError as e: print(f"Warning: Could not read file {file_path}: {e}", file=sys.stderr) @@ -465,10 +469,11 @@ def read_package_file(file_path: str) -> List[str]: def scan_repository( root_path: str, - rules: List[Dict[str, str]], + rules: List[Dict[str, Any]], target_packages: List[str] = None, ignore_dirs: List[str] = None, - version_string: str = None + version_string: str = None, + targets: List[Tuple[str, str]] = None ) -> List[Dict[str, Any]]: """ Scans the repository directory tree applying resolved regex patterns to files. @@ -487,21 +492,30 @@ def scan_repository( performs a full recursive scan of the repository. ignore_dirs: Optional list of directory names or glob-like files to ignore (case-insensitive). version_string: Optional target version string (e.g. "3.7") to scan for in filenames. + targets: Optional list of (dependency, version) tuples. Returns: - A list of dictionaries detailing each match: 'file_path', 'repo_path', - 'package_name', 'rule_name', 'line_number', 'matched_string', 'context_line'. + A list of dictionaries detailing each match. """ ignore_lower = {i.lower() for i in ignore_dirs} if ignore_dirs else set() results = [] + filename_targets = [] + if targets: + filename_targets = targets + elif version_string: + dep = rules[0].get("dependency") if rules else None + filename_targets = [(dep, version_string)] + # Compile patterns once here compiled_rules = [] for rule in rules: try: compiled_rules.append({ "name": rule["name"], - "pattern": re.compile(rule["pattern"], re.IGNORECASE) + "pattern": re.compile(rule["pattern"], re.IGNORECASE), + "dependency": rule.get("dependency", ""), + "version": rule.get("version", "") }) except re.error as e: print(f"Error compiling regex for rule {rule['name']}: {e}", file=sys.stderr) @@ -541,13 +555,16 @@ def scan_repository( matches = scan_file(file_path, compiled_rules) # Add filename match if applicable - if version_string and version_string in file: - matches.append({ - "rule_name": "filename_match", - "line_number": 0, - "matched_string": version_string, - "context_line": f"Filename contains {version_string}" - }) + for dep, ver in filename_targets: + if ver and ver in file: + matches.append({ + "rule_name": "filename_match", + "line_number": 0, + "matched_string": ver, + "context_line": f"Filename contains {ver}", + "dependency": dep or "", + "version": ver + }) # Compute display path and package name rel_file_path = os.path.relpath(file_path, root_path) @@ -576,6 +593,46 @@ def scan_repository( return results +def parse_targets(targets_input: str) -> List[Tuple[str, str]]: + """ + Parses a targets input (file path or inline YAML/JSON string) into a list of (dependency, version) tuples. + """ + raw_targets = {} + content = targets_input + + # Check if the input is a file path + if os.path.exists(targets_input): + try: + with open(targets_input, 'r', encoding='utf-8') as f: + content = f.read() + except PermissionError: + print(f"Error: Permission denied reading targets file: {targets_input}", file=sys.stderr) + sys.exit(1) + except Exception as e: + print(f"Error reading targets file {targets_input}: {e}", file=sys.stderr) + sys.exit(1) + + try: + raw_targets = yaml.safe_load(content) + except Exception as e: + print(f"Error parsing targets YAML/JSON content: {e}", file=sys.stderr) + sys.exit(1) + + if not isinstance(raw_targets, dict): + print("Error: Targets input must resolve to a JSON object or YAML mapping", file=sys.stderr) + sys.exit(1) + + targets = [] + for dep, versions in raw_targets.items(): + if isinstance(versions, list): + for v in versions: + targets.append((str(dep), str(v))) + else: + targets.append((str(dep), str(versions))) + + return targets + + def main(): script_dir = os.path.dirname(os.path.abspath(__file__)) default_config = os.path.join(script_dir, "regex_config.yaml") @@ -586,16 +643,19 @@ def main(): parser.add_argument( "-d", "--dependency", - required=True, help="Name of the dependency (e.g., python, protobuf)" ) parser.add_argument( "-v", "--version", - required=True, help="Specific version to search for (e.g., 3.7, 4.25.8)" ) + parser.add_argument( + "--targets", + help="Path to a YAML/JSON targets file, or an inline YAML/JSON string (e.g. 'python: [3.8, 3.9]')" + ) + parser.add_argument( "-p", "--path", default=".", @@ -659,6 +719,25 @@ def main(): args = parser.parse_args() + # Validation of required inputs + has_single_target = bool(args.dependency and args.version) + has_targets_list = bool(args.targets) + + if not (has_single_target or has_targets_list): + parser.error("Must specify either (-d/--dependency AND -v/--version) OR (--targets)") + if has_single_target and has_targets_list: + parser.error("Cannot specify both single target (-d/-v) and targets list (--targets)") + + targets = [] + if has_targets_list: + targets = parse_targets(args.targets) + else: + targets = [(args.dependency, args.version)] + + if not targets: + print("Error: No targets resolved to scan.", file=sys.stderr) + sys.exit(1) + # Resolve target packages if filtering is requested target_packages = [] if args.package: @@ -670,7 +749,12 @@ def main(): elif args.package_file: target_packages = read_package_file(args.package_file) - print(f"Starting scan for dependency: {args.dependency} version: {args.version}") + if has_targets_list: + print("Starting scan for multiple targets:") + for dep, ver in targets: + print(f" - {dep}: {ver}") + else: + print(f"Starting scan for dependency: {args.dependency} version: {args.version}") print(f"Root path: {args.path}") print("Targets to scan:") if target_packages: @@ -681,8 +765,10 @@ def main(): print(f"Using config: {args.config}") # Load and resolve rules - config_manager = ConfigManager(args.config, args.dependency, args.version) - rules = config_manager.load_config() + rules = [] + for dep, ver in targets: + config_manager = ConfigManager(args.config, dep, ver) + rules.extend(config_manager.load_config()) @@ -695,7 +781,14 @@ def main(): print(f"Loaded {len(ignore_dirs)} ignore patterns from {ignore_file_path}") # Scan repository - all_matches = scan_repository(args.path, rules, target_packages, ignore_dirs, version_string=args.version) + all_matches = scan_repository( + args.path, + rules, + target_packages, + ignore_dirs, + version_string=(None if has_targets_list else args.version), + targets=targets + ) print(f"\nFound {len(all_matches)} matches.") display_matches = all_matches if args.stdout else all_matches[:10] @@ -717,7 +810,14 @@ def main(): script_dir = os.path.dirname(os.path.abspath(__file__)) results_dir = os.path.join(script_dir, "results") os.makedirs(results_dir, exist_ok=True) - output_path = os.path.join(results_dir, f"{args.dependency}-{args.version}-{timestamp}.csv") + if has_targets_list: + if os.path.exists(args.targets): + base_name = os.path.splitext(os.path.basename(args.targets))[0] + else: + base_name = "targets" + output_path = os.path.join(results_dir, f"{base_name}-{timestamp}.csv") + else: + output_path = os.path.join(results_dir, f"{args.dependency}-{args.version}-{timestamp}.csv") write_csv_report(output_path, all_matches) From d26715a8bf9d0e4070e17cd0f959932bd9417d65 Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Wed, 17 Jun 2026 04:52:57 -0400 Subject: [PATCH 02/15] feat(version-scanner): simplify targets list input to accept only YAML files --- .../integration/test_scanner_integration.py | 3 +- .../tests/unit/test_version_scanner.py | 54 ++++++++------ scripts/version_scanner/version_scanner.py | 71 +++++++++---------- 3 files changed, 68 insertions(+), 60 deletions(-) diff --git a/scripts/version_scanner/tests/integration/test_scanner_integration.py b/scripts/version_scanner/tests/integration/test_scanner_integration.py index 36eff2402d38..3ce6d2cd3ab1 100644 --- a/scripts/version_scanner/tests/integration/test_scanner_integration.py +++ b/scripts/version_scanner/tests/integration/test_scanner_integration.py @@ -32,7 +32,8 @@ def test_integration_scan(tmp_path): "-v", "3.7", "-p", data_dir, "--config", config_path, - "-o", "scanner_report.csv" + "-o", "scanner_report.csv", + "--soft-fail" ] result = subprocess.run(cmd, cwd=tmp_path, capture_output=True, text=True, check=True) diff --git a/scripts/version_scanner/tests/unit/test_version_scanner.py b/scripts/version_scanner/tests/unit/test_version_scanner.py index af88816fa1c8..7a8487d67d1d 100644 --- a/scripts/version_scanner/tests/unit/test_version_scanner.py +++ b/scripts/version_scanner/tests/unit/test_version_scanner.py @@ -783,39 +783,49 @@ def test_format_for_console(): assert "python_requires = " not in log_str # Slim format doesn't print context line -def test_parse_targets_inline_json(): - from version_scanner import parse_targets - json_str = '{"python": ["3.7", "3.8"], "protobuf": "4.25.8"}' - targets = parse_targets(json_str) - assert targets == [("python", "3.7"), ("python", "3.8"), ("protobuf", "4.25.8")] - -def test_parse_targets_inline_yaml(): - from version_scanner import parse_targets - yaml_str = """ +def test_parse_targets_file(tmp_path): + from version_scanner import parse_targets_file + yaml_file = tmp_path / "targets.yaml" + yaml_file.write_text(""" python: - "3.7" - "3.8" protobuf: "4.25.8" -""" - targets = parse_targets(yaml_str) +""") + targets = parse_targets_file(str(yaml_file)) assert targets == [("python", "3.7"), ("python", "3.8"), ("protobuf", "4.25.8")] -def test_parse_targets_from_file(tmp_path): - from version_scanner import parse_targets +def test_parse_targets_file_not_found(): + from version_scanner import parse_targets_file + with pytest.raises(SystemExit) as excinfo: + parse_targets_file("nonexistent_file.yaml") + assert excinfo.value.code == 1 + +def test_parse_targets_file_invalid_yaml(tmp_path): + from version_scanner import parse_targets_file + yaml_file = tmp_path / "targets.yaml" + yaml_file.write_text("invalid: {") + with pytest.raises(SystemExit) as excinfo: + parse_targets_file(str(yaml_file)) + assert excinfo.value.code == 1 + +def test_parse_targets_file_invalid_structure(tmp_path): + from version_scanner import parse_targets_file + yaml_file = tmp_path / "targets.yaml" + yaml_file.write_text("- not_a_mapping") + with pytest.raises(SystemExit) as excinfo: + parse_targets_file(str(yaml_file)) + assert excinfo.value.code == 1 + +def test_parse_targets_file_invalid_version_type(tmp_path): + from version_scanner import parse_targets_file yaml_file = tmp_path / "targets.yaml" yaml_file.write_text(""" python: - - "3.7" - - "3.8" -protobuf: "4.25.8" + - null """) - targets = parse_targets(str(yaml_file)) - assert targets == [("python", "3.7"), ("python", "3.8"), ("protobuf", "4.25.8")] - -def test_parse_targets_invalid_syntax(): - from version_scanner import parse_targets with pytest.raises(SystemExit) as excinfo: - parse_targets('{"invalid"') + parse_targets_file(str(yaml_file)) assert excinfo.value.code == 1 def test_scan_repository_multi_targets(tmp_path): diff --git a/scripts/version_scanner/version_scanner.py b/scripts/version_scanner/version_scanner.py index 00508cdb59da..a0d9c7956965 100644 --- a/scripts/version_scanner/version_scanner.py +++ b/scripts/version_scanner/version_scanner.py @@ -593,46 +593,46 @@ def scan_repository( return results -def parse_targets(targets_input: str) -> List[Tuple[str, str]]: +def parse_targets_file(file_path: str) -> List[Tuple[str, str]]: """ - Parses a targets input (file path or inline YAML/JSON string) into a list of (dependency, version) tuples. + Parses a YAML targets file into a list of (dependency, version) tuples. """ - raw_targets = {} - content = targets_input - - # Check if the input is a file path - if os.path.exists(targets_input): - try: - with open(targets_input, 'r', encoding='utf-8') as f: - content = f.read() - except PermissionError: - print(f"Error: Permission denied reading targets file: {targets_input}", file=sys.stderr) - sys.exit(1) - except Exception as e: - print(f"Error reading targets file {targets_input}: {e}", file=sys.stderr) - sys.exit(1) - + if not os.path.exists(file_path): + print(f"Error: Targets file not found: {file_path}", file=sys.stderr) + sys.exit(1) + try: - raw_targets = yaml.safe_load(content) + with open(file_path, 'r', encoding='utf-8') as f: + raw_targets = yaml.safe_load(f) + except PermissionError: + print(f"Error: Permission denied reading targets file: {file_path}", file=sys.stderr) + sys.exit(1) except Exception as e: - print(f"Error parsing targets YAML/JSON content: {e}", file=sys.stderr) + print(f"Error reading or parsing targets file {file_path}: {e}", file=sys.stderr) sys.exit(1) if not isinstance(raw_targets, dict): - print("Error: Targets input must resolve to a JSON object or YAML mapping", file=sys.stderr) + print("Error: Targets file content must resolve to a YAML mapping", file=sys.stderr) sys.exit(1) targets = [] for dep, versions in raw_targets.items(): if isinstance(versions, list): for v in versions: + if v is None or isinstance(v, (dict, list)): + print(f"Error: Invalid version '{v}' for dependency '{dep}'", file=sys.stderr) + sys.exit(1) targets.append((str(dep), str(v))) - else: + elif versions is not None and not isinstance(versions, dict): targets.append((str(dep), str(versions))) + else: + print(f"Error: Invalid version '{versions}' for dependency '{dep}'", file=sys.stderr) + sys.exit(1) return targets + def main(): script_dir = os.path.dirname(os.path.abspath(__file__)) default_config = os.path.join(script_dir, "regex_config.yaml") @@ -652,8 +652,8 @@ def main(): ) parser.add_argument( - "--targets", - help="Path to a YAML/JSON targets file, or an inline YAML/JSON string (e.g. 'python: [3.8, 3.9]')" + "--targets-file", + help="Path to a YAML file containing target dependencies and versions." ) parser.add_argument( @@ -721,16 +721,16 @@ def main(): # Validation of required inputs has_single_target = bool(args.dependency and args.version) - has_targets_list = bool(args.targets) + has_targets_file = bool(args.targets_file) - if not (has_single_target or has_targets_list): - parser.error("Must specify either (-d/--dependency AND -v/--version) OR (--targets)") - if has_single_target and has_targets_list: - parser.error("Cannot specify both single target (-d/-v) and targets list (--targets)") + if not (has_single_target or has_targets_file): + parser.error("Must specify either (-d/--dependency AND -v/--version) OR (--targets-file)") + if has_single_target and has_targets_file: + parser.error("Cannot specify both single target (-d/-v) and targets file (--targets-file)") targets = [] - if has_targets_list: - targets = parse_targets(args.targets) + if has_targets_file: + targets = parse_targets_file(args.targets_file) else: targets = [(args.dependency, args.version)] @@ -749,7 +749,7 @@ def main(): elif args.package_file: target_packages = read_package_file(args.package_file) - if has_targets_list: + if has_targets_file: print("Starting scan for multiple targets:") for dep, ver in targets: print(f" - {dep}: {ver}") @@ -786,7 +786,7 @@ def main(): rules, target_packages, ignore_dirs, - version_string=(None if has_targets_list else args.version), + version_string=(None if has_targets_file else args.version), targets=targets ) @@ -810,11 +810,8 @@ def main(): script_dir = os.path.dirname(os.path.abspath(__file__)) results_dir = os.path.join(script_dir, "results") os.makedirs(results_dir, exist_ok=True) - if has_targets_list: - if os.path.exists(args.targets): - base_name = os.path.splitext(os.path.basename(args.targets))[0] - else: - base_name = "targets" + if has_targets_file: + base_name = os.path.splitext(os.path.basename(args.targets_file))[0] output_path = os.path.join(results_dir, f"{base_name}-{timestamp}.csv") else: output_path = os.path.join(results_dir, f"{args.dependency}-{args.version}-{timestamp}.csv") From 156f2b80001745de8352c4cb98dfc38520cc45c5 Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Wed, 17 Jun 2026 05:00:48 -0400 Subject: [PATCH 03/15] test(version-scanner): parametrize targets file failure tests --- .../tests/unit/test_version_scanner.py | 48 ++++++++----------- 1 file changed, 19 insertions(+), 29 deletions(-) diff --git a/scripts/version_scanner/tests/unit/test_version_scanner.py b/scripts/version_scanner/tests/unit/test_version_scanner.py index 7a8487d67d1d..325f510abdbe 100644 --- a/scripts/version_scanner/tests/unit/test_version_scanner.py +++ b/scripts/version_scanner/tests/unit/test_version_scanner.py @@ -795,37 +795,27 @@ def test_parse_targets_file(tmp_path): targets = parse_targets_file(str(yaml_file)) assert targets == [("python", "3.7"), ("python", "3.8"), ("protobuf", "4.25.8")] -def test_parse_targets_file_not_found(): - from version_scanner import parse_targets_file - with pytest.raises(SystemExit) as excinfo: - parse_targets_file("nonexistent_file.yaml") - assert excinfo.value.code == 1 - -def test_parse_targets_file_invalid_yaml(tmp_path): - from version_scanner import parse_targets_file - yaml_file = tmp_path / "targets.yaml" - yaml_file.write_text("invalid: {") - with pytest.raises(SystemExit) as excinfo: - parse_targets_file(str(yaml_file)) - assert excinfo.value.code == 1 - -def test_parse_targets_file_invalid_structure(tmp_path): - from version_scanner import parse_targets_file - yaml_file = tmp_path / "targets.yaml" - yaml_file.write_text("- not_a_mapping") - with pytest.raises(SystemExit) as excinfo: - parse_targets_file(str(yaml_file)) - assert excinfo.value.code == 1 - -def test_parse_targets_file_invalid_version_type(tmp_path): +@pytest.mark.parametrize( + "file_content, file_exists", + [ + (None, False), # File not found + ("invalid: {", True), # Invalid YAML + ("- not_a_mapping", True), # Invalid structure (list instead of map) + ("python:\n - null", True), # Invalid version type (null/None value) + ] +) +def test_parse_targets_file_failures(tmp_path, file_content, file_exists): from version_scanner import parse_targets_file - yaml_file = tmp_path / "targets.yaml" - yaml_file.write_text(""" -python: - - null -""") + + if file_exists: + yaml_file = tmp_path / "targets_failures.yaml" + yaml_file.write_text(file_content) + path = str(yaml_file) + else: + path = "nonexistent_file.yaml" + with pytest.raises(SystemExit) as excinfo: - parse_targets_file(str(yaml_file)) + parse_targets_file(path) assert excinfo.value.code == 1 def test_scan_repository_multi_targets(tmp_path): From f1c47bb71a78d273feeb51b6f81b9084cabf8c00 Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Wed, 17 Jun 2026 05:09:00 -0400 Subject: [PATCH 04/15] test(version-scanner): refactor formatting tests to use a shared sample_match fixture --- .../tests/unit/test_version_scanner.py | 78 +++++-------------- 1 file changed, 19 insertions(+), 59 deletions(-) diff --git a/scripts/version_scanner/tests/unit/test_version_scanner.py b/scripts/version_scanner/tests/unit/test_version_scanner.py index 325f510abdbe..1d35ec70f5e6 100644 --- a/scripts/version_scanner/tests/unit/test_version_scanner.py +++ b/scripts/version_scanner/tests/unit/test_version_scanner.py @@ -682,21 +682,9 @@ def test_safe_int(): assert _safe_int(None) == 0 assert _safe_int("abc") == 0 -def test_format_for_raw_csv_handles_empty_line_number(): - match = { - "file_path": "google-cloud-python/main/packages/pkg_a/setup.py", - "repo_path": "packages/pkg_a/setup.py", - "package_name": "pkg_a", - "rule_name": "python_requires_check", - "line_number": "", - "matched_string": "3.7", - "context_line": "python_requires = '>=3.7'" - } - formatted = format_for_raw_csv(match) - assert formatted["line_number"] == 0 - -def test_format_for_raw_csv(): - match = { +@pytest.fixture +def sample_match(): + return { "file_name": "setup.py", "file_path": "google-cloud-python/main/packages/pkg_a/setup.py", "repo_path": "packages/pkg_a/setup.py", @@ -708,8 +696,14 @@ def test_format_for_raw_csv(): "dependency": "python", "version": "3.7" } - - formatted = format_for_raw_csv(match) + +def test_format_for_raw_csv_handles_empty_line_number(sample_match): + sample_match["line_number"] = "" + formatted = format_for_raw_csv(sample_match) + assert formatted["line_number"] == 0 + +def test_format_for_raw_csv(sample_match): + formatted = format_for_raw_csv(sample_match) assert formatted["file_name"] == "setup.py" assert formatted["file_path"] == "google-cloud-python/main/packages/pkg_a/setup.py" @@ -721,38 +715,14 @@ def test_format_for_raw_csv(): assert formatted["dependency"] == "python" assert formatted["version"] == "3.7" -def test_format_for_raw_csv_fallback_filename(): - match = { - "file_path": "google-cloud-python/main/packages/pkg_a/setup.py", - "repo_path": "packages/pkg_a/setup.py", - "package_name": "pkg_a", - "rule_name": "python_requires_check", - "line_number": "123", - "matched_string": "3.7", - "context_line": "python_requires = '>=3.7'", - "dependency": "python", - "version": "3.7" - } - - formatted = format_for_raw_csv(match) +def test_format_for_raw_csv_fallback_filename(sample_match): + del sample_match["file_name"] + formatted = format_for_raw_csv(sample_match) assert formatted["file_name"] == "setup.py" -def test_format_for_spreadsheet(): - match = { - "file_name": "setup.py", - "file_path": "google-cloud-python/main/packages/pkg_a/setup.py", - "repo_path": "packages/pkg_a/setup.py", - "package_name": "pkg_a", - "rule_name": "python_requires_check", - "line_number": 123, - "matched_string": "3.7", - "context_line": "python_requires = '>=3.7'", - "dependency": "python", - "version": "3.7" - } - +def test_format_for_spreadsheet(sample_match): # Without github_repo - formatted_no_repo = format_for_spreadsheet(match) + formatted_no_repo = format_for_spreadsheet(sample_match) assert formatted_no_repo["file_name"] == "setup.py" assert formatted_no_repo["line_number"] == 123 assert formatted_no_repo["matched_string"] == '="3.7"' # Decimal protection formula @@ -760,23 +730,13 @@ def test_format_for_spreadsheet(): assert formatted_no_repo["version"] == "3.7" # With github_repo - formatted_repo = format_for_spreadsheet(match, github_repo="https://github.com/user/repo", branch="main") + formatted_repo = format_for_spreadsheet(sample_match, github_repo="https://github.com/user/repo", branch="main") expected_url = "https://github.com/user/repo/blob/main/packages/pkg_a/setup.py#L123" assert formatted_repo["line_number"] == f'=HYPERLINK("{expected_url}", "123")' assert formatted_repo["matched_string"] == '="3.7"' -def test_format_for_console(): - match = { - "file_path": "google-cloud-python/main/packages/pkg_a/setup.py", - "repo_path": "packages/pkg_a/setup.py", - "package_name": "pkg_a", - "rule_name": "python_requires_check", - "line_number": 123, - "matched_string": "3.7", - "context_line": "python_requires = '>=3.7'" - } - - log_str = format_for_console(match) +def test_format_for_console(sample_match): + log_str = format_for_console(sample_match) assert "google-cloud-python/main/packages/pkg_a/setup.py:123" in log_str assert "[python_requires_check]" in log_str assert "3.7" in log_str From 3c0b8fe2381d527c5c5ea2442de0456ed44cb958 Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Wed, 17 Jun 2026 05:25:41 -0400 Subject: [PATCH 05/15] chore(version-scanner): move sample_match fixture to the top of test file --- .../tests/unit/test_version_scanner.py | 31 ++++++++++--------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/scripts/version_scanner/tests/unit/test_version_scanner.py b/scripts/version_scanner/tests/unit/test_version_scanner.py index 1d35ec70f5e6..fd2904db00c2 100644 --- a/scripts/version_scanner/tests/unit/test_version_scanner.py +++ b/scripts/version_scanner/tests/unit/test_version_scanner.py @@ -32,6 +32,22 @@ format_for_console ) +@pytest.fixture +def sample_match(): + return { + "file_name": "setup.py", + "file_path": "google-cloud-python/main/packages/pkg_a/setup.py", + "repo_path": "packages/pkg_a/setup.py", + "package_name": "pkg_a", + "rule_name": "python_requires_check", + "line_number": "123", + "matched_string": "3.7", + "context_line": "python_requires = '>=3.7'", + "dependency": "python", + "version": "3.7" + } + + # Test ConfigManager @pytest.mark.parametrize("dependency, version, expected", [ ( @@ -682,21 +698,6 @@ def test_safe_int(): assert _safe_int(None) == 0 assert _safe_int("abc") == 0 -@pytest.fixture -def sample_match(): - return { - "file_name": "setup.py", - "file_path": "google-cloud-python/main/packages/pkg_a/setup.py", - "repo_path": "packages/pkg_a/setup.py", - "package_name": "pkg_a", - "rule_name": "python_requires_check", - "line_number": "123", - "matched_string": "3.7", - "context_line": "python_requires = '>=3.7'", - "dependency": "python", - "version": "3.7" - } - def test_format_for_raw_csv_handles_empty_line_number(sample_match): sample_match["line_number"] = "" formatted = format_for_raw_csv(sample_match) From 00d409d4aed3a32c7d83813e2566fbd4ef65ddf0 Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Wed, 17 Jun 2026 05:43:54 -0400 Subject: [PATCH 06/15] refactor(version-scanner): consolidate file reading and error handling under _safe_read_file --- scripts/version_scanner/version_scanner.py | 109 +++++++++++++-------- 1 file changed, 66 insertions(+), 43 deletions(-) diff --git a/scripts/version_scanner/version_scanner.py b/scripts/version_scanner/version_scanner.py index a0d9c7956965..6205e8effadd 100644 --- a/scripts/version_scanner/version_scanner.py +++ b/scripts/version_scanner/version_scanner.py @@ -23,9 +23,57 @@ import os import re import sys -from typing import Dict, List, Tuple, Any +from typing import Dict, List, Tuple, Any, Optional import yaml + +def _safe_read_file( + file_path: str, + required: bool = True, + description: str = "file", + silent_missing: bool = False +) -> Optional[str]: + """ + Safely reads file content and handles common file errors. + + Args: + file_path: Path to the file. + required: If True, exits the program with code 1 on read failure. + If False, prints a warning (or ignores) and returns None. + description: Description of the file type for error logging. + silent_missing: If True, silently ignores FileNotFoundError (returns None). + + Returns: + The file content string, or None if reading failed/was ignored. + """ + try: + with open(file_path, 'r', encoding='utf-8') as f: + return f.read() + except FileNotFoundError: + if silent_missing: + return None + if required: + print(f"Error: {description.capitalize()} not found: {file_path}", file=sys.stderr) + sys.exit(1) + else: + print(f"Warning: {description.capitalize()} not found: {file_path}", file=sys.stderr) + return None + except PermissionError: + if required: + print(f"Error: Permission denied reading {description}: {file_path}", file=sys.stderr) + sys.exit(1) + else: + print(f"Warning: Permission denied reading {description}: {file_path}", file=sys.stderr) + return None + except IOError as e: + if required: + print(f"Error reading {description} {file_path}: {e}", file=sys.stderr) + sys.exit(1) + else: + print(f"Warning: Error reading {description} {file_path}: {e}", file=sys.stderr) + return None + + class ConfigManager: """ Handles loading, validation, and interpolation of the regex configuration rules. @@ -106,15 +154,9 @@ def _compute_variables(self) -> Dict[str, str]: def load_config(self) -> List[Dict[str, str]]: """Load and resolve rules from config.""" + content = _safe_read_file(self.config_path, required=True, description="config file") try: - with open(self.config_path, 'r', encoding='utf-8') as f: - config = yaml.safe_load(f) - except FileNotFoundError: - print(f"Error: Config file not found: {self.config_path}", file=sys.stderr) - sys.exit(1) - except PermissionError: - print(f"Error: Permission denied reading config file: {self.config_path}", file=sys.stderr) - sys.exit(1) + config = yaml.safe_load(content) except yaml.YAMLError as e: print(f"Error parsing config file: {e}", file=sys.stderr) sys.exit(1) @@ -330,14 +372,12 @@ def load_ignore_file(file_path: str) -> List[str]: Read ignore paths from a file. """ ignore_dirs = [] - try: - with open(file_path, 'r', encoding='utf-8') as f: - for line in f: - line = line.strip() - if line and not line.startswith('#'): - ignore_dirs.append(line) - except FileNotFoundError: - pass + content = _safe_read_file(file_path, required=False, silent_missing=True) + if content: + for line in content.splitlines(): + line = line.strip() + if line and not line.startswith('#'): + ignore_dirs.append(line) return ignore_dirs @@ -449,21 +489,12 @@ def read_package_file(file_path: str) -> List[str]: A list of package paths. """ packages = [] - try: - with open(file_path, 'r', encoding='utf-8') as f: - for line in f: - line = line.strip() - if line and not line.startswith('#'): - packages.append(line) - except FileNotFoundError: - print(f"Error: Package file not found: {file_path}", file=sys.stderr) - sys.exit(1) - except PermissionError: - print(f"Error: Permission denied reading package file: {file_path}", file=sys.stderr) - sys.exit(1) - except IOError as e: - print(f"Error reading package file: {e}", file=sys.stderr) - sys.exit(1) + content = _safe_read_file(file_path, required=True, description="package file") + if content: + for line in content.splitlines(): + line = line.strip() + if line and not line.startswith('#'): + packages.append(line) return packages @@ -597,18 +628,11 @@ def parse_targets_file(file_path: str) -> List[Tuple[str, str]]: """ Parses a YAML targets file into a list of (dependency, version) tuples. """ - if not os.path.exists(file_path): - print(f"Error: Targets file not found: {file_path}", file=sys.stderr) - sys.exit(1) - + content = _safe_read_file(file_path, required=True, description="targets file") try: - with open(file_path, 'r', encoding='utf-8') as f: - raw_targets = yaml.safe_load(f) - except PermissionError: - print(f"Error: Permission denied reading targets file: {file_path}", file=sys.stderr) - sys.exit(1) + raw_targets = yaml.safe_load(content) except Exception as e: - print(f"Error reading or parsing targets file {file_path}: {e}", file=sys.stderr) + print(f"Error parsing targets YAML mapping: {e}", file=sys.stderr) sys.exit(1) if not isinstance(raw_targets, dict): @@ -632,7 +656,6 @@ def parse_targets_file(file_path: str) -> List[Tuple[str, str]]: return targets - def main(): script_dir = os.path.dirname(os.path.abspath(__file__)) default_config = os.path.join(script_dir, "regex_config.yaml") From 437a13ffa9d7f0b9c9144cb854ed4c1695d8ad29 Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Wed, 17 Jun 2026 05:47:21 -0400 Subject: [PATCH 07/15] test(version-scanner): add parametrized unit tests for _safe_read_file helper --- .../tests/unit/test_version_scanner.py | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/scripts/version_scanner/tests/unit/test_version_scanner.py b/scripts/version_scanner/tests/unit/test_version_scanner.py index fd2904db00c2..054df22421bc 100644 --- a/scripts/version_scanner/tests/unit/test_version_scanner.py +++ b/scripts/version_scanner/tests/unit/test_version_scanner.py @@ -48,6 +48,44 @@ def sample_match(): } +@pytest.mark.parametrize( + "exception_to_raise, required, silent_missing, expected_exit, expected_output, expected_return", + [ + (None, True, False, False, None, "file content"), # Success + (FileNotFoundError(), True, True, False, None, None), # Silent missing FileNotFoundError + (FileNotFoundError(), True, False, True, "Error: Test_desc not found", None), # Required FileNotFoundError + (FileNotFoundError(), False, False, False, "Warning: Test_desc not found", None), # Optional FileNotFoundError + (PermissionError(), True, False, True, "Error: Permission denied reading test_desc", None), # Required PermissionError + (PermissionError(), False, False, False, "Warning: Permission denied reading test_desc", None), # Optional PermissionError + (IOError("disk full"), True, False, True, "Error reading test_desc", None), # Required IOError + (IOError("disk full"), False, False, False, "Warning: Error reading test_desc", None), # Optional IOError + ] +) +def test_safe_read_file_scenarios( + capsys, exception_to_raise, required, silent_missing, expected_exit, expected_output, expected_return +): + from version_scanner import _safe_read_file + + if exception_to_raise: + mock_open = mock.mock_open() + mock_open.side_effect = exception_to_raise + else: + mock_open = mock.mock_open(read_data="file content") + + with patch("builtins.open", mock_open): + if expected_exit: + with pytest.raises(SystemExit) as excinfo: + _safe_read_file("dummy.txt", required=required, description="test_desc", silent_missing=silent_missing) + assert excinfo.value.code == 1 + else: + res = _safe_read_file("dummy.txt", required=required, description="test_desc", silent_missing=silent_missing) + assert res == expected_return + + if expected_output: + captured = capsys.readouterr() + assert expected_output in captured.err + + # Test ConfigManager @pytest.mark.parametrize("dependency, version, expected", [ ( From 5011c0b0c9e97b0bfddafced9cbef4a387149da3 Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Tue, 23 Jun 2026 10:22:41 -0400 Subject: [PATCH 08/15] chore(version-scanner): configure GHA to use targets file for multi-version scanning --- .github/workflows/version_scanner.yml | 2 +- scripts/version_scanner/targets.yaml | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) create mode 100644 scripts/version_scanner/targets.yaml diff --git a/.github/workflows/version_scanner.yml b/.github/workflows/version_scanner.yml index 52f813e67995..4e7845e9f447 100644 --- a/.github/workflows/version_scanner.yml +++ b/.github/workflows/version_scanner.yml @@ -35,7 +35,7 @@ jobs: # Uses -o to output a detailed, raw CSV to a file # Uses --stdout to print a slim, easier to parse summary to the GitHub Actions UI # Uses --soft-fail to temporarily limit causing CI/CD failures during the migration to full operation. - python scripts/version_scanner/version_scanner.py -d python -v 3.7 --stdout -o version_scanner_output.csv --soft-fail + python scripts/version_scanner/version_scanner.py --targets-file scripts/version_scanner/targets.yaml --stdout -o version_scanner_output.csv --soft-fail - name: Upload CSV Results if: always() diff --git a/scripts/version_scanner/targets.yaml b/scripts/version_scanner/targets.yaml new file mode 100644 index 000000000000..0f50b31fd48f --- /dev/null +++ b/scripts/version_scanner/targets.yaml @@ -0,0 +1,4 @@ +python: + - "3.7" + - "3.8" + - "3.9" From 8fc7e61afdfff26c8ace6eb3aacd3855586771a4 Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Tue, 23 Jun 2026 10:24:56 -0400 Subject: [PATCH 09/15] chore(version-scanner): limit GHA workflow to scan only handwritten and hybrid packages --- .github/workflows/version_scanner.yml | 2 +- .../python-310-package-list.txt | 31 +++++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) create mode 100644 scripts/version_scanner/python-310-package-list.txt diff --git a/.github/workflows/version_scanner.yml b/.github/workflows/version_scanner.yml index 4e7845e9f447..52aea19dca59 100644 --- a/.github/workflows/version_scanner.yml +++ b/.github/workflows/version_scanner.yml @@ -35,7 +35,7 @@ jobs: # Uses -o to output a detailed, raw CSV to a file # Uses --stdout to print a slim, easier to parse summary to the GitHub Actions UI # Uses --soft-fail to temporarily limit causing CI/CD failures during the migration to full operation. - python scripts/version_scanner/version_scanner.py --targets-file scripts/version_scanner/targets.yaml --stdout -o version_scanner_output.csv --soft-fail + python scripts/version_scanner/version_scanner.py --targets-file scripts/version_scanner/targets.yaml --package-file scripts/version_scanner/python-310-package-list.txt --stdout -o version_scanner_output.csv --soft-fail - name: Upload CSV Results if: always() diff --git a/scripts/version_scanner/python-310-package-list.txt b/scripts/version_scanner/python-310-package-list.txt new file mode 100644 index 000000000000..bfc1e3fe8658 --- /dev/null +++ b/scripts/version_scanner/python-310-package-list.txt @@ -0,0 +1,31 @@ +packages/bigframes +packages/bigquery-magics +packages/db-dtypes +packages/django-google-spanner +packages/gapic-generator +packages/google-api-core +# packages/google-api-python-client # non-monorepo, ignore for now. +packages/google-auth +packages/google-auth-httplib2 +packages/google-auth-oauthlib +packages/google-cloud-bigquery +packages/pandas-gbq +packages/google-cloud-bigtable +packages/google-cloud-core +packages/google-crc32c +packages/google-cloud-datastore +packages/google-cloud-dns +packages/google-cloud-documentai-toolbox +packages/google-cloud-error-reporting +packages/google-cloud-firestore +packages/google-cloud-logging +packages/google-cloud-ndb +packages/google-cloud-pubsub +packages/google-cloud-runtimeconfig +packages/google-cloud-spanner +packages/google-cloud-storage +packages/google-cloud-testutils +packages/google-resumable-media +packages/proto-plus +packages/sqlalchemy-bigquery +packages/sqlalchemy-spanner From 9f612dade16ad0ebd892971ae8d5f11e2e61601b Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Tue, 23 Jun 2026 11:20:59 -0400 Subject: [PATCH 10/15] refactor(version-scanner): rename targets file to matrix file to resolve ambiguity --- .github/workflows/version_scanner.yml | 2 +- .../{targets.yaml => matrix.yaml} | 0 .../tests/unit/test_version_scanner.py | 16 ++++---- scripts/version_scanner/version_scanner.py | 40 +++++++++---------- 4 files changed, 29 insertions(+), 29 deletions(-) rename scripts/version_scanner/{targets.yaml => matrix.yaml} (100%) diff --git a/.github/workflows/version_scanner.yml b/.github/workflows/version_scanner.yml index 52aea19dca59..aec449a54fe3 100644 --- a/.github/workflows/version_scanner.yml +++ b/.github/workflows/version_scanner.yml @@ -35,7 +35,7 @@ jobs: # Uses -o to output a detailed, raw CSV to a file # Uses --stdout to print a slim, easier to parse summary to the GitHub Actions UI # Uses --soft-fail to temporarily limit causing CI/CD failures during the migration to full operation. - python scripts/version_scanner/version_scanner.py --targets-file scripts/version_scanner/targets.yaml --package-file scripts/version_scanner/python-310-package-list.txt --stdout -o version_scanner_output.csv --soft-fail + python scripts/version_scanner/version_scanner.py --matrix-file scripts/version_scanner/matrix.yaml --package-file scripts/version_scanner/python-310-package-list.txt --stdout -o version_scanner_output.csv --soft-fail - name: Upload CSV Results if: always() diff --git a/scripts/version_scanner/targets.yaml b/scripts/version_scanner/matrix.yaml similarity index 100% rename from scripts/version_scanner/targets.yaml rename to scripts/version_scanner/matrix.yaml diff --git a/scripts/version_scanner/tests/unit/test_version_scanner.py b/scripts/version_scanner/tests/unit/test_version_scanner.py index 054df22421bc..4d7fbebe9401 100644 --- a/scripts/version_scanner/tests/unit/test_version_scanner.py +++ b/scripts/version_scanner/tests/unit/test_version_scanner.py @@ -782,16 +782,16 @@ def test_format_for_console(sample_match): assert "python_requires = " not in log_str # Slim format doesn't print context line -def test_parse_targets_file(tmp_path): - from version_scanner import parse_targets_file - yaml_file = tmp_path / "targets.yaml" +def test_parse_matrix_file(tmp_path): + from version_scanner import parse_matrix_file + yaml_file = tmp_path / "matrix.yaml" yaml_file.write_text(""" python: - "3.7" - "3.8" protobuf: "4.25.8" """) - targets = parse_targets_file(str(yaml_file)) + targets = parse_matrix_file(str(yaml_file)) assert targets == [("python", "3.7"), ("python", "3.8"), ("protobuf", "4.25.8")] @pytest.mark.parametrize( @@ -803,18 +803,18 @@ def test_parse_targets_file(tmp_path): ("python:\n - null", True), # Invalid version type (null/None value) ] ) -def test_parse_targets_file_failures(tmp_path, file_content, file_exists): - from version_scanner import parse_targets_file +def test_parse_matrix_file_failures(tmp_path, file_content, file_exists): + from version_scanner import parse_matrix_file if file_exists: - yaml_file = tmp_path / "targets_failures.yaml" + yaml_file = tmp_path / "matrix_failures.yaml" yaml_file.write_text(file_content) path = str(yaml_file) else: path = "nonexistent_file.yaml" with pytest.raises(SystemExit) as excinfo: - parse_targets_file(path) + parse_matrix_file(path) assert excinfo.value.code == 1 def test_scan_repository_multi_targets(tmp_path): diff --git a/scripts/version_scanner/version_scanner.py b/scripts/version_scanner/version_scanner.py index 6205e8effadd..5b7d2196c3a2 100644 --- a/scripts/version_scanner/version_scanner.py +++ b/scripts/version_scanner/version_scanner.py @@ -624,23 +624,23 @@ def scan_repository( return results -def parse_targets_file(file_path: str) -> List[Tuple[str, str]]: +def parse_matrix_file(file_path: str) -> List[Tuple[str, str]]: """ - Parses a YAML targets file into a list of (dependency, version) tuples. + Parses a YAML matrix file into a list of (dependency, version) tuples. """ - content = _safe_read_file(file_path, required=True, description="targets file") + content = _safe_read_file(file_path, required=True, description="matrix file") try: - raw_targets = yaml.safe_load(content) + raw_matrix = yaml.safe_load(content) except Exception as e: - print(f"Error parsing targets YAML mapping: {e}", file=sys.stderr) + print(f"Error parsing matrix YAML mapping: {e}", file=sys.stderr) sys.exit(1) - if not isinstance(raw_targets, dict): - print("Error: Targets file content must resolve to a YAML mapping", file=sys.stderr) + if not isinstance(raw_matrix, dict): + print("Error: Matrix file content must resolve to a YAML mapping", file=sys.stderr) sys.exit(1) targets = [] - for dep, versions in raw_targets.items(): + for dep, versions in raw_matrix.items(): if isinstance(versions, list): for v in versions: if v is None or isinstance(v, (dict, list)): @@ -675,7 +675,7 @@ def main(): ) parser.add_argument( - "--targets-file", + "-m", "--matrix-file", help="Path to a YAML file containing target dependencies and versions." ) @@ -744,16 +744,16 @@ def main(): # Validation of required inputs has_single_target = bool(args.dependency and args.version) - has_targets_file = bool(args.targets_file) + has_matrix_file = bool(args.matrix_file) - if not (has_single_target or has_targets_file): - parser.error("Must specify either (-d/--dependency AND -v/--version) OR (--targets-file)") - if has_single_target and has_targets_file: - parser.error("Cannot specify both single target (-d/-v) and targets file (--targets-file)") + if not (has_single_target or has_matrix_file): + parser.error("Must specify either (-d/--dependency AND -v/--version) OR (-m/--matrix-file)") + if has_single_target and has_matrix_file: + parser.error("Cannot specify both single target (-d/-v) and matrix file (-m/--matrix-file)") targets = [] - if has_targets_file: - targets = parse_targets_file(args.targets_file) + if has_matrix_file: + targets = parse_matrix_file(args.matrix_file) else: targets = [(args.dependency, args.version)] @@ -772,7 +772,7 @@ def main(): elif args.package_file: target_packages = read_package_file(args.package_file) - if has_targets_file: + if has_matrix_file: print("Starting scan for multiple targets:") for dep, ver in targets: print(f" - {dep}: {ver}") @@ -809,7 +809,7 @@ def main(): rules, target_packages, ignore_dirs, - version_string=(None if has_targets_file else args.version), + version_string=(None if has_matrix_file else args.version), targets=targets ) @@ -833,8 +833,8 @@ def main(): script_dir = os.path.dirname(os.path.abspath(__file__)) results_dir = os.path.join(script_dir, "results") os.makedirs(results_dir, exist_ok=True) - if has_targets_file: - base_name = os.path.splitext(os.path.basename(args.targets_file))[0] + if has_matrix_file: + base_name = os.path.splitext(os.path.basename(args.matrix_file))[0] output_path = os.path.join(results_dir, f"{base_name}-{timestamp}.csv") else: output_path = os.path.join(results_dir, f"{args.dependency}-{args.version}-{timestamp}.csv") From d83549145e4f3233488199f03152b19435eb5c93 Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Tue, 23 Jun 2026 11:31:17 -0400 Subject: [PATCH 11/15] refactor(version-scanner): rename package list file to example-list-non-generated-packages --- .github/workflows/version_scanner.yml | 2 +- ...age-list.txt => example-list-non-generated-packages.txt} | 0 scripts/version_scanner/small_package_list.txt | 6 ------ 3 files changed, 1 insertion(+), 7 deletions(-) rename scripts/version_scanner/{python-310-package-list.txt => example-list-non-generated-packages.txt} (100%) delete mode 100644 scripts/version_scanner/small_package_list.txt diff --git a/.github/workflows/version_scanner.yml b/.github/workflows/version_scanner.yml index aec449a54fe3..078e4259e491 100644 --- a/.github/workflows/version_scanner.yml +++ b/.github/workflows/version_scanner.yml @@ -35,7 +35,7 @@ jobs: # Uses -o to output a detailed, raw CSV to a file # Uses --stdout to print a slim, easier to parse summary to the GitHub Actions UI # Uses --soft-fail to temporarily limit causing CI/CD failures during the migration to full operation. - python scripts/version_scanner/version_scanner.py --matrix-file scripts/version_scanner/matrix.yaml --package-file scripts/version_scanner/python-310-package-list.txt --stdout -o version_scanner_output.csv --soft-fail + python scripts/version_scanner/version_scanner.py --matrix-file scripts/version_scanner/matrix.yaml --package-file scripts/version_scanner/example-list-non-generated-packages.txt --stdout -o version_scanner_output.csv --soft-fail - name: Upload CSV Results if: always() diff --git a/scripts/version_scanner/python-310-package-list.txt b/scripts/version_scanner/example-list-non-generated-packages.txt similarity index 100% rename from scripts/version_scanner/python-310-package-list.txt rename to scripts/version_scanner/example-list-non-generated-packages.txt diff --git a/scripts/version_scanner/small_package_list.txt b/scripts/version_scanner/small_package_list.txt deleted file mode 100644 index 8c9a4f39e879..000000000000 --- a/scripts/version_scanner/small_package_list.txt +++ /dev/null @@ -1,6 +0,0 @@ -# Example package list for filtering scanning targets via the --package-file option. -packages/google-cloud-access-context-manager -packages/google-cloud-bigtable -packages/google-cloud-biglake-hive -packages/google-cloud-documentai-toolbox -packages/google-cloud-core From 66eabc13ee0984c7963bb6393d28cfeb2680f1a2 Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Tue, 23 Jun 2026 11:57:49 -0400 Subject: [PATCH 12/15] fix(version-scanner): address reviewer feedback regarding encoding, float versions, and argument validation --- .../tests/unit/test_version_scanner.py | 27 +++++++++++++++++++ scripts/version_scanner/version_scanner.py | 25 +++++++++-------- 2 files changed, 41 insertions(+), 11 deletions(-) diff --git a/scripts/version_scanner/tests/unit/test_version_scanner.py b/scripts/version_scanner/tests/unit/test_version_scanner.py index 4d7fbebe9401..a151c1c55e11 100644 --- a/scripts/version_scanner/tests/unit/test_version_scanner.py +++ b/scripts/version_scanner/tests/unit/test_version_scanner.py @@ -59,6 +59,8 @@ def sample_match(): (PermissionError(), False, False, False, "Warning: Permission denied reading test_desc", None), # Optional PermissionError (IOError("disk full"), True, False, True, "Error reading test_desc", None), # Required IOError (IOError("disk full"), False, False, False, "Warning: Error reading test_desc", None), # Optional IOError + (ValueError("invalid bytes"), True, False, True, "Error reading test_desc", None), # Required ValueError + (ValueError("invalid bytes"), False, False, False, "Warning: Error reading test_desc", None), # Optional ValueError ] ) def test_safe_read_file_scenarios( @@ -801,6 +803,8 @@ def test_parse_matrix_file(tmp_path): ("invalid: {", True), # Invalid YAML ("- not_a_mapping", True), # Invalid structure (list instead of map) ("python:\n - null", True), # Invalid version type (null/None value) + ("python:\n - 3.10", True), # Invalid version type (float instead of string in list) + ("python: 3.10", True), # Invalid version type (float instead of string) ] ) def test_parse_matrix_file_failures(tmp_path, file_content, file_exists): @@ -868,3 +872,26 @@ def test_scan_repository_multi_targets(tmp_path): assert protobuf_match[0]["version"] == "4.25.8" assert protobuf_match[0]["rule_name"] == "protobuf_check" + +@pytest.mark.parametrize( + "args, expected_error_msg", + [ + # Mixing -m/--matrix-file with -d or -v + (['version_scanner.py', '-m', 'matrix.yaml', '-d', 'python'], "Cannot specify -d/--dependency or -v/--version when using -m/--matrix-file"), + (['version_scanner.py', '-m', 'matrix.yaml', '-v', '3.7'], "Cannot specify -d/--dependency or -v/--version when using -m/--matrix-file"), + (['version_scanner.py', '-m', 'matrix.yaml', '-d', 'python', '-v', '3.7'], "Cannot specify -d/--dependency or -v/--version when using -m/--matrix-file"), + # Missing either -d or -v when not using -m + (['version_scanner.py', '-d', 'python'], "Must specify both -d/--dependency and -v/--version when not using -m/--matrix-file"), + (['version_scanner.py', '-v', '3.7'], "Must specify both -d/--dependency and -v/--version when not using -m/--matrix-file"), + (['version_scanner.py'], "Must specify both -d/--dependency and -v/--version when not using -m/--matrix-file"), + ] +) +def test_main_cli_validation(capsys, args, expected_error_msg): + from version_scanner import main + with mock.patch('sys.argv', args): + with pytest.raises(SystemExit) as excinfo: + main() + assert excinfo.value.code == 2 + captured = capsys.readouterr() + assert expected_error_msg in captured.err + diff --git a/scripts/version_scanner/version_scanner.py b/scripts/version_scanner/version_scanner.py index 5b7d2196c3a2..fcd45ed63b54 100644 --- a/scripts/version_scanner/version_scanner.py +++ b/scripts/version_scanner/version_scanner.py @@ -65,7 +65,7 @@ def _safe_read_file( else: print(f"Warning: Permission denied reading {description}: {file_path}", file=sys.stderr) return None - except IOError as e: + except (IOError, ValueError) as e: if required: print(f"Error reading {description} {file_path}: {e}", file=sys.stderr) sys.exit(1) @@ -646,11 +646,14 @@ def parse_matrix_file(file_path: str) -> List[Tuple[str, str]]: if v is None or isinstance(v, (dict, list)): print(f"Error: Invalid version '{v}' for dependency '{dep}'", file=sys.stderr) sys.exit(1) - targets.append((str(dep), str(v))) - elif versions is not None and not isinstance(versions, dict): - targets.append((str(dep), str(versions))) + if not isinstance(v, str): + print(f"Error: Version '{v}' for dependency '{dep}' must be specified as a quoted string to prevent YAML parsing issues (e.g., 3.10 parsed as 3.1).", file=sys.stderr) + sys.exit(1) + targets.append((str(dep), v)) + elif isinstance(versions, str): + targets.append((str(dep), versions)) else: - print(f"Error: Invalid version '{versions}' for dependency '{dep}'", file=sys.stderr) + print(f"Error: Invalid version '{versions}' for dependency '{dep}'. Versions must be specified as quoted strings.", file=sys.stderr) sys.exit(1) return targets @@ -743,13 +746,13 @@ def main(): args = parser.parse_args() # Validation of required inputs - has_single_target = bool(args.dependency and args.version) has_matrix_file = bool(args.matrix_file) - - if not (has_single_target or has_matrix_file): - parser.error("Must specify either (-d/--dependency AND -v/--version) OR (-m/--matrix-file)") - if has_single_target and has_matrix_file: - parser.error("Cannot specify both single target (-d/-v) and matrix file (-m/--matrix-file)") + if has_matrix_file: + if args.dependency or args.version: + parser.error("Cannot specify -d/--dependency or -v/--version when using -m/--matrix-file") + else: + if not (args.dependency and args.version): + parser.error("Must specify both -d/--dependency and -v/--version when not using -m/--matrix-file") targets = [] if has_matrix_file: From 055975c54c2c2e0d841070514f981b7f8ffda7fb Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Tue, 23 Jun 2026 10:25:56 -0400 Subject: [PATCH 13/15] feat(version-scanner): support globbing and subpath patterns in ignore file --- .../tests/unit/test_version_scanner.py | 47 +++++++++++++++++++ scripts/version_scanner/version_scanner.py | 42 +++++++++++++++-- 2 files changed, 84 insertions(+), 5 deletions(-) diff --git a/scripts/version_scanner/tests/unit/test_version_scanner.py b/scripts/version_scanner/tests/unit/test_version_scanner.py index a151c1c55e11..16ed7e00bd9e 100644 --- a/scripts/version_scanner/tests/unit/test_version_scanner.py +++ b/scripts/version_scanner/tests/unit/test_version_scanner.py @@ -366,6 +366,53 @@ def test_scan_repository_ignores_version_scanner(tmp_path): assert len(results) == 0 +def test_scan_repository_wildcard_ignores(tmp_path): + # Create files + (tmp_path / "test.jpg").write_text("dummy version 3.7\n") + (tmp_path / "test.py").write_text("python_requires = '>=3.7'\n") + + rules = [ + {"name": "python_requires_check", "pattern": "python_requires\\s*=\\s*['\"]>=3\\.7['\"]"}, + {"name": "explicit_version_string", "pattern": "3\\.7"} + ] + + from version_scanner import scan_repository + # Without ignore + results = scan_repository(str(tmp_path), rules) + assert len(results) >= 2 + + # With wildcard ignore for *.jpg + results_ignored = scan_repository(str(tmp_path), rules, ignore_dirs=["*.jpg"]) + # test.jpg should be ignored completely + for match in results_ignored: + assert not match["file_path"].endswith("test.jpg") + + +def test__should_ignore(): + from version_scanner import _should_ignore + + ignore_patterns = [ + ".git", + "*.jpg", + "packages/pkg_a/.nox", + "*.egg-info" + ] + + # Exact match + assert _should_ignore(".git", ".git", ignore_patterns) is True + # Case insensitivity + assert _should_ignore(".GIT", ".GIT", ignore_patterns) is True + # Wildcard match + assert _should_ignore("some/path/image.jpg", "image.jpg", ignore_patterns) is True + assert _should_ignore("image.JPG", "image.JPG", ignore_patterns) is True + # Subpath match + assert _should_ignore("packages/pkg_a/.nox", ".nox", ignore_patterns) is True + # Wildcard directory match + assert _should_ignore("google_cloud_pubsub.egg-info", "google_cloud_pubsub.egg-info", ignore_patterns) is True + # Negative match + assert _should_ignore("setup.py", "setup.py", ignore_patterns) is False + + def test_load_ignore_file(tmp_path): from version_scanner import load_ignore_file diff --git a/scripts/version_scanner/version_scanner.py b/scripts/version_scanner/version_scanner.py index fcd45ed63b54..4cfafba9d6df 100644 --- a/scripts/version_scanner/version_scanner.py +++ b/scripts/version_scanner/version_scanner.py @@ -20,6 +20,7 @@ import argparse import csv import datetime +import fnmatch import os import re import sys @@ -498,6 +499,28 @@ def read_package_file(file_path: str) -> List[str]: return packages +def _should_ignore(rel_path: str, name: str, ignore_patterns: List[str]) -> bool: + """Check if a file or directory matches any of the ignore patterns.""" + if not ignore_patterns: + return False + name_lower = name.lower() + rel_path_norm = rel_path.replace(os.sep, '/').lower() + + for pattern in ignore_patterns: + pattern_lower = pattern.lower() + if '/' in pattern: + if pattern_lower.startswith('/'): + p = pattern_lower[1:] + else: + p = pattern_lower + if fnmatch.fnmatchcase(rel_path_norm, p) or fnmatch.fnmatchcase(rel_path_norm, f"*/{p}"): + return True + else: + if fnmatch.fnmatchcase(name_lower, pattern_lower): + return True + return False + + def scan_repository( root_path: str, rules: List[Dict[str, Any]], @@ -528,7 +551,6 @@ def scan_repository( Returns: A list of dictionaries detailing each match. """ - ignore_lower = {i.lower() for i in ignore_dirs} if ignore_dirs else set() results = [] filename_targets = [] @@ -557,13 +579,23 @@ def scan_repository( print(f"Filtering for packages: {target_packages}") for root, dirs, files in os.walk(root_path): + rel_root = os.path.relpath(root, root_path) + + # Helper to construct relative path for ignore matching + def get_rel_path(name): + return name if rel_root == "." else os.path.join(rel_root, name) + # Prune ignore directories (case-insensitive) - dirs[:] = [d for d in dirs if d.lower() not in ignore_lower] + dirs[:] = [ + d for d in dirs + if not _should_ignore(get_rel_path(d), d, ignore_dirs) + ] # Filter ignore files (case-insensitive) - files = [f for f in files if f.lower() not in ignore_lower] - - rel_root = os.path.relpath(root, root_path) + files = [ + f for f in files + if not _should_ignore(get_rel_path(f), f, ignore_dirs) + ] # Layout-agnostic generic subdirectory filtering if target_packages: From 8ca523c6efbb89c55315e6fa4fa8f3c75407f7e0 Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Wed, 24 Jun 2026 13:47:59 -0400 Subject: [PATCH 14/15] feat(version-scanner): fix root anchoring in ignores, parametrize tests, and update docs --- scripts/version_scanner/README.md | 28 +++++++++++-- .../tests/unit/test_version_scanner.py | 40 +++++++++---------- scripts/version_scanner/version_scanner.py | 20 ++++++++-- 3 files changed, 59 insertions(+), 29 deletions(-) diff --git a/scripts/version_scanner/README.md b/scripts/version_scanner/README.md index 1978e42f55cc..443470ed08ae 100644 --- a/scripts/version_scanner/README.md +++ b/scripts/version_scanner/README.md @@ -45,12 +45,32 @@ pip install -r scripts/version_scanner/requirements.txt The scanner uses a YAML configuration file (`regex_config.yaml`) to define rules and regex patterns. -## Ignoring Directories +## Matrix File Format -You can create a `.scannerignore` file in the directory you are scanning (usually the repo root) to list directories to skip, one per line. +When using `--matrix-file`, you must provide a YAML file specifying dependencies and versions. -## Known Issues & Future Investigations -- **Binary Ignores in `.scannerignore`**: Recursive wildcard ignores (e.g., `*.jpg`) currently do not effectively ignore deeply nested binary files. The scanner logic should be investigated to support robust globbing or full-path suffix matching. +### Example +```yaml +python: + - "3.10" + - "3.11" +protobuf: "4.25.8" +``` + +> [!IMPORTANT] +> **Versions must be specified as quoted strings** (e.g., `"3.10"`, not `3.10`). This prevents YAML parsers from converting them to floats (which would truncate `3.10` to `3.1`). + +## Ignoring Directories and Files + +In order to ignore files OR entire directories, you can add ignore patterns to the `.scannerignore` file located in the same directory as the script (`scripts/version_scanner/.scannerignore`). Ignore patterns should be added one per line. + +### Features +- **Case-insensitive**: All patterns are matched case-insensitively. +- **Globbing**: Supports standard shell globbing patterns (e.g., `*.jpg`, `test_*`). +- **Subpaths**: You can specify subpaths (e.g., `packages/pkg_a/.nox`). +- **Root Anchoring**: Patterns starting with a slash `/` are anchored to the root of the scan (e.g., `/packages` ignores the `packages` directory at root, but not `some/other/packages`). + +--- --- diff --git a/scripts/version_scanner/tests/unit/test_version_scanner.py b/scripts/version_scanner/tests/unit/test_version_scanner.py index 16ed7e00bd9e..faf6e7292da0 100644 --- a/scripts/version_scanner/tests/unit/test_version_scanner.py +++ b/scripts/version_scanner/tests/unit/test_version_scanner.py @@ -388,29 +388,25 @@ def test_scan_repository_wildcard_ignores(tmp_path): assert not match["file_path"].endswith("test.jpg") -def test__should_ignore(): - from version_scanner import _should_ignore - - ignore_patterns = [ - ".git", - "*.jpg", - "packages/pkg_a/.nox", - "*.egg-info" +DEFAULT_IGNORE_PATTERNS = [".git", "*.jpg", "packages/pkg_a/.nox", "*.egg-info"] + +@pytest.mark.parametrize( + "rel_path, name, ignore_patterns, expected", + [ + pytest.param(".git", ".git", DEFAULT_IGNORE_PATTERNS, True, id="exact_match"), + pytest.param(".GIT", ".GIT", DEFAULT_IGNORE_PATTERNS, True, id="case_insensitive_match"), + pytest.param("some/path/image.jpg", "image.jpg", DEFAULT_IGNORE_PATTERNS, True, id="wildcard_subpath_match"), + pytest.param("image.JPG", "image.JPG", DEFAULT_IGNORE_PATTERNS, True, id="wildcard_case_insensitive_match"), + pytest.param("packages/pkg_a/.nox", ".nox", DEFAULT_IGNORE_PATTERNS, True, id="subpath_exact_match"), + pytest.param("google_cloud_pubsub.egg-info", "google_cloud_pubsub.egg-info", DEFAULT_IGNORE_PATTERNS, True, id="wildcard_directory_match"), + pytest.param("setup.py", "setup.py", DEFAULT_IGNORE_PATTERNS, False, id="no_match"), + pytest.param("packages", "packages", ["/packages"], True, id="anchored_root_match"), + pytest.param("some/other/packages", "packages", ["/packages"], False, id="anchored_root_nested_no_match"), ] - - # Exact match - assert _should_ignore(".git", ".git", ignore_patterns) is True - # Case insensitivity - assert _should_ignore(".GIT", ".GIT", ignore_patterns) is True - # Wildcard match - assert _should_ignore("some/path/image.jpg", "image.jpg", ignore_patterns) is True - assert _should_ignore("image.JPG", "image.JPG", ignore_patterns) is True - # Subpath match - assert _should_ignore("packages/pkg_a/.nox", ".nox", ignore_patterns) is True - # Wildcard directory match - assert _should_ignore("google_cloud_pubsub.egg-info", "google_cloud_pubsub.egg-info", ignore_patterns) is True - # Negative match - assert _should_ignore("setup.py", "setup.py", ignore_patterns) is False +) +def test__should_ignore(rel_path, name, ignore_patterns, expected): + from version_scanner import _should_ignore + assert _should_ignore(rel_path, name, ignore_patterns) is expected def test_load_ignore_file(tmp_path): diff --git a/scripts/version_scanner/version_scanner.py b/scripts/version_scanner/version_scanner.py index 4cfafba9d6df..11b4a6107bff 100644 --- a/scripts/version_scanner/version_scanner.py +++ b/scripts/version_scanner/version_scanner.py @@ -500,7 +500,19 @@ def read_package_file(file_path: str) -> List[str]: def _should_ignore(rel_path: str, name: str, ignore_patterns: List[str]) -> bool: - """Check if a file or directory matches any of the ignore patterns.""" + """Check if a file or directory matches any of the ignore patterns. + + Directories and files can be ignored by providing an ignore pattern in the + .scannerignore file. + + Args: + rel_path: The relative path of the file or directory from the scan root. + name: The name of the file or directory (basename). + ignore_patterns: A list of ignore patterns (glob-like or subpaths). + + Returns: + True if the file or directory should be ignored, False otherwise. + """ if not ignore_patterns: return False name_lower = name.lower() @@ -511,10 +523,12 @@ def _should_ignore(rel_path: str, name: str, ignore_patterns: List[str]) -> bool if '/' in pattern: if pattern_lower.startswith('/'): p = pattern_lower[1:] + if fnmatch.fnmatchcase(rel_path_norm, p): + return True else: p = pattern_lower - if fnmatch.fnmatchcase(rel_path_norm, p) or fnmatch.fnmatchcase(rel_path_norm, f"*/{p}"): - return True + if fnmatch.fnmatchcase(rel_path_norm, p) or fnmatch.fnmatchcase(rel_path_norm, f"*/{p}"): + return True else: if fnmatch.fnmatchcase(name_lower, pattern_lower): return True From d2d2d6c906f3027c49bbc3bd5ee4f21404579f71 Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Wed, 24 Jun 2026 14:52:33 -0400 Subject: [PATCH 15/15] perf(version-scanner): optimize ignore logic and exclude caches/noise --- scripts/version_scanner/.scannerignore | 11 ++++ .../tests/unit/test_version_scanner.py | 5 +- scripts/version_scanner/version_scanner.py | 60 +++++++++++++------ 3 files changed, 56 insertions(+), 20 deletions(-) diff --git a/scripts/version_scanner/.scannerignore b/scripts/version_scanner/.scannerignore index e200930894d0..aa7406addb41 100644 --- a/scripts/version_scanner/.scannerignore +++ b/scripts/version_scanner/.scannerignore @@ -20,3 +20,14 @@ repositories.bzl *.png *.gif *.ico +*.pdf + +# Ignore caches and temporary directories +.ruff_cache +.pytest_cache +.mypy_cache +.coverage +.htmlcov + +# Ignore data files +*.csv diff --git a/scripts/version_scanner/tests/unit/test_version_scanner.py b/scripts/version_scanner/tests/unit/test_version_scanner.py index faf6e7292da0..9937da213497 100644 --- a/scripts/version_scanner/tests/unit/test_version_scanner.py +++ b/scripts/version_scanner/tests/unit/test_version_scanner.py @@ -405,8 +405,9 @@ def test_scan_repository_wildcard_ignores(tmp_path): ] ) def test__should_ignore(rel_path, name, ignore_patterns, expected): - from version_scanner import _should_ignore - assert _should_ignore(rel_path, name, ignore_patterns) is expected + from version_scanner import _should_ignore, _preprocess_ignore_patterns + preprocessed = _preprocess_ignore_patterns(ignore_patterns) + assert _should_ignore(rel_path, name, preprocessed) is expected def test_load_ignore_file(tmp_path): diff --git a/scripts/version_scanner/version_scanner.py b/scripts/version_scanner/version_scanner.py index 11b4a6107bff..86bad23c3e28 100644 --- a/scripts/version_scanner/version_scanner.py +++ b/scripts/version_scanner/version_scanner.py @@ -499,7 +499,32 @@ def read_package_file(file_path: str) -> List[str]: return packages -def _should_ignore(rel_path: str, name: str, ignore_patterns: List[str]) -> bool: +def _preprocess_ignore_patterns(ignore_patterns: List[str]) -> List[Tuple[str, str]]: + """Preprocesses ignore patterns into a classified list for faster matching. + + Args: + ignore_patterns: A list of raw ignore patterns from .scannerignore. + + Returns: + A list of tuples (type, pattern) where type is 'anchored', 'subpath', or 'filename'. + """ + if not ignore_patterns: + return [] + + preprocessed = [] + for pattern in ignore_patterns: + pattern_lower = pattern.lower() + if '/' in pattern: + if pattern_lower.startswith('/'): + preprocessed.append(('anchored', pattern_lower[1:])) + else: + preprocessed.append(('subpath', pattern_lower)) + else: + preprocessed.append(('filename', pattern_lower)) + return preprocessed + + +def _should_ignore(rel_path: str, name: str, preprocessed_patterns: List[Tuple[str, str]]) -> bool: """Check if a file or directory matches any of the ignore patterns. Directories and files can be ignored by providing an ignore pattern in the @@ -508,29 +533,25 @@ def _should_ignore(rel_path: str, name: str, ignore_patterns: List[str]) -> bool Args: rel_path: The relative path of the file or directory from the scan root. name: The name of the file or directory (basename). - ignore_patterns: A list of ignore patterns (glob-like or subpaths). + preprocessed_patterns: A list of preprocessed ignore patterns. Returns: True if the file or directory should be ignored, False otherwise. """ - if not ignore_patterns: + if not preprocessed_patterns: return False name_lower = name.lower() rel_path_norm = rel_path.replace(os.sep, '/').lower() - for pattern in ignore_patterns: - pattern_lower = pattern.lower() - if '/' in pattern: - if pattern_lower.startswith('/'): - p = pattern_lower[1:] - if fnmatch.fnmatchcase(rel_path_norm, p): - return True - else: - p = pattern_lower - if fnmatch.fnmatchcase(rel_path_norm, p) or fnmatch.fnmatchcase(rel_path_norm, f"*/{p}"): - return True - else: - if fnmatch.fnmatchcase(name_lower, pattern_lower): + for p_type, p_val in preprocessed_patterns: + if p_type == 'anchored': + if fnmatch.fnmatchcase(rel_path_norm, p_val): + return True + elif p_type == 'subpath': + if fnmatch.fnmatchcase(rel_path_norm, p_val) or fnmatch.fnmatchcase(rel_path_norm, f"*/{p_val}"): + return True + elif p_type == 'filename': + if fnmatch.fnmatchcase(name_lower, p_val): return True return False @@ -588,6 +609,9 @@ def scan_repository( print(f"Error compiling regex for rule {rule['name']}: {e}", file=sys.stderr) continue + # Preprocess ignore patterns once + preprocessed_ignores = _preprocess_ignore_patterns(ignore_dirs) + print(f"\nScanning repository: {root_path}") if target_packages: print(f"Filtering for packages: {target_packages}") @@ -602,13 +626,13 @@ def get_rel_path(name): # Prune ignore directories (case-insensitive) dirs[:] = [ d for d in dirs - if not _should_ignore(get_rel_path(d), d, ignore_dirs) + if not _should_ignore(get_rel_path(d), d, preprocessed_ignores) ] # Filter ignore files (case-insensitive) files = [ f for f in files - if not _should_ignore(get_rel_path(f), f, ignore_dirs) + if not _should_ignore(get_rel_path(f), f, preprocessed_ignores) ] # Layout-agnostic generic subdirectory filtering