# Patch summary (free-text preamble; everything from the first "diff --git"
# header onward is the original patch text, unchanged).
#
# Purpose: memoize the rule-to-target planning done by plan_core_run so that a
# second call with the same TargetManager and an equivalent rule list reuses
# the previously built tasks instead of calling
# target_manager.get_files_for_rule again.
#
# cli/src/semgrep/core_runner.py
#   * Adds imports: weakref.WeakKeyDictionary and attr.frozen.
#   * Adds type aliases RuleTargetingSignature and PlanBundleCacheKey, and a
#     @frozen class PlanBundle holding `tasks` and `unused_rule_nums` tuples.
#   * Adds module-level _PLAN_BUNDLE_CACHE: a WeakKeyDictionary keyed by the
#     TargetManager instance; each value is a dict keyed by PlanBundleCacheKey.
#   * _rule_targeting_signature(rule) -> (languages as str, includes, excludes,
#     rule.id, rule.product.to_json_string()).
#   * _plan_bundle_cache_key(rules, bypass) -> (bypass flag, tuple of the
#     per-rule signatures, in rule order).
#   * plan_core_run: looks up the bundle by that key; only when it is missing
#     does it run the get_files_for_rule loop, build a PlanBundle and store it.
#     Unused rules are now recorded as indices and mapped back through
#     rules[rule_num]; all_targets is filled from bundle.tasks via
#     Path(task.path); Plan receives list(bundle.tasks).
#
# cli/src/semgrep/run_scan.py
#   * Passes bypass_includes_excludes_for_files through as an extra keyword
#     argument to a call inside run_rules (the callee is outside this hunk).
#
# cli/tests/default/unit/test_core_runner.py (new file)
#   * test_plan_core_run_reuses_bundle_from_memory: two identical
#     plan_core_run calls yield equal target_mappings with exactly one
#     get_files_for_rule call.
#   * test_plan_core_run_cache_key_includes_explicit_file_bypass: the same
#     TargetManager gives num_targets 1 with bypass=True and 0 with
#     bypass=False.
#   * test_run_rules_reuses_pre_scan_bundle_for_interfile_scan: run_rules with
#     a patched semgrep.scan_report.print_scan_status and a fake core runner
#     results in exactly one get_files_for_rule call.
#
# NOTE(review): the line structure of this patch is flattened (many diff lines
#   per physical line, indentation lost); line breaks must be restored before
#   it can be applied.
# NOTE(review): `from weakref import ...` is inserted between the pathlib and
#   typing imports; the visible imports look alphabetically ordered by module,
#   so it presumably belongs after the typing imports - confirm against the
#   repo's import-order tooling.
# NOTE(review): WeakKeyDictionary needs TargetManager instances to be hashable
#   and weak-referenceable; TargetManager's definition is not visible here -
#   confirm.
# NOTE(review): the cache key assumes get_files_for_rule depends only on the
#   five signature fields plus the bypass flag for a given TargetManager -
#   confirm no other TargetManager state can change between calls.
# NOTE(review): the all_targets update presumably runs on cache hits as well as
#   misses; indentation is not recoverable from this text - confirm.
#
diff --git a/cli/src/semgrep/core_runner.py b/cli/src/semgrep/core_runner.py index a1b9bd9a0..7fc19894d 100644 --- a/cli/src/semgrep/core_runner.py +++ b/cli/src/semgrep/core_runner.py @@ -7,6 +7,7 @@ import tempfile from datetime import datetime from pathlib import Path +from weakref import WeakKeyDictionary from typing import Any from typing import Callable from typing import cast @@ -19,6 +20,7 @@ from typing import Tuple from attr import evolve +from attr import frozen from rich.progress import BarColumn from rich.progress import Progress from rich.progress import TaskID @@ -79,6 +81,47 @@ import resource +RuleTargetingSignature = Tuple[ + Tuple[str, ...], + Tuple[str, ...], + Tuple[str, ...], + str, + str, +] +PlanBundleCacheKey = Tuple[bool, Tuple[RuleTargetingSignature, ...]] + + +@frozen +class PlanBundle: + tasks: Tuple[Task, ...] + unused_rule_nums: Tuple[int, ...] + + +# Reuse the bundled rule-to-target plan inside a single CLI invocation. This is +# especially useful when the scan status path warms the bundle before the actual +# engine run, because the interfile scan can then reuse it entirely from memory. +_PLAN_BUNDLE_CACHE: "WeakKeyDictionary[TargetManager, Dict[PlanBundleCacheKey, PlanBundle]]" = WeakKeyDictionary() + + +def _rule_targeting_signature(rule: Rule) -> RuleTargetingSignature: + return ( + tuple(str(language) for language in rule.languages), + tuple(rule.includes), + tuple(rule.excludes), + rule.id, + rule.product.to_json_string(), + ) + + +def _plan_bundle_cache_key( + rules: Sequence[Rule], bypass_includes_excludes_for_files: bool +) -> PlanBundleCacheKey: + return ( + bypass_includes_excludes_for_files, + tuple(_rule_targeting_signature(rule) for rule in rules), + ) + + def setrlimits_preexec_fn() -> None: """ Sets stack limit of current running process to the maximum possible @@ -731,46 +774,68 @@ def plan_core_run( Note: this is a list because a target can appear twice (e.g. 
Java + Generic) """ - # The range of target_info is (index into rules x product as json) - # Using product as JSON because we want structural equality of products instead of object equality. - target_info: Dict[ - Tuple[Path, Language], Tuple[List[int], Set[str]] - ] = collections.defaultdict(lambda: (list(), set())) - - unused_rules = [] - - for rule_num, rule in enumerate(rules): - any_target = False - for language in rule.languages: - targets = list( - target_manager.get_files_for_rule( - language, rule.includes, rule.excludes, rule.id, rule.product, - bypass_includes_excludes_for_files=bypass_includes_excludes_for_files, + cache_key = _plan_bundle_cache_key( + rules, bypass_includes_excludes_for_files + ) + bundle_cache = _PLAN_BUNDLE_CACHE.setdefault(target_manager, {}) + bundle = bundle_cache.get(cache_key) + + if bundle is None: + # The range of target_info is (index into rules x product as json) + # Using product as JSON because we want structural equality of products instead of object equality. 
+ target_info: Dict[ + Tuple[Path, Language], Tuple[List[int], Set[str]] + ] = collections.defaultdict(lambda: (list(), set())) + + unused_rule_nums: List[int] = [] + + for rule_num, rule in enumerate(rules): + any_target = False + for language in rule.languages: + targets = list( + target_manager.get_files_for_rule( + language, + rule.includes, + rule.excludes, + rule.id, + rule.product, + bypass_includes_excludes_for_files=bypass_includes_excludes_for_files, + ) ) - ) - any_target = any_target or len(targets) > 0 + any_target = any_target or len(targets) > 0 + + for target in targets: + rules_nums, products = target_info[target, language] + rules_nums.append(rule_num) + products.add(rule.product.to_json_string()) + + if not any_target: + unused_rule_nums.append(rule_num) + + bundle = PlanBundle( + tasks=tuple( + Task( + path=target, + analyzer=language, + products=tuple( + out.Product.from_json_string(x) for x in products + ), + # tuple conversion makes rule_nums hashable, so usable as cache key + rule_nums=tuple(rule_nums), + ) + for ((target, language), (rule_nums, products)) in target_info.items() + ), + unused_rule_nums=tuple(unused_rule_nums), + ) + bundle_cache[cache_key] = bundle - for target in targets: - if all_targets is not None: - all_targets.add(target) - rules_nums, products = target_info[target, language] - rules_nums.append(rule_num) - products.add(rule.product.to_json_string()) + if all_targets is not None: + all_targets.update(Path(task.path) for task in bundle.tasks) - if not any_target: - unused_rules.append(rule) + unused_rules = [rules[rule_num] for rule_num in bundle.unused_rule_nums] return Plan( - [ - Task( - path=target, - analyzer=language, - products=tuple(out.Product.from_json_string(x) for x in products), - # tuple conversion makes rule_nums hashable, so usable as cache key - rule_nums=tuple(rule_nums), - ) - for ((target, language), (rule_nums, products)) in target_info.items() - ], + list(bundle.tasks), rules, product=product, 
sca_subprojects=sca_subprojects, diff --git a/cli/src/semgrep/run_scan.py b/cli/src/semgrep/run_scan.py index 1b2171e68..25f44ee06 100644 --- a/cli/src/semgrep/run_scan.py +++ b/cli/src/semgrep/run_scan.py @@ -352,6 +352,7 @@ def run_rules( cli_ux=cli_ux, with_code_rules=with_code_rules, with_supply_chain=with_supply_chain, + bypass_includes_excludes_for_files=bypass_includes_excludes_for_files, ) # Dispatching to semgrep-core! diff --git a/cli/tests/default/unit/test_core_runner.py b/cli/tests/default/unit/test_core_runner.py new file mode 100644 index 000000000..89bec27b7 --- /dev/null +++ b/cli/tests/default/unit/test_core_runner.py @@ -0,0 +1,168 @@ +from types import SimpleNamespace + +import pytest + +from semgrep.core_runner import CoreRunner +from semgrep.engine import EngineType +from semgrep.rule import Rule +from semgrep.run_scan import run_rules +from semgrep.target_manager import SAST_PRODUCT +from semgrep.target_manager import TargetManager +from semgrep.target_mode import TargetModeConfig + + +def make_rule() -> Rule: + return Rule( + { + "id": "test.rule", + "pattern": "sink(...)", + "languages": ["python"], + "message": "test", + "severity": "INFO", + } + ) + + +@pytest.mark.quick +def test_plan_core_run_reuses_bundle_from_memory(tmp_path, monkeypatch): + (tmp_path / "app.py").write_text("print('hello')\n") + target_manager = TargetManager([str(tmp_path)]) + rules = [make_rule()] + + original_get_files_for_rule = TargetManager.get_files_for_rule + get_files_for_rule_calls = 0 + + def counting_get_files_for_rule(self, *args, **kwargs): + nonlocal get_files_for_rule_calls + get_files_for_rule_calls += 1 + return original_get_files_for_rule(self, *args, **kwargs) + + monkeypatch.setattr(TargetManager, "get_files_for_rule", counting_get_files_for_rule) + + first_plan = CoreRunner.plan_core_run( + rules, + target_manager, + sca_subprojects={}, + ) + second_plan = CoreRunner.plan_core_run( + rules, + target_manager, + sca_subprojects={}, + ) + + assert 
first_plan.target_mappings == second_plan.target_mappings + assert get_files_for_rule_calls == 1 + + +@pytest.mark.quick +def test_plan_core_run_cache_key_includes_explicit_file_bypass(tmp_path): + app_file = tmp_path / "app.py" + app_file.write_text("print('hello')\n") + target_manager = TargetManager( + [str(app_file)], + excludes={SAST_PRODUCT: ["app.py"]}, + ) + rules = [make_rule()] + + plan_with_bypass = CoreRunner.plan_core_run( + rules, + target_manager, + sca_subprojects={}, + bypass_includes_excludes_for_files=True, + ) + plan_without_bypass = CoreRunner.plan_core_run( + rules, + target_manager, + sca_subprojects={}, + bypass_includes_excludes_for_files=False, + ) + + assert plan_with_bypass.num_targets == 1 + assert plan_without_bypass.num_targets == 0 + + +@pytest.mark.quick +def test_run_rules_reuses_pre_scan_bundle_for_interfile_scan(tmp_path, monkeypatch): + (tmp_path / "app.py").write_text("print('hello')\n") + target_manager = TargetManager([str(tmp_path)]) + rules = [make_rule()] + + original_get_files_for_rule = TargetManager.get_files_for_rule + get_files_for_rule_calls = 0 + + def counting_get_files_for_rule(self, *args, **kwargs): + nonlocal get_files_for_rule_calls + get_files_for_rule_calls += 1 + return original_get_files_for_rule(self, *args, **kwargs) + + monkeypatch.setattr(TargetManager, "get_files_for_rule", counting_get_files_for_rule) + + def fake_print_scan_status( + rules, + target_manager, + target_mode_config, + sca_subprojects, + dependency_parser_errors, + *, + bypass_includes_excludes_for_files=True, + **_, + ): + del target_mode_config, dependency_parser_errors + return [ + CoreRunner.plan_core_run( + list(rules), + target_manager, + sca_subprojects=sca_subprojects, + bypass_includes_excludes_for_files=bypass_includes_excludes_for_files, + ) + ] + + monkeypatch.setattr( + "semgrep.scan_report.print_scan_status", + fake_print_scan_status, + ) + + class FakeCoreRunner: + def invoke_semgrep_core( + self, + target_manager, + 
rules, + _dump_command_for_core, + _time_flag, + _matching_explanations, + _engine_type, + _strict, + _run_secrets, + _disable_secrets_validation, + target_mode_config, + resolved_subprojects, + *, + bypass_includes_excludes_for_files=True, + **_, + ): + assert not target_mode_config.is_pro_diff_scan + plan = CoreRunner.plan_core_run( + list(rules), + target_manager, + sca_subprojects=resolved_subprojects, + bypass_includes_excludes_for_files=bypass_includes_excludes_for_files, + ) + assert plan.num_targets == 1 + return {}, [], SimpleNamespace() + + run_rules( + rules, + target_manager, + FakeCoreRunner(), + SimpleNamespace(handle_semgrep_errors=lambda errors: None), + dump_command_for_core=False, + time_flag=False, + matching_explanations=False, + engine_type=EngineType.PRO_INTERFILE, + strict=False, + run_secrets=False, + disable_secrets_validation=False, + target_mode_config=TargetModeConfig.whole_scan(), + ) + + assert get_files_for_rule_calls == 1