From a4d807b72cdd69348977cdafd8a499520fc88ab2 Mon Sep 17 00:00:00 2001
From: Michael Fowlie
Date: Fri, 10 Apr 2026 12:38:45 +1000
Subject: [PATCH] fix: reuse pre-scan plan for interfile scans

---
Reviewer notes (placed after the three-dash separator, so they are not part
of the commit message):

* Formatting. This patch had been flattened onto a few physical lines. The
  line structure below was rebuilt from the hunk headers and the diffstat;
  all added/removed lines and their counts match those headers. The
  indentation depth of unchanged context lines and the position of blank
  context lines are reconstructed, not copied. Prefer regenerating the patch
  with `git format-patch` from commit a4d807b7; otherwise apply with
  `git apply --ignore-whitespace --recount`.

* core_runner.py. `precomputed_plan: Optional[Plan] = None` is added to
  `_run_rules_direct_to_semgrep_core_helper`,
  `_run_rules_direct_to_semgrep_core` and `invoke_semgrep_core`, and is
  forwarded unchanged between them. In the helper, the branch that used to
  call `plan_core_run` with `baseline_handler=None` now only selects
  `plan_target_manager`. `plan_core_run` runs only when no precomputed plan
  is supplied; when one is supplied, `Path(task.path)` for every task in
  `plan.target_mappings` is added to `all_targets` instead.

* run_scan.py. `get_reusable_interfile_scan_plan` returns `scan_plans[0]`
  only when the engine is interfile, the scan is neither a pro diff scan nor
  a historical scan, `scan_plans` is non-empty, and the tuple of rule ids in
  that plan equals the tuple of rule ids in `rules_for_core`. The comparison
  is order-sensitive. In every other case it returns None.

* NOTE(review): the helper takes the first entry of `scan_plans` to be the
  SAST plan; nothing in this patch checks the plan's product. Confirm the
  ordering returned by `print_scan_status`.

* NOTE(review): no import hunks are included, so `Plan` and `Path` in
  core_runner.py and `Plan`, `Sequence`, `EngineType` in run_scan.py are
  presumably already imported there. Confirm before merging.

* Tests. The new test module covers: a matching plan is returned by
  identity; diff scans and rule-id mismatches return None; `run_rules`
  passes the plan as `precomputed_plan`; the helper does not call
  `plan_core_run` when a plan is supplied. The non-interfile and
  historical-scan guards have no test in this patch.

 cli/src/semgrep/core_runner.py                |  21 ++-
 cli/src/semgrep/run_scan.py                   |  40 +++++
 .../default/unit/test_interfile_scan_plan.py  | 169 ++++++++++++++++++
 3 files changed, 222 insertions(+), 8 deletions(-)
 create mode 100644 cli/tests/default/unit/test_interfile_scan_plan.py

diff --git a/cli/src/semgrep/core_runner.py b/cli/src/semgrep/core_runner.py
index a1b9bd9a0..12fb561c8 100644
--- a/cli/src/semgrep/core_runner.py
+++ b/cli/src/semgrep/core_runner.py
@@ -793,6 +793,7 @@ def _run_rules_direct_to_semgrep_core_helper(
         sca_subprojects: Dict[out.Ecosystem, List[ResolvedSubproject]],
         opengrep_ignore_pattern: Optional[str],
         bypass_includes_excludes_for_files: bool = True,
+        precomputed_plan: Optional[Plan] = None,
         inline_metavariables: bool = False,
         max_match_per_file: Optional[int] = None,
         allow_rule_timeout_control: bool = DEFAULT_ALLOW_RULE_TIMEOUT_CONTROL,
@@ -900,18 +901,18 @@ def _run_rules_direct_to_semgrep_core_helper(
                 # commits as "diff targets". To compile a comprehensive list of all input files
                 # for `plan`, the `baseline_handler` is disabled within the `target_manager`
                 # when executing `plan_core_run`.
-                plan = self.plan_core_run(
-                    rules,
-                    evolve(target_manager, baseline_handler=None),
-                    all_targets=all_targets,
-                    sca_subprojects=sca_subprojects,
-                    bypass_includes_excludes_for_files=bypass_includes_excludes_for_files
-                )
+                plan_target_manager = evolve(target_manager, baseline_handler=None)
+
+            else:
+                plan_target_manager = target_manager
+            if precomputed_plan is not None:
+                plan = precomputed_plan
+                all_targets.update(Path(task.path) for task in plan.target_mappings)
 
             else:
                 plan = self.plan_core_run(
                     rules,
-                    target_manager,
+                    plan_target_manager,
                     all_targets=all_targets,
                     sca_subprojects=sca_subprojects,
                     bypass_includes_excludes_for_files=bypass_includes_excludes_for_files
@@ -1134,6 +1135,7 @@ def _run_rules_direct_to_semgrep_core(
         sca_subprojects: Dict[out.Ecosystem, List[ResolvedSubproject]],
         opengrep_ignore_pattern: Optional[str] = None,
         bypass_includes_excludes_for_files: bool = True,
+        precomputed_plan: Optional[Plan] = None,
         inline_metavariables: bool = False,
         max_match_per_file: Optional[int] = None,
         allow_rule_timeout_control: bool = DEFAULT_ALLOW_RULE_TIMEOUT_CONTROL,
@@ -1165,6 +1167,7 @@ def _run_rules_direct_to_semgrep_core(
                 sca_subprojects,
                 opengrep_ignore_pattern=opengrep_ignore_pattern,
                 bypass_includes_excludes_for_files=bypass_includes_excludes_for_files,
+                precomputed_plan=precomputed_plan,
                 inline_metavariables=inline_metavariables,
                 max_match_per_file=max_match_per_file,
                 allow_rule_timeout_control=allow_rule_timeout_control,
@@ -1214,6 +1217,7 @@ def invoke_semgrep_core(
         sca_subprojects: Dict[out.Ecosystem, List[ResolvedSubproject]],
         opengrep_ignore_pattern: Optional[str] = None,
         bypass_includes_excludes_for_files: bool = True,
+        precomputed_plan: Optional[Plan] = None,
         inline_metavariables: bool = False,
         max_match_per_file: Optional[int] = None,
         allow_rule_timeout_control: bool = DEFAULT_ALLOW_RULE_TIMEOUT_CONTROL,
@@ -1245,6 +1249,7 @@ def invoke_semgrep_core(
             sca_subprojects,
             opengrep_ignore_pattern=opengrep_ignore_pattern,
             bypass_includes_excludes_for_files=bypass_includes_excludes_for_files,
+            precomputed_plan=precomputed_plan,
             inline_metavariables = inline_metavariables,
             max_match_per_file=max_match_per_file,
             allow_rule_timeout_control=allow_rule_timeout_control,
diff --git a/cli/src/semgrep/run_scan.py b/cli/src/semgrep/run_scan.py
index 1b2171e68..088f96ec2 100644
--- a/cli/src/semgrep/run_scan.py
+++ b/cli/src/semgrep/run_scan.py
@@ -243,6 +243,39 @@ def filter_dependency_aware_rules(
     return filtered_rules
 
 
+def get_reusable_interfile_scan_plan(
+    engine_type: EngineType,
+    target_mode_config: TargetModeConfig,
+    rules_for_core: Sequence[Rule],
+    scan_plans: Sequence[Plan],
+) -> Optional[Plan]:
+    """
+    Reuse the already-built pre-scan SAST plan when it exactly matches the
+    rule set that will be executed by the interfile engine.
+
+    This keeps the optimization narrow and reversible:
+    - only interfile scans use it
+    - differential and historical scans keep their existing plan-building path
+    - any rule-set mismatch falls back to rebuilding the execution plan
+    """
+    if (
+        not engine_type.is_interfile
+        or target_mode_config.is_pro_diff_scan
+        or target_mode_config.is_historical_scan
+        or not scan_plans
+    ):
+        return None
+
+    sast_plan = scan_plans[0]
+    pre_scan_rule_ids = tuple(rule.id for rule in sast_plan.rules)
+    core_rule_ids = tuple(rule.id for rule in rules_for_core)
+
+    if pre_scan_rule_ids != core_rule_ids:
+        return None
+
+    return sast_plan
+
+
 # This runs semgrep-core (and also handles SCA and join rules)
 @tracing.trace()
 def run_rules(
@@ -353,6 +386,12 @@ def run_rules(
         with_code_rules=with_code_rules,
         with_supply_chain=with_supply_chain,
     )
+    reusable_interfile_plan = get_reusable_interfile_scan_plan(
+        engine_type,
+        target_mode_config,
+        rest_of_the_rules,
+        plans,
+    )
 
     # Dispatching to semgrep-core!
     (
@@ -373,6 +412,7 @@ def run_rules(
         resolved_subprojects,
         opengrep_ignore_pattern=opengrep_ignore_pattern,
         bypass_includes_excludes_for_files=bypass_includes_excludes_for_files,
+        precomputed_plan=reusable_interfile_plan,
         inline_metavariables=inline_metavariables,
         max_match_per_file=max_match_per_file,
         allow_rule_timeout_control=allow_rule_timeout_control,
diff --git a/cli/tests/default/unit/test_interfile_scan_plan.py b/cli/tests/default/unit/test_interfile_scan_plan.py
new file mode 100644
index 000000000..e0b105022
--- /dev/null
+++ b/cli/tests/default/unit/test_interfile_scan_plan.py
@@ -0,0 +1,169 @@
+from pathlib import Path
+
+import pytest
+
+import semgrep.semgrep_interfaces.semgrep_output_v1 as out
+from semgrep.core_runner import CoreRunner
+from semgrep.core_targets_plan import Plan
+from semgrep.core_targets_plan import Task
+from semgrep.engine import EngineType
+from semgrep.rule import Rule
+from semgrep.run_scan import get_reusable_interfile_scan_plan
+from semgrep.run_scan import run_rules
+from semgrep.target_mode import TargetModeConfig
+
+
+def make_rule(rule_id: str) -> Rule:
+    return Rule(
+        {
+            "id": rule_id,
+            "languages": ["python"],
+            "message": "test message",
+            "severity": "INFO",
+            "pattern": "print(...)",
+        }
+    )
+
+
+def make_sast_plan(rule: Rule, path: str = "foo.py") -> Plan:
+    return Plan(
+        [
+            Task(
+                path=path,
+                analyzer=rule.languages[0],
+                products=(out.Product(out.SAST()),),
+                rule_nums=(0,),
+            )
+        ],
+        [rule],
+        product=out.Product(out.SAST()),
+        sca_subprojects={},
+    )
+
+
+@pytest.mark.quick
+def test_get_reusable_interfile_scan_plan_returns_matching_sast_plan() -> None:
+    rule = make_rule("rule.matching")
+    sast_plan = make_sast_plan(rule)
+
+    reusable_plan = get_reusable_interfile_scan_plan(
+        EngineType.PRO_INTERFILE,
+        TargetModeConfig.whole_scan(),
+        [rule],
+        [sast_plan],
+    )
+
+    assert reusable_plan is sast_plan
+
+
+@pytest.mark.quick
+def test_get_reusable_interfile_scan_plan_rejects_diff_scans_and_rule_mismatches() -> None:
+    matching_rule = make_rule("rule.matching")
+    other_rule = make_rule("rule.other")
+    sast_plan = make_sast_plan(matching_rule)
+
+    diff_plan = get_reusable_interfile_scan_plan(
+        EngineType.PRO_INTERFILE,
+        TargetModeConfig.pro_diff_scan(frozenset({Path("foo.py")}), diff_depth=2),
+        [matching_rule],
+        [sast_plan],
+    )
+    mismatch_plan = get_reusable_interfile_scan_plan(
+        EngineType.PRO_INTERFILE,
+        TargetModeConfig.whole_scan(),
+        [other_rule],
+        [sast_plan],
+    )
+
+    assert diff_plan is None
+    assert mismatch_plan is None
+
+
+@pytest.mark.quick
+def test_run_rules_passes_reusable_interfile_plan_to_core_runner(mocker) -> None:
+    rule = make_rule("rule.matching")
+    sast_plan = make_sast_plan(rule)
+    sca_plan = Plan([], [], product=out.Product(out.SCA()), sca_subprojects={})
+    mock_output_extra = mocker.Mock()
+    mock_output_extra.all_targets = set()
+
+    core_runner = mocker.Mock()
+    core_runner.invoke_semgrep_core.return_value = ({}, [], mock_output_extra)
+    output_handler = mocker.Mock()
+    target_manager = mocker.Mock()
+
+    mocker.patch(
+        "semgrep.run_scan.scan_report.print_scan_status",
+        return_value=[sast_plan, sca_plan],
+    )
+
+    run_rules(
+        [rule],
+        target_manager,
+        core_runner,
+        output_handler,
+        dump_command_for_core=False,
+        time_flag=False,
+        matching_explanations=False,
+        engine_type=EngineType.PRO_INTERFILE,
+        strict=False,
+        target_mode_config=TargetModeConfig.whole_scan(),
+    )
+
+    assert (
+        core_runner.invoke_semgrep_core.call_args.kwargs["precomputed_plan"]
+        is sast_plan
+    )
+
+
+@pytest.mark.quick
+def test_core_runner_uses_precomputed_plan_without_replanning(mocker, tmp_path) -> None:
+    rule = make_rule("rule.matching")
+    precomputed_plan = make_sast_plan(rule)
+    mock_state = mocker.Mock()
+    mock_state.env.user_data_folder = tmp_path
+    mock_state.get_cli_ux_flavor.return_value = object()
+    mock_state.metrics = mocker.Mock()
+    mock_state.terminal.is_debug = False
+
+    mocker.patch("semgrep.core_runner.get_state", return_value=mock_state)
+    mocker.patch(
+        "semgrep.engine.EngineType.get_binary_path",
+        return_value=Path("/tmp/opengrep-core"),
+    )
+    mocker.patch.object(
+        CoreRunner,
+        "plan_core_run",
+        side_effect=AssertionError("plan_core_run should not be called"),
+    )
+
+    runner = CoreRunner(
+        jobs=1,
+        engine_type=EngineType.OSS,
+        timeout=1,
+        max_memory=0,
+        timeout_threshold=0,
+        interfile_timeout=0,
+        capture_stderr=False,
+        optimizations="none",
+        allow_untrusted_validators=False,
+    )
+
+    with pytest.raises(SystemExit) as excinfo:
+        runner._run_rules_direct_to_semgrep_core_helper(
+            [rule],
+            mocker.Mock(),
+            dump_command_for_core=True,
+            time_flag=False,
+            matching_explanations=False,
+            engine=EngineType.OSS,
+            strict=False,
+            run_secrets=False,
+            disable_secrets_validation=False,
+            target_mode_config=TargetModeConfig.whole_scan(),
+            sca_subprojects={},
+            opengrep_ignore_pattern=None,
+            precomputed_plan=precomputed_plan,
+        )
+
+    assert excinfo.value.code == 0