Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 13 additions & 8 deletions cli/src/semgrep/core_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -793,6 +793,7 @@ def _run_rules_direct_to_semgrep_core_helper(
sca_subprojects: Dict[out.Ecosystem, List[ResolvedSubproject]],
opengrep_ignore_pattern: Optional[str],
bypass_includes_excludes_for_files: bool = True,
precomputed_plan: Optional[Plan] = None,
inline_metavariables: bool = False,
max_match_per_file: Optional[int] = None,
allow_rule_timeout_control: bool = DEFAULT_ALLOW_RULE_TIMEOUT_CONTROL,
Expand Down Expand Up @@ -900,18 +901,18 @@ def _run_rules_direct_to_semgrep_core_helper(
# commits as "diff targets". To compile a comprehensive list of all input files
# for `plan`, the `baseline_handler` is disabled within the `target_manager`
# when executing `plan_core_run`.
plan = self.plan_core_run(
rules,
evolve(target_manager, baseline_handler=None),
all_targets=all_targets,
sca_subprojects=sca_subprojects,
bypass_includes_excludes_for_files=bypass_includes_excludes_for_files
)
plan_target_manager = evolve(target_manager, baseline_handler=None)

else:
plan_target_manager = target_manager

if precomputed_plan is not None:
plan = precomputed_plan
all_targets.update(Path(task.path) for task in plan.target_mappings)
else:
plan = self.plan_core_run(
rules,
target_manager,
plan_target_manager,
all_targets=all_targets,
sca_subprojects=sca_subprojects,
bypass_includes_excludes_for_files=bypass_includes_excludes_for_files
Expand Down Expand Up @@ -1134,6 +1135,7 @@ def _run_rules_direct_to_semgrep_core(
sca_subprojects: Dict[out.Ecosystem, List[ResolvedSubproject]],
opengrep_ignore_pattern: Optional[str] = None,
bypass_includes_excludes_for_files: bool = True,
precomputed_plan: Optional[Plan] = None,
inline_metavariables: bool = False,
max_match_per_file: Optional[int] = None,
allow_rule_timeout_control: bool = DEFAULT_ALLOW_RULE_TIMEOUT_CONTROL,
Expand Down Expand Up @@ -1165,6 +1167,7 @@ def _run_rules_direct_to_semgrep_core(
sca_subprojects,
opengrep_ignore_pattern=opengrep_ignore_pattern,
bypass_includes_excludes_for_files=bypass_includes_excludes_for_files,
precomputed_plan=precomputed_plan,
inline_metavariables=inline_metavariables,
max_match_per_file=max_match_per_file,
allow_rule_timeout_control=allow_rule_timeout_control,
Expand Down Expand Up @@ -1214,6 +1217,7 @@ def invoke_semgrep_core(
sca_subprojects: Dict[out.Ecosystem, List[ResolvedSubproject]],
opengrep_ignore_pattern: Optional[str] = None,
bypass_includes_excludes_for_files: bool = True,
precomputed_plan: Optional[Plan] = None,
inline_metavariables: bool = False,
max_match_per_file: Optional[int] = None,
allow_rule_timeout_control: bool = DEFAULT_ALLOW_RULE_TIMEOUT_CONTROL,
Expand Down Expand Up @@ -1245,6 +1249,7 @@ def invoke_semgrep_core(
sca_subprojects,
opengrep_ignore_pattern=opengrep_ignore_pattern,
bypass_includes_excludes_for_files=bypass_includes_excludes_for_files,
precomputed_plan=precomputed_plan,
inline_metavariables = inline_metavariables,
max_match_per_file=max_match_per_file,
allow_rule_timeout_control=allow_rule_timeout_control,
Expand Down
40 changes: 40 additions & 0 deletions cli/src/semgrep/run_scan.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,39 @@ def filter_dependency_aware_rules(
return filtered_rules


def get_reusable_interfile_scan_plan(
    engine_type: EngineType,
    target_mode_config: TargetModeConfig,
    rules_for_core: Sequence[Rule],
    scan_plans: Sequence[Plan],
) -> Optional[Plan]:
    """
    Return the already-built pre-scan plan when the interfile engine can reuse it.

    The first entry of ``scan_plans`` is treated as the SAST plan. It is
    returned only when its rule ids match, element for element and in the
    same order, the ids of ``rules_for_core``.

    This keeps the optimization narrow and reversible:
    - only interfile scans use it
    - differential and historical scans keep their existing plan-building path
    - any rule-set mismatch falls back to rebuilding the execution plan

    Returns ``None`` whenever the plan must be rebuilt by the caller.
    """
    # Guard clauses, checked in the same order as a single short-circuit `or`.
    if not engine_type.is_interfile:
        return None
    if target_mode_config.is_pro_diff_scan or target_mode_config.is_historical_scan:
        return None
    if not scan_plans:
        return None

    sast_plan = scan_plans[0]

    # The comparison is deliberately order-sensitive: a reordered rule list is
    # treated as a mismatch rather than risking a plan that refers to rules by
    # position (presumably what task `rule_nums` do — confirm against Plan).
    planned_rule_ids = [rule.id for rule in sast_plan.rules]
    requested_rule_ids = [rule.id for rule in rules_for_core]
    if planned_rule_ids != requested_rule_ids:
        return None

    return sast_plan


# This runs semgrep-core (and also handles SCA and join rules)
@tracing.trace()
def run_rules(
Expand Down Expand Up @@ -353,6 +386,12 @@ def run_rules(
with_code_rules=with_code_rules,
with_supply_chain=with_supply_chain,
)
reusable_interfile_plan = get_reusable_interfile_scan_plan(
engine_type,
target_mode_config,
rest_of_the_rules,
plans,
)

# Dispatching to semgrep-core!
(
Expand All @@ -373,6 +412,7 @@ def run_rules(
resolved_subprojects,
opengrep_ignore_pattern=opengrep_ignore_pattern,
bypass_includes_excludes_for_files=bypass_includes_excludes_for_files,
precomputed_plan=reusable_interfile_plan,
inline_metavariables=inline_metavariables,
max_match_per_file=max_match_per_file,
allow_rule_timeout_control=allow_rule_timeout_control,
Expand Down
169 changes: 169 additions & 0 deletions cli/tests/default/unit/test_interfile_scan_plan.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
from pathlib import Path

import pytest

import semgrep.semgrep_interfaces.semgrep_output_v1 as out
from semgrep.core_runner import CoreRunner
from semgrep.core_targets_plan import Plan
from semgrep.core_targets_plan import Task
from semgrep.engine import EngineType
from semgrep.rule import Rule
from semgrep.run_scan import get_reusable_interfile_scan_plan
from semgrep.run_scan import run_rules
from semgrep.target_mode import TargetModeConfig


def make_rule(rule_id: str) -> Rule:
    """Build a minimal Python ``print(...)`` rule carrying the given id."""
    return Rule(
        {
            "id": rule_id,
            "languages": ["python"],
            "message": "test message",
            "severity": "INFO",
            "pattern": "print(...)",
        }
    )


def make_sast_plan(rule: Rule, path: str = "foo.py") -> Plan:
    """
    Build a SAST plan containing one task for ``path`` and one rule.

    The single task uses the rule's first language as its analyzer and
    refers to the rule through ``rule_nums=(0,)``.
    """
    return Plan(
        [
            Task(
                path=path,
                analyzer=rule.languages[0],
                products=(out.Product(out.SAST()),),
                rule_nums=(0,),
            )
        ],
        [rule],
        product=out.Product(out.SAST()),
        sca_subprojects={},
    )


@pytest.mark.quick
def test_get_reusable_interfile_scan_plan_returns_matching_sast_plan() -> None:
    """A whole interfile scan with identical rule ids reuses the very same plan object."""
    rule = make_rule("rule.matching")
    sast_plan = make_sast_plan(rule)

    reusable_plan = get_reusable_interfile_scan_plan(
        EngineType.PRO_INTERFILE,
        TargetModeConfig.whole_scan(),
        [rule],
        [sast_plan],
    )

    # Identity, not equality: the plan must be handed back, not rebuilt.
    assert reusable_plan is sast_plan


@pytest.mark.quick
def test_get_reusable_interfile_scan_plan_rejects_diff_scans_and_rule_mismatches() -> None:
    """Pro diff scans and rule-id mismatches both yield ``None`` (no plan reuse)."""
    matching_rule = make_rule("rule.matching")
    other_rule = make_rule("rule.other")
    sast_plan = make_sast_plan(matching_rule)

    # Same rules as the plan, but the scan is a pro diff scan.
    diff_plan = get_reusable_interfile_scan_plan(
        EngineType.PRO_INTERFILE,
        TargetModeConfig.pro_diff_scan(frozenset({Path("foo.py")}), diff_depth=2),
        [matching_rule],
        [sast_plan],
    )
    # Whole scan, but the rule to execute differs from the planned rule.
    mismatch_plan = get_reusable_interfile_scan_plan(
        EngineType.PRO_INTERFILE,
        TargetModeConfig.whole_scan(),
        [other_rule],
        [sast_plan],
    )

    assert diff_plan is None
    assert mismatch_plan is None


@pytest.mark.quick
def test_run_rules_passes_reusable_interfile_plan_to_core_runner(mocker) -> None:
    """``run_rules`` forwards the pre-scan SAST plan as ``precomputed_plan``."""
    rule = make_rule("rule.matching")
    sast_plan = make_sast_plan(rule)
    sca_plan = Plan([], [], product=out.Product(out.SCA()), sca_subprojects={})
    mock_output_extra = mocker.Mock()
    mock_output_extra.all_targets = set()

    # Stand-in core runner returning an empty result triple.
    core_runner = mocker.Mock()
    core_runner.invoke_semgrep_core.return_value = ({}, [], mock_output_extra)
    output_handler = mocker.Mock()
    target_manager = mocker.Mock()

    # The pre-scan status printer is what hands the plans to run_rules here;
    # the SAST plan is placed first, followed by the SCA plan.
    mocker.patch(
        "semgrep.run_scan.scan_report.print_scan_status",
        return_value=[sast_plan, sca_plan],
    )

    run_rules(
        [rule],
        target_manager,
        core_runner,
        output_handler,
        dump_command_for_core=False,
        time_flag=False,
        matching_explanations=False,
        engine_type=EngineType.PRO_INTERFILE,
        strict=False,
        target_mode_config=TargetModeConfig.whole_scan(),
    )

    assert (
        core_runner.invoke_semgrep_core.call_args.kwargs["precomputed_plan"]
        is sast_plan
    )


@pytest.mark.quick
def test_core_runner_uses_precomputed_plan_without_replanning(mocker, tmp_path) -> None:
    """A supplied ``precomputed_plan`` must prevent any call to ``plan_core_run``."""
    rule = make_rule("rule.matching")
    precomputed_plan = make_sast_plan(rule)

    # Minimal fake global state for the attributes configured below.
    mock_state = mocker.Mock()
    mock_state.env.user_data_folder = tmp_path
    mock_state.get_cli_ux_flavor.return_value = object()
    mock_state.metrics = mocker.Mock()
    mock_state.terminal.is_debug = False

    mocker.patch("semgrep.core_runner.get_state", return_value=mock_state)
    mocker.patch(
        "semgrep.engine.EngineType.get_binary_path",
        return_value=Path("/tmp/opengrep-core"),
    )
    # Any attempt to re-plan fails the test immediately.
    mocker.patch.object(
        CoreRunner,
        "plan_core_run",
        side_effect=AssertionError("plan_core_run should not be called"),
    )

    runner = CoreRunner(
        jobs=1,
        engine_type=EngineType.OSS,
        timeout=1,
        max_memory=0,
        timeout_threshold=0,
        interfile_timeout=0,
        capture_stderr=False,
        optimizations="none",
        allow_untrusted_validators=False,
    )

    # With dump_command_for_core=True the helper is expected to exit instead
    # of returning; the exit code is checked below.
    with pytest.raises(SystemExit) as excinfo:
        runner._run_rules_direct_to_semgrep_core_helper(
            [rule],
            mocker.Mock(),
            dump_command_for_core=True,
            time_flag=False,
            matching_explanations=False,
            engine=EngineType.OSS,
            strict=False,
            run_secrets=False,
            disable_secrets_validation=False,
            target_mode_config=TargetModeConfig.whole_scan(),
            sca_subprojects={},
            opengrep_ignore_pattern=None,
            precomputed_plan=precomputed_plan,
        )

    assert excinfo.value.code == 0
Loading