Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
135 changes: 100 additions & 35 deletions cli/src/semgrep/core_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import tempfile
from datetime import datetime
from pathlib import Path
from weakref import WeakKeyDictionary
from typing import Any
from typing import Callable
from typing import cast
Expand All @@ -19,6 +20,7 @@
from typing import Tuple

from attr import evolve
from attr import frozen
from rich.progress import BarColumn
from rich.progress import Progress
from rich.progress import TaskID
Expand Down Expand Up @@ -79,6 +81,47 @@
import resource


# The per-rule values handed to target selection, in the order produced by
# _rule_targeting_signature.
RuleTargetingSignature = Tuple[
    Tuple[str, ...],  # languages, stringified
    Tuple[str, ...],  # include patterns
    Tuple[str, ...],  # exclude patterns
    str,  # rule id
    str,  # product, serialized as a JSON string
]
# The bypass-includes/excludes flag, followed by one signature per rule in rule order.
PlanBundleCacheKey = Tuple[bool, Tuple[RuleTargetingSignature, ...]]


@frozen
class PlanBundle:
    """
    Immutable snapshot of a rule-to-target plan, stored so it can be reused
    from memory instead of being recomputed.
    """

    # One task per (target path, language) pair, carrying the rule indices and
    # products that apply to that pair.
    tasks: Tuple[Task, ...]
    # Indices into the rules sequence of the rules that matched no target in
    # any of their languages.
    unused_rule_nums: Tuple[int, ...]


# Reuse the bundled rule-to-target plan inside a single CLI invocation. This is
# especially useful when the scan status path warms the bundle before the actual
# engine run, because the interfile scan can then reuse it entirely from memory.
#
# Outer key: the TargetManager, referenced weakly so that its cached bundles are
# dropped once the manager itself is garbage-collected.
# Inner key: the PlanBundleCacheKey built from the rules and the
# bypass_includes_excludes_for_files flag.
# NOTE(review): the key does not capture changes on disk or changes to the
# TargetManager's own configuration; presumably neither happens within one
# invocation -- confirm.
_PLAN_BUNDLE_CACHE: "WeakKeyDictionary[TargetManager, Dict[PlanBundleCacheKey, PlanBundle]]" = WeakKeyDictionary()


def _rule_targeting_signature(rule: Rule) -> RuleTargetingSignature:
    """
    Build a hashable summary of the rule fields used when selecting targets.

    The result holds, in order: the rule's languages as strings, its include
    patterns, its exclude patterns, its id, and its product as a JSON string.
    """
    languages = tuple(map(str, rule.languages))
    includes = tuple(rule.includes)
    excludes = tuple(rule.excludes)
    rule_id = rule.id
    # JSON form gives structural rather than object equality for the product.
    product_json = rule.product.to_json_string()
    return (languages, includes, excludes, rule_id, product_json)


def _plan_bundle_cache_key(
    rules: Sequence[Rule], bypass_includes_excludes_for_files: bool
) -> PlanBundleCacheKey:
    """
    Build the cache key for a plan bundle.

    The key pairs the bypass flag with the targeting signature of every rule,
    kept in rule order, so a different flag or a different rule list yields a
    different key.
    """
    signatures = tuple(map(_rule_targeting_signature, rules))
    return (bypass_includes_excludes_for_files, signatures)


def setrlimits_preexec_fn() -> None:
"""
Sets stack limit of current running process to the maximum possible
Expand Down Expand Up @@ -731,46 +774,68 @@ def plan_core_run(

Note: this is a list because a target can appear twice (e.g. Java + Generic)
"""
# The range of target_info is (index into rules x product as json)
# Using product as JSON because we want structural equality of products instead of object equality.
target_info: Dict[
Tuple[Path, Language], Tuple[List[int], Set[str]]
] = collections.defaultdict(lambda: (list(), set()))

unused_rules = []

for rule_num, rule in enumerate(rules):
any_target = False
for language in rule.languages:
targets = list(
target_manager.get_files_for_rule(
language, rule.includes, rule.excludes, rule.id, rule.product,
bypass_includes_excludes_for_files=bypass_includes_excludes_for_files,
cache_key = _plan_bundle_cache_key(
rules, bypass_includes_excludes_for_files
)
bundle_cache = _PLAN_BUNDLE_CACHE.setdefault(target_manager, {})
bundle = bundle_cache.get(cache_key)

if bundle is None:
# The range of target_info is (index into rules x product as json)
# Using product as JSON because we want structural equality of products instead of object equality.
target_info: Dict[
Tuple[Path, Language], Tuple[List[int], Set[str]]
] = collections.defaultdict(lambda: (list(), set()))

unused_rule_nums: List[int] = []

for rule_num, rule in enumerate(rules):
any_target = False
for language in rule.languages:
targets = list(
target_manager.get_files_for_rule(
language,
rule.includes,
rule.excludes,
rule.id,
rule.product,
bypass_includes_excludes_for_files=bypass_includes_excludes_for_files,
)
)
)
any_target = any_target or len(targets) > 0
any_target = any_target or len(targets) > 0

for target in targets:
rules_nums, products = target_info[target, language]
rules_nums.append(rule_num)
products.add(rule.product.to_json_string())

if not any_target:
unused_rule_nums.append(rule_num)

bundle = PlanBundle(
tasks=tuple(
Task(
path=target,
analyzer=language,
products=tuple(
out.Product.from_json_string(x) for x in products
),
# tuple conversion makes rule_nums hashable, so usable as cache key
rule_nums=tuple(rule_nums),
)
for ((target, language), (rule_nums, products)) in target_info.items()
),
unused_rule_nums=tuple(unused_rule_nums),
)
bundle_cache[cache_key] = bundle

for target in targets:
if all_targets is not None:
all_targets.add(target)
rules_nums, products = target_info[target, language]
rules_nums.append(rule_num)
products.add(rule.product.to_json_string())
if all_targets is not None:
all_targets.update(Path(task.path) for task in bundle.tasks)

if not any_target:
unused_rules.append(rule)
unused_rules = [rules[rule_num] for rule_num in bundle.unused_rule_nums]

return Plan(
[
Task(
path=target,
analyzer=language,
products=tuple(out.Product.from_json_string(x) for x in products),
# tuple conversion makes rule_nums hashable, so usable as cache key
rule_nums=tuple(rule_nums),
)
for ((target, language), (rule_nums, products)) in target_info.items()
],
list(bundle.tasks),
rules,
product=product,
sca_subprojects=sca_subprojects,
Expand Down
1 change: 1 addition & 0 deletions cli/src/semgrep/run_scan.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,7 @@ def run_rules(
cli_ux=cli_ux,
with_code_rules=with_code_rules,
with_supply_chain=with_supply_chain,
bypass_includes_excludes_for_files=bypass_includes_excludes_for_files,
)

# Dispatching to semgrep-core!
Expand Down
168 changes: 168 additions & 0 deletions cli/tests/default/unit/test_core_runner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
from types import SimpleNamespace

import pytest

from semgrep.core_runner import CoreRunner
from semgrep.engine import EngineType
from semgrep.rule import Rule
from semgrep.run_scan import run_rules
from semgrep.target_manager import SAST_PRODUCT
from semgrep.target_manager import TargetManager
from semgrep.target_mode import TargetModeConfig


def make_rule() -> Rule:
    """Return a minimal single-language (python) rule for planning tests."""
    raw_rule = {
        "id": "test.rule",
        "pattern": "sink(...)",
        "languages": ["python"],
        "message": "test",
        "severity": "INFO",
    }
    return Rule(raw_rule)


@pytest.mark.quick
def test_plan_core_run_reuses_bundle_from_memory(tmp_path, monkeypatch):
    """
    Planning twice with the same rules and target manager must hit
    get_files_for_rule only once and produce equal target mappings.
    """
    (tmp_path / "app.py").write_text("print('hello')\n")
    target_manager = TargetManager([str(tmp_path)])
    rules = [make_rule()]

    real_get_files_for_rule = TargetManager.get_files_for_rule
    # One entry is appended per call to the wrapped method.
    recorded_calls = []

    def recording_get_files_for_rule(self, *args, **kwargs):
        recorded_calls.append(None)
        return real_get_files_for_rule(self, *args, **kwargs)

    monkeypatch.setattr(
        TargetManager, "get_files_for_rule", recording_get_files_for_rule
    )

    first_plan, second_plan = (
        CoreRunner.plan_core_run(rules, target_manager, sca_subprojects={}),
        CoreRunner.plan_core_run(rules, target_manager, sca_subprojects={}),
    )

    assert first_plan.target_mappings == second_plan.target_mappings
    assert len(recorded_calls) == 1


@pytest.mark.quick
def test_plan_core_run_cache_key_includes_explicit_file_bypass(tmp_path):
    """
    The same rules and target manager must yield different plans when only
    bypass_includes_excludes_for_files differs.
    """
    explicit_target = tmp_path / "app.py"
    explicit_target.write_text("print('hello')\n")
    target_manager = TargetManager(
        [str(explicit_target)],
        excludes={SAST_PRODUCT: ["app.py"]},
    )
    rules = [make_rule()]

    def plan_with(bypass):
        return CoreRunner.plan_core_run(
            rules,
            target_manager,
            sca_subprojects={},
            bypass_includes_excludes_for_files=bypass,
        )

    bypassing_plan = plan_with(True)
    filtering_plan = plan_with(False)

    assert bypassing_plan.num_targets == 1
    assert filtering_plan.num_targets == 0


@pytest.mark.quick
def test_run_rules_reuses_pre_scan_bundle_for_interfile_scan(tmp_path, monkeypatch):
    """
    Drive run_rules with a fake scan-status printer and a fake core runner that
    each plan the run, and check that get_files_for_rule is hit only once.
    """
    (tmp_path / "app.py").write_text("print('hello')\n")
    target_manager = TargetManager([str(tmp_path)])
    rules = [make_rule()]

    real_get_files_for_rule = TargetManager.get_files_for_rule
    # One entry is appended per call to the wrapped method.
    recorded_calls = []

    def recording_get_files_for_rule(self, *args, **kwargs):
        recorded_calls.append(None)
        return real_get_files_for_rule(self, *args, **kwargs)

    monkeypatch.setattr(
        TargetManager, "get_files_for_rule", recording_get_files_for_rule
    )

    def fake_print_scan_status(
        rules,
        target_manager,
        target_mode_config,
        sca_subprojects,
        dependency_parser_errors,
        *,
        bypass_includes_excludes_for_files=True,
        **_,
    ):
        # Stand-in for the status step: it only plans the run.
        del target_mode_config, dependency_parser_errors
        status_plan = CoreRunner.plan_core_run(
            list(rules),
            target_manager,
            sca_subprojects=sca_subprojects,
            bypass_includes_excludes_for_files=bypass_includes_excludes_for_files,
        )
        return [status_plan]

    monkeypatch.setattr(
        "semgrep.scan_report.print_scan_status",
        fake_print_scan_status,
    )

    class FakeCoreRunner:
        def invoke_semgrep_core(
            self,
            target_manager,
            rules,
            _dump_command_for_core,
            _time_flag,
            _matching_explanations,
            _engine_type,
            _strict,
            _run_secrets,
            _disable_secrets_validation,
            target_mode_config,
            resolved_subprojects,
            *,
            bypass_includes_excludes_for_files=True,
            **_,
        ):
            # Stand-in for the engine run: it plans again with the same inputs.
            assert not target_mode_config.is_pro_diff_scan
            engine_plan = CoreRunner.plan_core_run(
                list(rules),
                target_manager,
                sca_subprojects=resolved_subprojects,
                bypass_includes_excludes_for_files=bypass_includes_excludes_for_files,
            )
            assert engine_plan.num_targets == 1
            return {}, [], SimpleNamespace()

    output_handler = SimpleNamespace(handle_semgrep_errors=lambda errors: None)

    run_rules(
        rules,
        target_manager,
        FakeCoreRunner(),
        output_handler,
        dump_command_for_core=False,
        time_flag=False,
        matching_explanations=False,
        engine_type=EngineType.PRO_INTERFILE,
        strict=False,
        run_secrets=False,
        disable_secrets_validation=False,
        target_mode_config=TargetModeConfig.whole_scan(),
    )

    assert len(recorded_calls) == 1
Loading