
Commit 3c15ebc

reimport: prep for Pro overrides

1 parent edc34bb · commit 3c15ebc

2 files changed: 56 additions & 62 deletions

dojo/importers/base_importer.py

Lines changed: 1 addition & 0 deletions

@@ -271,6 +271,7 @@ def determine_process_method(
     def determine_deduplication_algorithm(self) -> str:
         """
         Determines what dedupe algorithm to use for the Test being processed.
+        Overridden in Pro.
         :return: A string representing the dedupe algorithm to use.
         """
         return self.test.deduplication_algorithm
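
The only change above is the docstring note, but it marks determine_deduplication_algorithm as an intentional override seam. A minimal sketch of what a downstream (Pro) subclass might look like; the subclass name and the product-level fallback are illustrative assumptions, not part of this commit:

    # Hypothetical subclass -- name and fallback logic are assumptions for illustration.
    class ProBaseImporter(BaseImporter):
        def determine_deduplication_algorithm(self) -> str:
            # A Pro edition might prefer a product-level setting when one exists...
            product = self.test.engagement.product
            if (algo := getattr(product, "deduplication_algorithm", None)):
                return algo
            # ...and otherwise keep the open-source default behavior.
            return super().determine_deduplication_algorithm()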

dojo/importers/default_reimporter.py

Lines changed: 55 additions & 62 deletions

@@ -170,6 +170,8 @@ def get_reimport_match_candidates_for_batch(
         can override candidate retrieval without copying the full `process_findings()`
         implementation.

+        Is overridden in Pro.
+
         Returns:
             (candidates_by_hash, candidates_by_uid, candidates_by_key)

@@ -201,6 +203,51 @@ def get_reimport_match_candidates_for_batch(

         return candidates_by_hash, candidates_by_uid, candidates_by_key

+    def add_new_finding_to_candidates(
+        self,
+        finding: Finding,
+        candidates_by_hash: dict,
+        candidates_by_uid: dict,
+        candidates_by_key: dict,
+    ) -> None:
+        """
+        Add a newly created finding to candidate dictionaries for subsequent findings in the same batch.
+
+        This allows duplicates within the same scan report to be detected even when they're processed
+        in the same batch. When a new finding is created (no match found), it is added to the candidate
+        dictionaries so that subsequent findings in the same batch can match against it.
+
+        This is intentionally a separate method so downstream editions (e.g. Dojo Pro)
+        can override candidate addition logic without copying the full `process_findings()`
+        implementation.
+
+        Args:
+            finding: The newly created finding to add to candidates
+            candidates_by_hash: Dictionary mapping hash_code to list of findings (modified in-place)
+            candidates_by_uid: Dictionary mapping unique_id_from_tool to list of findings (modified in-place)
+            candidates_by_key: Dictionary mapping (title_lower, severity) to list of findings (modified in-place)
+
+        """
+        if not finding:
+            return
+
+        if finding.hash_code:
+            candidates_by_hash.setdefault(finding.hash_code, []).append(finding)
+            deduplicationLogger.debug(
+                f"Added finding {finding.id} (hash_code: {finding.hash_code}) to candidates for next findings in this report",
+            )
+        if finding.unique_id_from_tool:
+            candidates_by_uid.setdefault(finding.unique_id_from_tool, []).append(finding)
+            deduplicationLogger.debug(
+                f"Added finding {finding.id} (unique_id_from_tool: {finding.unique_id_from_tool}) to candidates for next findings in this report",
+            )
+        if finding.title:
+            legacy_key = (finding.title.lower(), finding.severity)
+            candidates_by_key.setdefault(legacy_key, []).append(finding)
+            deduplicationLogger.debug(
+                f"Added finding {finding.id} (title: {finding.title}, severity: {finding.severity}) to candidates for next findings in this report",
+            )
+
     def process_findings(
         self,
         parsed_findings: list[Finding],
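
The three candidate dictionaries are plain dicts keyed by hash_code, unique_id_from_tool, and (title.lower(), severity) respectively, so the within-batch matching above reduces to a setdefault(...).append(...) on insert and a get(...) on lookup. A self-contained sketch of that pattern, using a stand-in dataclass rather than DefectDojo's Finding model:

    from dataclasses import dataclass

    @dataclass
    class StubFinding:
        # Minimal stand-in for dojo.models.Finding, just to exercise the dict pattern.
        id: int
        title: str
        severity: str
        hash_code: str | None = None

    candidates_by_hash: dict[str, list[StubFinding]] = {}

    first = StubFinding(id=1, title="SQL Injection", severity="High", hash_code="abc123")
    # A newly created finding is registered so later findings in the batch can see it.
    candidates_by_hash.setdefault(first.hash_code, []).append(first)

    later = StubFinding(id=2, title="SQL Injection", severity="High", hash_code="abc123")
    # A later finding with the same hash now matches in memory, with no extra DB query.
    assert candidates_by_hash.get(later.hash_code, [])[0].id == 1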
@@ -293,7 +340,7 @@ def process_findings(
             is_final = is_final_batch and idx == len(batch_findings) - 1

             # Match any findings to this new one coming in using pre-fetched candidates
-            matched_findings = self.match_finding_for_reimport(
+            matched_findings = self.match_finding_to_candidate_reimport(
                 unsaved_finding,
                 candidates_by_hash=candidates_by_hash,
                 candidates_by_uid=candidates_by_uid,
@@ -325,23 +372,12 @@ def process_findings(
             finding = self.process_finding_that_was_not_matched(unsaved_finding)

             # Add newly created finding to candidates for subsequent findings in this batch
-            if finding:
-                if finding.hash_code:
-                    candidates_by_hash.setdefault(finding.hash_code, []).append(finding)
-                    deduplicationLogger.debug(
-                        f"Added finding {finding.id} (hash_code: {finding.hash_code}) to candidates for next findings in this report",
-                    )
-                if finding.unique_id_from_tool:
-                    candidates_by_uid.setdefault(finding.unique_id_from_tool, []).append(finding)
-                    deduplicationLogger.debug(
-                        f"Added finding {finding.id} (unique_id_from_tool: {finding.unique_id_from_tool}) to candidates for next findings in this report",
-                    )
-                if finding.title:
-                    legacy_key = (finding.title.lower(), finding.severity)
-                    candidates_by_key.setdefault(legacy_key, []).append(finding)
-                    deduplicationLogger.debug(
-                        f"Added finding {finding.id} (title: {finding.title}, severity: {finding.severity}) to candidates for next findings in this report",
-                    )
+            self.add_new_finding_to_candidates(
+                finding,
+                candidates_by_hash,
+                candidates_by_uid,
+                candidates_by_key,
+            )

             # This condition __appears__ to always be true, but am afraid to remove it
             if finding:
@@ -483,50 +519,7 @@ def parse_findings_dynamic_test_type(
         logger.debug("REIMPORT_SCAN parser v2: Create parse findings")
         return super().parse_findings_dynamic_test_type(scan, parser)

-    def match_new_finding_to_existing_finding(
-        self,
-        unsaved_finding: Finding,
-    ) -> list[Finding]:
-        """Matches a single new finding to N existing findings and then returns those matches"""
-        # This code should match the logic used for deduplication out of the re-import feature.
-        # See utils.py deduplicate_* functions
-        deduplicationLogger.debug("return findings bases on algorithm: %s", self.deduplication_algorithm)
-        if self.deduplication_algorithm == "hash_code":
-            return Finding.objects.filter(
-                test=self.test,
-                hash_code=unsaved_finding.hash_code,
-            ).exclude(hash_code=None).order_by("id")
-        if self.deduplication_algorithm == "unique_id_from_tool":
-            deduplicationLogger.debug(f"unique_id_from_tool: {unsaved_finding.unique_id_from_tool}")
-            return Finding.objects.filter(
-                test=self.test,
-                unique_id_from_tool=unsaved_finding.unique_id_from_tool,
-            ).exclude(unique_id_from_tool=None).order_by("id")
-        if self.deduplication_algorithm == "unique_id_from_tool_or_hash_code":
-            deduplicationLogger.debug(f"unique_id_from_tool: {unsaved_finding.unique_id_from_tool}")
-            deduplicationLogger.debug(f"hash_code: {unsaved_finding.hash_code}")
-            query = Finding.objects.filter(
-                Q(test=self.test),
-                (Q(hash_code__isnull=False) & Q(hash_code=unsaved_finding.hash_code))
-                | (Q(unique_id_from_tool__isnull=False) & Q(unique_id_from_tool=unsaved_finding.unique_id_from_tool)),
-            ).order_by("id")
-            deduplicationLogger.debug(query.query)
-            return query
-        if self.deduplication_algorithm == "legacy":
-            # This is the legacy reimport behavior. Although it's pretty flawed and doesn't match the legacy algorithm for deduplication,
-            # this is left as is for simplicity.
-            # Re-writing the legacy deduplication here would be complicated and counter-productive.
-            # If you have use cases going through this section, you're advised to create a deduplication configuration for your parser
-            logger.warning("Legacy reimport. In case of issue, you're advised to create a deduplication configuration in order not to go through this section")
-            return Finding.objects.filter(
-                title__iexact=unsaved_finding.title,
-                test=self.test,
-                severity=unsaved_finding.severity,
-                numerical_severity=Finding.get_numerical_severity(unsaved_finding.severity)).order_by("id")
-        logger.error(f'Internal error: unexpected deduplication_algorithm: "{self.deduplication_algorithm}"')
-        return None
-
-    def match_finding_for_reimport(
+    def match_finding_to_candidate_reimport(
         self,
         unsaved_finding: Finding,
         candidates_by_hash: dict | None = None,
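
The per-finding queryset method match_new_finding_to_existing_finding (one database query per incoming finding, switched on the dedupe algorithm) is deleted, and match_finding_for_reimport is renamed to match_finding_to_candidate_reimport, whose signature shows it matching against the pre-fetched candidate dictionaries instead. The renamed method's body is not part of this diff, so the following is only a hypothetical sketch of a dictionary-backed lookup in that shape, not the commit's actual implementation:

    # Hypothetical sketch -- the real method body is not shown in this diff.
    def match_finding_to_candidate_reimport(
        self,
        unsaved_finding: Finding,
        candidates_by_hash: dict | None = None,
        candidates_by_uid: dict | None = None,
        candidates_by_key: dict | None = None,
    ) -> list[Finding]:
        if self.deduplication_algorithm == "hash_code" and unsaved_finding.hash_code:
            return (candidates_by_hash or {}).get(unsaved_finding.hash_code, [])
        if self.deduplication_algorithm == "unique_id_from_tool" and unsaved_finding.unique_id_from_tool:
            return (candidates_by_uid or {}).get(unsaved_finding.unique_id_from_tool, [])
        # Legacy fallback, keyed the same way candidates_by_key is built above.
        return (candidates_by_key or {}).get(
            (unsaved_finding.title.lower(), unsaved_finding.severity), [])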
