From 69549dfcb7f8ff11bcf730922be03720aa33d24e Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Fri, 12 Dec 2025 17:03:47 +0100 Subject: [PATCH 01/26] fix logger NoneType --- dojo/models.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dojo/models.py b/dojo/models.py index 282a8c4d667..e547940dd0e 100644 --- a/dojo/models.py +++ b/dojo/models.py @@ -3505,11 +3505,12 @@ def set_hash_code(self, dedupe_option): # Finding.save is called once from serializers.py with dedupe_option=False because the finding is not ready yet, for example the endpoints are not built # It is then called a second time with dedupe_option defaulted to true; now we can compute the hash_code and run the deduplication elif dedupe_option: + finding_id = self.id if self.id is not None else "unsaved" if self.hash_code is not None: - deduplicationLogger.debug("Hash_code already computed for finding %i", self.id) + deduplicationLogger.debug("Hash_code already computed for finding: %s", finding_id) else: self.hash_code = self.compute_hash_code() - deduplicationLogger.debug("Hash_code computed for finding %i: %s", self.id, self.hash_code) + deduplicationLogger.debug("Hash_code computed for finding: %s: %s", finding_id, self.hash_code) class FindingAdmin(admin.ModelAdmin): From 0ab7a50fd93b192a59f798913401f6ca672c67a8 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sat, 13 Dec 2025 12:18:11 +0100 Subject: [PATCH 02/26] reimport: use batch matching --- dojo/finding/deduplication.py | 175 ++++++++++-- dojo/importers/default_reimporter.py | 287 ++++++++++++++----- dojo/management/commands/list_top_tests.py | 111 ++++++++ dojo/settings/settings.dist.py | 3 + unittests/test_import_reimport.py | 317 ++++++++++++++++++++- unittests/test_importers_performance.py | 130 ++++++--- 6 files changed, 872 insertions(+), 151 deletions(-) create mode 100644 dojo/management/commands/list_top_tests.py diff --git a/dojo/finding/deduplication.py b/dojo/finding/deduplication.py index 7f334236dbf..b51c791c199 100644 --- a/dojo/finding/deduplication.py +++ b/dojo/finding/deduplication.py @@ -225,59 +225,122 @@ def are_endpoints_duplicates(new_finding, to_duplicate_finding): return False -def build_dedupe_scope_queryset(test): - scope_on_engagement = test.engagement.deduplication_on_engagement - if scope_on_engagement: - scope_q = Q(test__engagement=test.engagement) - else: - # Product scope limited to current product, but exclude engagements that opted into engagement-scoped dedupe - scope_q = Q(test__engagement__product=test.engagement.product) & ( - Q(test__engagement=test.engagement) - | Q(test__engagement__deduplication_on_engagement=False) - ) +def build_candidate_scope_queryset(test, mode="deduplication", service=None): + """ + Build a queryset for candidate finding. + + Args: + test: The test to scope from + mode: "deduplication" (can match across tests) or "reimport" (same test only) + service: Optional service filter (for deduplication mode, not used for reimport since service is in hash) + + """ + if mode == "reimport": + # For reimport, only filter by test. Service filtering is not needed because + # service is included in hash_code calculation (HASH_CODE_FIELDS_ALWAYS = ["service"]), + # so matching by hash_code automatically ensures correct service match. 
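An editor's aside to make the comment above concrete: because the service value feeds into the hash_code, two findings that differ only in service can never share a hash, so a hash_code lookup already separates services. The toy_hash_code helper below is purely illustrative and is not DefectDojo's real compute_hash_code.

import hashlib

def toy_hash_code(title: str, service: str | None) -> str:
    # fold the service into the hash input, as HASH_CODE_FIELDS_ALWAYS does for real findings
    return hashlib.sha256(f"{title}|{service or ''}".encode()).hexdigest()

# same title, different service -> different hash, so matching by hash never crosses services
assert toy_hash_code("SQL injection in /login", "billing") != toy_hash_code("SQL injection in /login", "payments")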
+ queryset = Finding.objects.filter(test=test) + else: # deduplication mode + scope_on_engagement = test.engagement.deduplication_on_engagement + if scope_on_engagement: + scope_q = Q(test__engagement=test.engagement) + else: + # Product scope limited to current product, but exclude engagements that opted into engagement-scoped dedupe + scope_q = Q(test__engagement__product=test.engagement.product) & ( + Q(test__engagement=test.engagement) + | Q(test__engagement__deduplication_on_engagement=False) + ) + queryset = Finding.objects.filter(scope_q) return ( - Finding.objects.filter(scope_q) + queryset .select_related("test", "test__engagement", "test__test_type") .prefetch_related("endpoints") ) -def find_candidates_for_deduplication_hash(test, findings): - base_queryset = build_dedupe_scope_queryset(test) +# TODO: remove this if that doesn't affect Pro +def build_dedupe_scope_queryset(test): + """ + Legacy function name for backward compatibility. + Use build_candidate_scope_queryset() instead. + """ + return build_candidate_scope_queryset(test, mode="deduplication") + + +def find_candidates_for_deduplication_hash(test, findings, mode="deduplication", service=None): + """ + Find candidates by hash_code. Works for both deduplication and reimport. + + Args: + test: The test to scope from + findings: List of findings to find candidates for + mode: "deduplication" or "reimport" + service: Optional service filter (for deduplication mode, not used for reimport since service is in hash) + + """ + base_queryset = build_candidate_scope_queryset(test, mode=mode, service=service) hash_codes = {f.hash_code for f in findings if getattr(f, "hash_code", None) is not None} if not hash_codes: return {} - existing_qs = ( - base_queryset.filter(hash_code__in=hash_codes) - .exclude(hash_code=None) - .exclude(duplicate=True) - .order_by("id") - ) + + existing_qs = base_queryset.filter(hash_code__in=hash_codes).exclude(hash_code=None) + if mode == "deduplication": + existing_qs = existing_qs.exclude(duplicate=True) + existing_qs = existing_qs.order_by("id") + existing_by_hash = {} for ef in existing_qs: existing_by_hash.setdefault(ef.hash_code, []).append(ef) - deduplicationLogger.debug(f"Found {len(existing_by_hash)} existing findings by hash codes") + + log_msg = "for reimport" if mode == "reimport" else "" + deduplicationLogger.debug(f"Found {len(existing_by_hash)} existing findings by hash codes {log_msg}") return existing_by_hash -def find_candidates_for_deduplication_unique_id(test, findings): - base_queryset = build_dedupe_scope_queryset(test) +def find_candidates_for_deduplication_unique_id(test, findings, mode="deduplication", service=None): + """ + Find candidates by unique_id_from_tool. Works for both deduplication and reimport. 
+ + Args: + test: The test to scope from + findings: List of findings to find candidates for + mode: "deduplication" or "reimport" + service: Optional service filter (for deduplication mode, not used for reimport since service is in hash) + + """ + base_queryset = build_candidate_scope_queryset(test, mode=mode, service=service) unique_ids = {f.unique_id_from_tool for f in findings if getattr(f, "unique_id_from_tool", None) is not None} if not unique_ids: return {} - existing_qs = base_queryset.filter(unique_id_from_tool__in=unique_ids).exclude(unique_id_from_tool=None).exclude(duplicate=True).order_by("id") + + existing_qs = base_queryset.filter(unique_id_from_tool__in=unique_ids).exclude(unique_id_from_tool=None) + if mode == "deduplication": + existing_qs = existing_qs.exclude(duplicate=True) # unique_id_from_tool can only apply to the same test_type because it is parser dependent - existing_qs = existing_qs.filter(test__test_type=test.test_type) + existing_qs = existing_qs.filter(test__test_type=test.test_type).order_by("id") + existing_by_uid = {} for ef in existing_qs: existing_by_uid.setdefault(ef.unique_id_from_tool, []).append(ef) - deduplicationLogger.debug(f"Found {len(existing_by_uid)} existing findings by unique IDs") + + log_msg = "for reimport" if mode == "reimport" else "" + deduplicationLogger.debug(f"Found {len(existing_by_uid)} existing findings by unique IDs {log_msg}") return existing_by_uid -def find_candidates_for_deduplication_uid_or_hash(test, findings): - base_queryset = build_dedupe_scope_queryset(test) +def find_candidates_for_deduplication_uid_or_hash(test, findings, mode="deduplication", service=None): + """ + Find candidates by unique_id_from_tool or hash_code. Works for both deduplication and reimport. + + Args: + test: The test to scope from + findings: List of findings to find candidates for + mode: "deduplication" or "reimport" + service: Optional service filter (for deduplication mode, not used for reimport since service is in hash) + + """ + base_queryset = build_candidate_scope_queryset(test, mode=mode, service=service) hash_codes = {f.hash_code for f in findings if getattr(f, "hash_code", None) is not None} unique_ids = {f.unique_id_from_tool for f in findings if getattr(f, "unique_id_from_tool", None) is not None} if not hash_codes and not unique_ids: @@ -291,7 +354,11 @@ def find_candidates_for_deduplication_uid_or_hash(test, findings): uid_q = Q(unique_id_from_tool__isnull=False, unique_id_from_tool__in=unique_ids) & Q(test__test_type=test.test_type) cond |= uid_q - existing_qs = base_queryset.filter(cond).exclude(duplicate=True).order_by("id") + existing_qs = base_queryset.filter(cond) + if mode == "deduplication": + # reimport matching will match against duplicates, import/deduplication doesn't. 
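Editor's sketch of the behavioural difference this mode flag encodes, using a throwaway Candidate dataclass rather than the real Finding model: during import/deduplication a finding already marked as duplicate must not become a match target, while reimport may legitimately re-match it.

from dataclasses import dataclass

@dataclass
class Candidate:
    id: int
    duplicate: bool

def eligible_candidates(candidates, mode):
    if mode == "deduplication":
        # never chain a new duplicate onto an existing duplicate
        return [c for c in candidates if not c.duplicate]
    return list(candidates)  # reimport: duplicates stay eligible as matches

pool = [Candidate(1, duplicate=True), Candidate(2, duplicate=False)]
assert eligible_candidates(pool, "deduplication") == [Candidate(2, duplicate=False)]
assert len(eligible_candidates(pool, "reimport")) == 2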
+ existing_qs = existing_qs.exclude(duplicate=True) + existing_qs = existing_qs.order_by("id") existing_by_hash = {} existing_by_uid = {} @@ -300,8 +367,10 @@ def find_candidates_for_deduplication_uid_or_hash(test, findings): existing_by_hash.setdefault(ef.hash_code, []).append(ef) if ef.unique_id_from_tool is not None: existing_by_uid.setdefault(ef.unique_id_from_tool, []).append(ef) - deduplicationLogger.debug(f"Found {len(existing_by_uid)} existing findings by unique IDs") - deduplicationLogger.debug(f"Found {len(existing_by_hash)} existing findings by hash codes") + + log_msg = "for reimport" if mode == "reimport" else "" + deduplicationLogger.debug(f"Found {len(existing_by_uid)} existing findings by unique IDs {log_msg}") + deduplicationLogger.debug(f"Found {len(existing_by_hash)} existing findings by hash codes {log_msg}") return existing_by_uid, existing_by_hash @@ -328,6 +397,52 @@ def find_candidates_for_deduplication_legacy(test, findings): return by_title, by_cwe +# TODO: should we align this with deduplication? +def find_candidates_for_reimport_legacy(test, findings, service=None): + """ + Find all existing findings in the test that match any of the given findings by title and severity. + Used for batch reimport to avoid 1+N query problem. + Legacy reimport matches by title (case-insensitive), severity, and numerical_severity. + Note: This function is kept separate because legacy reimport has fundamentally different matching logic + than legacy deduplication (title+severity vs title+CWE). + Note: service parameter is kept for backward compatibility but not used since service is in hash_code. + """ + base_queryset = build_candidate_scope_queryset(test, mode="reimport", service=None) + + # Collect all unique title/severity combinations + title_severity_pairs = set() + for finding in findings: + if finding.title: + title_severity_pairs.add(( + finding.title.lower(), # Case-insensitive matching + finding.severity, + Finding.get_numerical_severity(finding.severity), + )) + + if not title_severity_pairs: + return {} + + # Build query to find all matching findings + conditions = Q() + for title_lower, severity, numerical_severity in title_severity_pairs: + conditions |= ( + Q(title__iexact=title_lower) & + Q(severity=severity) & + Q(numerical_severity=numerical_severity) + ) + + existing_qs = base_queryset.filter(conditions).order_by("id") + + # Build dictionary keyed by (title_lower, severity) for quick lookup + existing_by_key = {} + for ef in existing_qs: + key = (ef.title.lower(), ef.severity) + existing_by_key.setdefault(key, []).append(ef) + + deduplicationLogger.debug(f"Found {sum(len(v) for v in existing_by_key.values())} existing findings by legacy matching for reimport") + return existing_by_key + + def _is_candidate_older(new_finding, candidate): # Ensure the newer finding is marked as duplicate of the older finding is_older = candidate.id < new_finding.id diff --git a/dojo/importers/default_reimporter.py b/dojo/importers/default_reimporter.py index 10b3ac7148a..2e426ba2fda 100644 --- a/dojo/importers/default_reimporter.py +++ b/dojo/importers/default_reimporter.py @@ -1,5 +1,6 @@ import logging +from django.conf import settings from django.core.files.uploadedfile import TemporaryUploadedFile from django.core.serializers import serialize from django.db.models.query_utils import Q @@ -7,6 +8,12 @@ import dojo.finding.helper as finding_helper import dojo.jira_link.helper as jira_helper from dojo.decorators import we_want_async +from dojo.finding.deduplication import ( + 
find_candidates_for_deduplication_hash, + find_candidates_for_deduplication_uid_or_hash, + find_candidates_for_deduplication_unique_id, + find_candidates_for_reimport_legacy, +) from dojo.importers.base_importer import BaseImporter, Parser from dojo.importers.options import ImporterOptions from dojo.models import ( @@ -204,86 +211,141 @@ def process_findings( cleaned_findings.append(sanitized) batch_finding_ids: list[int] = [] - batch_max_size = 1000 - - for idx, unsaved_finding in enumerate(cleaned_findings): - is_final = idx == len(cleaned_findings) - 1 - # Some parsers provide "mitigated" field but do not set timezone (because they are probably not available in the report) - # Finding.mitigated is DateTimeField and it requires timezone - if unsaved_finding.mitigated and not unsaved_finding.mitigated.tzinfo: - unsaved_finding.mitigated = unsaved_finding.mitigated.replace(tzinfo=self.now.tzinfo) - # Override the test if needed - if not hasattr(unsaved_finding, "test"): - unsaved_finding.test = self.test - # Set the service supplied at import time - if self.service is not None: - unsaved_finding.service = self.service - # Clean any endpoints that are on the finding - self.endpoint_manager.clean_unsaved_endpoints(unsaved_finding.unsaved_endpoints) - # Calculate the hash code to be used to identify duplicates - unsaved_finding.hash_code = self.calculate_unsaved_finding_hash_code(unsaved_finding) - deduplicationLogger.debug(f"unsaved finding's hash_code: {unsaved_finding.hash_code}") - # Match any findings to this new one coming in - matched_findings = self.match_new_finding_to_existing_finding(unsaved_finding) - deduplicationLogger.debug(f"found {len(matched_findings)} findings matching with current new finding") - # Determine how to proceed based on whether matches were found or not - if matched_findings: - existing_finding = matched_findings[0] - finding, force_continue = self.process_matched_finding( + # Batch size for deduplication/post-processing (only new findings) + dedupe_batch_max_size = getattr(settings, "IMPORT_REIMPORT_DEDUPE_BATCH_SIZE", 1000) + # Batch size for candidate matching (all findings, before matching) + match_batch_max_size = getattr(settings, "IMPORT_REIMPORT_MATCH_BATCH_SIZE", 1000) + + # Process findings in batches to enable batch candidate fetching + # This avoids the 1+N query problem by fetching all candidates for a batch at once + for batch_start in range(0, len(cleaned_findings), match_batch_max_size): + batch_end = min(batch_start + match_batch_max_size, len(cleaned_findings)) + batch_findings = cleaned_findings[batch_start:batch_end] + is_final_batch = batch_end == len(cleaned_findings) + + logger.debug(f"Processing reimport batch {batch_start}-{batch_end} of {len(cleaned_findings)} findings") + + # Prepare findings in batch: set test, service, calculate hash codes + for unsaved_finding in batch_findings: + # Some parsers provide "mitigated" field but do not set timezone (because they are probably not available in the report) + # Finding.mitigated is DateTimeField and it requires timezone + if unsaved_finding.mitigated and not unsaved_finding.mitigated.tzinfo: + unsaved_finding.mitigated = unsaved_finding.mitigated.replace(tzinfo=self.now.tzinfo) + # Override the test if needed + if not hasattr(unsaved_finding, "test"): + unsaved_finding.test = self.test + # Set the service supplied at import time + if self.service is not None: + unsaved_finding.service = self.service + # Clean any endpoints that are on the finding + 
self.endpoint_manager.clean_unsaved_endpoints(unsaved_finding.unsaved_endpoints) + # Calculate the hash code to be used to identify duplicates + unsaved_finding.hash_code = self.calculate_unsaved_finding_hash_code(unsaved_finding) + deduplicationLogger.debug(f"unsaved finding's hash_code: {unsaved_finding.hash_code}") + + # Fetch all candidates for this batch at once (batch candidate finding) + candidates_by_hash = None + candidates_by_uid = None + candidates_by_key = None + + if self.deduplication_algorithm == "hash_code": + candidates_by_hash = find_candidates_for_deduplication_hash( + self.test, batch_findings, mode="reimport", + ) + elif self.deduplication_algorithm == "unique_id_from_tool": + candidates_by_uid = find_candidates_for_deduplication_unique_id( + self.test, batch_findings, mode="reimport", + ) + elif self.deduplication_algorithm == "unique_id_from_tool_or_hash_code": + candidates_by_uid, candidates_by_hash = find_candidates_for_deduplication_uid_or_hash( + self.test, batch_findings, mode="reimport", + ) + elif self.deduplication_algorithm == "legacy": + candidates_by_key = find_candidates_for_reimport_legacy(self.test, batch_findings) + + # Process each finding in the batch using pre-fetched candidates + for idx, unsaved_finding in enumerate(batch_findings): + is_final = is_final_batch and idx == len(batch_findings) - 1 + # Match any findings to this new one coming in using pre-fetched candidates + matched_findings = self.match_finding_for_reimport( unsaved_finding, - existing_finding, + candidates_by_hash=candidates_by_hash, + candidates_by_uid=candidates_by_uid, + candidates_by_key=candidates_by_key, ) - # Determine if we should skip the rest of the loop - if force_continue: - continue - # Update endpoints on the existing finding with those on the new finding - if finding.dynamic_finding: - logger.debug( - "Re-import found an existing dynamic finding for this new " - "finding. 
Checking the status of endpoints", - ) - self.endpoint_manager.update_endpoint_status( - existing_finding, + deduplicationLogger.debug(f"found {len(matched_findings)} findings matching with current new finding") + # Determine how to proceed based on whether matches were found or not + if matched_findings: + existing_finding = matched_findings[0] + finding, force_continue = self.process_matched_finding( unsaved_finding, - self.user, + existing_finding, ) - else: - finding = self.process_finding_that_was_not_matched(unsaved_finding) - # This condition __appears__ to always be true, but am afraid to remove it - if finding: - # Process the rest of the items on the finding - finding = self.finding_post_processing( - finding, - unsaved_finding, - ) - # all data is already saved on the finding, we only need to trigger post processing in batches - push_to_jira = self.push_to_jira and (not self.findings_groups_enabled or not self.group_by) - batch_finding_ids.append(finding.id) - - # If batch is full or we're at the end, dispatch one batched task - if len(batch_finding_ids) >= batch_max_size or is_final: - finding_ids_batch = list(batch_finding_ids) - batch_finding_ids.clear() - if we_want_async(async_user=self.user): - finding_helper.post_process_findings_batch_signature( - finding_ids_batch, - dedupe_option=True, - rules_option=True, - product_grading_option=True, - issue_updater_option=True, - push_to_jira=push_to_jira, - )() - else: - finding_helper.post_process_findings_batch( - finding_ids_batch, - dedupe_option=True, - rules_option=True, - product_grading_option=True, - issue_updater_option=True, - push_to_jira=push_to_jira, + # Determine if we should skip the rest of the loop + if force_continue: + continue + # Update endpoints on the existing finding with those on the new finding + if finding.dynamic_finding: + logger.debug( + "Re-import found an existing dynamic finding for this new " + "finding. Checking the status of endpoints", ) - - # No chord: tasks are dispatched immediately above per batch + self.endpoint_manager.update_endpoint_status( + existing_finding, + unsaved_finding, + self.user, + ) + else: + finding = self.process_finding_that_was_not_matched(unsaved_finding) + # This condition __appears__ to always be true, but am afraid to remove it + if finding: + # Process the rest of the items on the finding + finding = self.finding_post_processing( + finding, + unsaved_finding, + ) + # all data is already saved on the finding, we only need to trigger post processing in batches + push_to_jira = self.push_to_jira and (not self.findings_groups_enabled or not self.group_by) + batch_finding_ids.append(finding.id) + + # Post-processing batches (deduplication, rules, etc.) are separate from matching batches. + # These batches only contain "new" findings that were saved (not matched to existing findings). + # In reimport scenarios, typically most findings match existing ones, so only a small fraction + # of findings in each matching batch become new findings that need deduplication. + # + # We accumulate finding IDs across matching batches rather than dispatching at the end of each + # matching batch. This ensures deduplication batches stay close to the intended batch size + # (e.g., 1000 findings) for optimal bulk operation efficiency, even when only ~10% of findings + # in matching batches are new. If we dispatched at the end of each matching batch, we would + # end up with many small deduplication batches (e.g., ~100 findings each), reducing efficiency. 
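To make the two-level batching described above concrete, here is an editor's standalone sketch of the accumulation pattern (process_in_two_batch_levels and dispatch are illustrative names, not the importer's real API): candidate fetching happens once per matching batch, while post-processing IDs keep accumulating across matching batches and are only flushed once the dedupe batch size is reached or the input is exhausted.

def process_in_two_batch_levels(findings, match_batch_size=1000, dedupe_batch_size=1000, dispatch=print):
    pending_ids = []  # accumulates across matching batches
    for start in range(0, len(findings), match_batch_size):
        batch = findings[start:start + match_batch_size]
        is_final = start + match_batch_size >= len(findings)
        # one candidate query per matching batch instead of one per finding (omitted here)
        pending_ids.extend(f["id"] for f in batch if f["is_new"])
        if len(pending_ids) >= dedupe_batch_size or is_final:
            dispatch(list(pending_ids))  # post-processing task receives a reasonably full batch
            pending_ids.clear()

# 2500 findings of which ~10% are new: one dispatch of 250 ids instead of three tiny batches
process_in_two_batch_levels(
    [{"id": i, "is_new": i % 10 == 0} for i in range(2500)],
    dispatch=lambda ids: print(len(ids)),
)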
+ # + # The two batch types serve different purposes: + # - Matching batches: optimize candidate fetching (solve 1+N query problem) + # - Deduplication batches: optimize bulk operations (larger batches = fewer queries) + # They don't need to be aligned since they optimize different operations. + if len(batch_finding_ids) >= dedupe_batch_max_size or is_final: + finding_ids_batch = list(batch_finding_ids) + batch_finding_ids.clear() + if we_want_async(async_user=self.user): + finding_helper.post_process_findings_batch_signature( + finding_ids_batch, + dedupe_option=True, + rules_option=True, + product_grading_option=True, + issue_updater_option=True, + push_to_jira=push_to_jira, + )() + else: + finding_helper.post_process_findings_batch( + finding_ids_batch, + dedupe_option=True, + rules_option=True, + product_grading_option=True, + issue_updater_option=True, + push_to_jira=push_to_jira, + ) + + # No chord: tasks are dispatched immediately above per batch self.to_mitigate = (set(self.original_items) - set(self.reactivated_items) - set(self.unchanged_items)) # due to #3958 we can have duplicates inside the same report @@ -418,6 +480,81 @@ def match_new_finding_to_existing_finding( logger.error(f'Internal error: unexpected deduplication_algorithm: "{self.deduplication_algorithm}"') return None + def match_finding_for_reimport( + self, + unsaved_finding: Finding, + candidates_by_hash: dict | None = None, + candidates_by_uid: dict | None = None, + candidates_by_key: dict | None = None, + ) -> list[Finding]: + """ + Matches a single new finding to existing findings using pre-fetched candidate dictionaries. + This avoids individual database queries by using batch-fetched candidates. + + Args: + unsaved_finding: The finding to match + candidates_by_hash: Dictionary mapping hash_code to list of findings (for hash_code algorithm) + candidates_by_uid: Dictionary mapping unique_id_from_tool to list of findings (for unique_id algorithms) + candidates_by_key: Dictionary mapping (title_lower, severity) to list of findings (for legacy algorithm) + + Returns: + List of matching findings, ordered by id + + """ + deduplicationLogger.debug("matching finding for reimport using algorithm: %s", self.deduplication_algorithm) + + if self.deduplication_algorithm == "hash_code": + if candidates_by_hash is None: + # Fallback to individual query if candidates not provided + return self.match_new_finding_to_existing_finding(unsaved_finding) + if unsaved_finding.hash_code is None: + return [] + matches = candidates_by_hash.get(unsaved_finding.hash_code, []) + return sorted(matches, key=lambda f: f.id) + + if self.deduplication_algorithm == "unique_id_from_tool": + if candidates_by_uid is None: + # Fallback to individual query if candidates not provided + return self.match_new_finding_to_existing_finding(unsaved_finding) + if unsaved_finding.unique_id_from_tool is None: + return [] + matches = candidates_by_uid.get(unsaved_finding.unique_id_from_tool, []) + return sorted(matches, key=lambda f: f.id) + + if self.deduplication_algorithm == "unique_id_from_tool_or_hash_code": + if candidates_by_hash is None or candidates_by_uid is None: + # Fallback to individual query if candidates not provided + return self.match_new_finding_to_existing_finding(unsaved_finding) + + # Collect matches from both hash_code and unique_id_from_tool + matches_by_id = {} + + if unsaved_finding.hash_code is not None: + hash_matches = candidates_by_hash.get(unsaved_finding.hash_code, []) + for match in hash_matches: + matches_by_id[match.id] = match 
+ + if unsaved_finding.unique_id_from_tool is not None: + uid_matches = candidates_by_uid.get(unsaved_finding.unique_id_from_tool, []) + for match in uid_matches: + matches_by_id[match.id] = match + + matches = list(matches_by_id.values()) + return sorted(matches, key=lambda f: f.id) + + if self.deduplication_algorithm == "legacy": + if candidates_by_key is None: + # Fallback to individual query if candidates not provided + return self.match_new_finding_to_existing_finding(unsaved_finding) + if not unsaved_finding.title: + return [] + key = (unsaved_finding.title.lower(), unsaved_finding.severity) + matches = candidates_by_key.get(key, []) + return sorted(matches, key=lambda f: f.id) + + logger.error(f'Internal error: unexpected deduplication_algorithm: "{self.deduplication_algorithm}"') + return [] + def process_matched_finding( self, unsaved_finding: Finding, diff --git a/dojo/management/commands/list_top_tests.py b/dojo/management/commands/list_top_tests.py new file mode 100644 index 00000000000..839a1dc03fd --- /dev/null +++ b/dojo/management/commands/list_top_tests.py @@ -0,0 +1,111 @@ +from django.core.management.base import BaseCommand +from django.db.models import Count, Q + +from dojo.models import Test + + +class Command(BaseCommand): + help = "List the top 25 tests with the most findings" + + def add_arguments(self, parser): + parser.add_argument( + "--limit", + type=int, + default=25, + help="Number of tests to display (default: 25)", + ) + + def handle(self, *args, **options): + limit = options["limit"] + + # Annotate tests with finding counts + tests = ( + Test.objects.annotate( + total_findings=Count("finding", distinct=True), + active_findings=Count("finding", filter=Q(finding__active=True), distinct=True), + duplicate_findings=Count("finding", filter=Q(finding__duplicate=True), distinct=True), + ) + .filter(total_findings__gt=0) + .select_related("engagement", "engagement__product", "test_type") + .order_by("-total_findings")[:limit] + ) + + if not tests: + self.stdout.write(self.style.WARNING("No tests with findings found.")) + return + + # Calculate column widths + max_product_len = max( + (len(str(test.engagement.product.name)) for test in tests), + default=20, + ) + max_engagement_len = max( + (len(str(test.engagement.name)) for test in tests), + default=20, + ) + max_test_len = max( + (len(str(test.title or test.id)) for test in tests), + default=20, + ) + max_test_type_len = max( + (len(str(test.test_type.name)) for test in tests), + default=20, + ) + max_dedup_algo_len = max( + (len(str(test.deduplication_algorithm)) for test in tests), + default=20, + ) + + # Ensure minimum widths for readability + max_product_len = max(max_product_len, 20) + max_engagement_len = max(max_engagement_len, 20) + max_test_len = max(max_test_len, 20) + max_test_type_len = max(max_test_type_len, 20) + max_dedup_algo_len = max(max_dedup_algo_len, 20) + + # Header + header = ( + f"{'Product':<{max_product_len}} | " + f"{'Engagement':<{max_engagement_len}} | " + f"{'Test':<{max_test_len}} | " + f"{'Test Type':<{max_test_type_len}} | " + f"{'Dedup Algorithm':<{max_dedup_algo_len}} | " + f"{'Total':>8} | " + f"{'Active':>8} | " + f"{'Duplicate':>10}" + ) + separator = "-" * len(header) + + self.stdout.write(self.style.SUCCESS(header)) + self.stdout.write(separator) + + # Data rows + for test in tests: + product_name = str(test.engagement.product.name) + engagement_name = str(test.engagement.name) + test_name = str(test.title or f"Test #{test.id}") + test_type_name = str(test.test_type.name) + 
dedup_algo = str(test.deduplication_algorithm) + total = test.total_findings + active = test.active_findings + duplicate = test.duplicate_findings + + row = ( + f"{product_name:<{max_product_len}} | " + f"{engagement_name:<{max_engagement_len}} | " + f"{test_name:<{max_test_len}} | " + f"{test_type_name:<{max_test_type_len}} | " + f"{dedup_algo:<{max_dedup_algo_len}} | " + f"{total:>8} | " + f"{active:>8} | " + f"{duplicate:>10}" + ) + self.stdout.write(row) + + # Summary + self.stdout.write(separator) + self.stdout.write( + self.style.SUCCESS( + f"\nDisplayed top {len(tests)} tests with findings.", + ), + ) diff --git a/dojo/settings/settings.dist.py b/dojo/settings/settings.dist.py index 2ef1835b08e..ae96077f07a 100644 --- a/dojo/settings/settings.dist.py +++ b/dojo/settings/settings.dist.py @@ -283,6 +283,8 @@ DD_EDITABLE_MITIGATED_DATA=(bool, False), # new feature that tracks history across multiple reimports for the same test DD_TRACK_IMPORT_HISTORY=(bool, True), + # Batch size for reimport candidate matching (finding existing findings) + DD_IMPORT_REIMPORT_MATCH_BATCH_SIZE=(int, 1000), # Batch size for import/reimport deduplication processing DD_IMPORT_REIMPORT_DEDUPE_BATCH_SIZE=(int, 1000), # Delete Auditlogs older than x month; -1 to keep all logs @@ -1710,6 +1712,7 @@ def saml2_attrib_map_format(din): DISABLE_FINDING_MERGE = env("DD_DISABLE_FINDING_MERGE") TRACK_IMPORT_HISTORY = env("DD_TRACK_IMPORT_HISTORY") +IMPORT_REIMPORT_MATCH_BATCH_SIZE = env("DD_IMPORT_REIMPORT_MATCH_BATCH_SIZE") IMPORT_REIMPORT_DEDUPE_BATCH_SIZE = env("DD_IMPORT_REIMPORT_DEDUPE_BATCH_SIZE") # ------------------------------------------------------------------------------ diff --git a/unittests/test_import_reimport.py b/unittests/test_import_reimport.py index 2f71c720e02..dbf206b808d 100644 --- a/unittests/test_import_reimport.py +++ b/unittests/test_import_reimport.py @@ -11,7 +11,7 @@ from django.urls import reverse from django.utils import timezone -from dojo.models import Finding, Test, Test_Type, User +from dojo.models import Engagement, Finding, Product, Product_Type, Test, Test_Type, User from .dojo_test_case import DojoAPITestCase, get_unit_tests_scans_path from .test_utils import assertTestImportModelsCreated @@ -108,6 +108,9 @@ def __init__(self, *args, **kwargs): self.checkmarx_one_two_false_positive = get_unit_tests_scans_path("checkmarx_one") / "two-false-positive.json" self.scan_type_checkmarx_one = "Checkmarx One Scan" + self.bandit_large_file = get_unit_tests_scans_path("bandit") / "many_vulns.json" + self.scan_type_bandit = "Bandit Scan" + # import zap scan, testing: # - import # - active/verifed = True @@ -2033,6 +2036,318 @@ def test_reimport_set_scan_date_parser_sets_date(self): date = findings["results"][0]["date"] self.assertEqual(date, "2006-12-26") + @override_settings( + IMPORT_REIMPORT_DEDUPE_BATCH_SIZE=200, + IMPORT_REIMPORT_MATCH_BATCH_SIZE=200, + ) + def test_batch_reimport_large_bandit_file(self): + """ + Test that batch reimport produces identical results to non-batch mode (which we simulate via large max batch size setting). 
+ + Step 1: Import scan (baseline), assess active count, duplicate count + Step 2: Reimport scan (same test), assess active count, duplicate count + Step 3: Import scan in NEW engagement with batch_size=50, assess active and duplicate equal to step 1 + Step 4: Reimport scan in same new engagement (batch_size=50), assess active and duplicate equal to step 2 + """ + # Step 1: Baseline import (default batch size) + # Create engagement first and set deduplication_on_engagement before import + product_type1, _ = Product_Type.objects.get_or_create(name="PT Bandit Baseline") + product1, _ = Product.objects.get_or_create(name="P Bandit Baseline", prod_type=product_type1) + engagement1 = Engagement.objects.create( + name="E Bandit Baseline", + product=product1, + target_start=timezone.now(), + target_end=timezone.now(), + ) + engagement1.deduplication_on_engagement = True + engagement1.save() + engagement1_id = engagement1.id + + import1 = self.import_scan_with_params( + self.bandit_large_file, + scan_type=self.scan_type_bandit, + engagement=engagement1_id, + product_type_name=None, + product_name=None, + engagement_name=None, + auto_create_context=False, + ) + test1_id = import1["test"] + + # Assess step 1: active count, duplicate count + step1_active = Finding.objects.filter( + test__engagement_id=engagement1_id, + active=True, + duplicate=False, + ).count() + step1_duplicate = Finding.objects.filter( + test__engagement_id=engagement1_id, + duplicate=True, + ).count() + + # Assert step 1 specific counts + self.assertEqual(step1_active, 213, "Step 1 active count") + self.assertEqual(step1_duplicate, 1, "Step 1 duplicate count") + + # Step 2: Reimport scan (same test) + self.reimport_scan_with_params( + test1_id, + self.bandit_large_file, + scan_type=self.scan_type_bandit, + ) + + # Assess step 2: active count, duplicate count + step2_active = Finding.objects.filter( + test__engagement_id=engagement1_id, + active=True, + duplicate=False, + ).count() + step2_duplicate = Finding.objects.filter( + test__engagement_id=engagement1_id, + duplicate=True, + ).count() + + # Assert step 2 specific counts + self.assertEqual(step2_active, 213, "Step 2 active count") + self.assertEqual(step2_duplicate, 1, "Step 2 duplicate count") + + # Step 3: Import scan in NEW engagement with batch_size=50 + with override_settings( + IMPORT_REIMPORT_DEDUPE_BATCH_SIZE=50, + IMPORT_REIMPORT_MATCH_BATCH_SIZE=50, + ): + # Create engagement first and set deduplication_on_engagement before import + product_type2, _ = Product_Type.objects.get_or_create(name="PT Bandit Batch") + product2, _ = Product.objects.get_or_create(name="P Bandit Batch", prod_type=product_type2) + engagement2 = Engagement.objects.create( + name="E Bandit Batch", + product=product2, + target_start=timezone.now(), + target_end=timezone.now(), + ) + engagement2.deduplication_on_engagement = True + engagement2.save() + engagement2_id = engagement2.id + + import2 = self.import_scan_with_params( + self.bandit_large_file, + scan_type=self.scan_type_bandit, + engagement=engagement2_id, + product_type_name=None, + product_name=None, + engagement_name=None, + auto_create_context=False, + ) + test2_id = import2["test"] + + # Assess step 3: active and duplicate should equal step 1 + step3_active = Finding.objects.filter( + test__engagement_id=engagement2_id, + active=True, + duplicate=False, + ).count() + step3_duplicate = Finding.objects.filter( + test__engagement_id=engagement2_id, + duplicate=True, + ).count() + + self.assertEqual( + step3_active, + step1_active, + 
"Step 3 active count should equal step 1 (baseline import)", + ) + self.assertEqual( + step3_duplicate, + step1_duplicate, + "Step 3 duplicate count should equal step 1 (baseline import)", + ) + + # Step 4: Reimport scan in same new engagement (batch_size=50) + self.reimport_scan_with_params( + test2_id, + self.bandit_large_file, + scan_type=self.scan_type_bandit, + ) + + # Assess step 4: active and duplicate should equal step 2 + step4_active = Finding.objects.filter( + test__engagement_id=engagement2_id, + active=True, + duplicate=False, + ).count() + step4_duplicate = Finding.objects.filter( + test__engagement_id=engagement2_id, + duplicate=True, + ).count() + + self.assertEqual( + step4_active, + step2_active, + "Step 4 active count should equal step 2 (baseline reimport)", + ) + self.assertEqual( + step4_duplicate, + step2_duplicate, + "Step 4 duplicate count should equal step 2 (baseline reimport)", + ) + + def test_batch_deduplication_large_bandit_file(self): + """ + Test that batch deduplication produces identical results to non-batch mode (which we simulate via large max batch size setting). + + Step 1: Import scan (baseline), assess active count, duplicate count + Step 2: Import scan again (same engagement), assess active count, duplicate count + Step 3: Import scan in NEW engagement with batch_size=50, assess active and duplicate equal to step 1 + Step 4: Import scan again in same new engagement (batch_size=50), assess active and duplicate equal to step 2 + """ + # Step 1: Baseline import (default batch size) + # Create engagement first and set deduplication_on_engagement before import + product_type1, _ = Product_Type.objects.get_or_create(name="PT Bandit Baseline Dedupe") + product1, _ = Product.objects.get_or_create(name="P Bandit Baseline Dedupe", prod_type=product_type1) + engagement1 = Engagement.objects.create( + name="E Bandit Baseline Dedupe", + product=product1, + target_start=timezone.now(), + target_end=timezone.now(), + ) + engagement1.deduplication_on_engagement = True + engagement1.save() + engagement1_id = engagement1.id + + self.import_scan_with_params( + self.bandit_large_file, + scan_type=self.scan_type_bandit, + engagement=engagement1_id, + product_type_name=None, + product_name=None, + engagement_name=None, + auto_create_context=False, + ) + + # Assess step 1: active count, duplicate count + step1_active = Finding.objects.filter( + test__engagement_id=engagement1_id, + active=True, + duplicate=False, + ).count() + step1_duplicate = Finding.objects.filter( + test__engagement_id=engagement1_id, + duplicate=True, + ).count() + + # Assert step 1 specific counts + self.assertEqual(step1_active, 213, "Step 1 active count") + self.assertEqual(step1_duplicate, 1, "Step 1 duplicate count") + + # Step 2: Import scan again (same engagement) + self.import_scan_with_params( + self.bandit_large_file, + scan_type=self.scan_type_bandit, + engagement=engagement1_id, + product_type_name=None, + product_name=None, + engagement_name=None, + auto_create_context=False, + ) + + # Assess step 2: active count, duplicate count + step2_active = Finding.objects.filter( + test__engagement_id=engagement1_id, + active=True, + duplicate=False, + ).count() + step2_duplicate = Finding.objects.filter( + test__engagement_id=engagement1_id, + duplicate=True, + ).count() + + # Assert step 2 specific counts + self.assertEqual(step2_active, 213, "Step 2 active count") + self.assertEqual(step2_duplicate, 215, "Step 2 duplicate count") + + # Step 3: Import scan in NEW engagement with batch_size=50 + 
with override_settings( + IMPORT_REIMPORT_DEDUPE_BATCH_SIZE=50, + IMPORT_REIMPORT_MATCH_BATCH_SIZE=50, + ): + # Create engagement first and set deduplication_on_engagement before import + product_type2, _ = Product_Type.objects.get_or_create(name="PT Bandit Batch Dedupe") + product2, _ = Product.objects.get_or_create(name="P Bandit Batch Dedupe", prod_type=product_type2) + engagement2 = Engagement.objects.create( + name="E Bandit Batch Dedupe", + product=product2, + target_start=timezone.now(), + target_end=timezone.now(), + ) + engagement2.deduplication_on_engagement = True + engagement2.save() + engagement2_id = engagement2.id + + self.import_scan_with_params( + self.bandit_large_file, + scan_type=self.scan_type_bandit, + engagement=engagement2_id, + product_type_name=None, + product_name=None, + engagement_name=None, + auto_create_context=False, + ) + + # Assess step 3: active and duplicate should equal step 1 + step3_active = Finding.objects.filter( + test__engagement_id=engagement2_id, + active=True, + duplicate=False, + ).count() + step3_duplicate = Finding.objects.filter( + test__engagement_id=engagement2_id, + duplicate=True, + ).count() + + self.assertEqual( + step3_active, + step1_active, + "Step 3 active count should equal step 1 (baseline import)", + ) + self.assertEqual( + step3_duplicate, + step1_duplicate, + "Step 3 duplicate count should equal step 1 (baseline import)", + ) + + # Step 4: Import scan again in same new engagement (batch_size=50) + self.import_scan_with_params( + self.bandit_large_file, + scan_type=self.scan_type_bandit, + engagement=engagement2_id, + product_type_name=None, + product_name=None, + engagement_name=None, + auto_create_context=False, + ) + + # Assess step 4: active and duplicate should equal step 2 + step4_active = Finding.objects.filter( + test__engagement_id=engagement2_id, + active=True, + duplicate=False, + ).count() + step4_duplicate = Finding.objects.filter( + test__engagement_id=engagement2_id, + duplicate=True, + ).count() + + self.assertEqual( + step4_active, + step2_active, + "Step 4 active count should equal step 2 (baseline second import)", + ) + self.assertEqual( + step4_duplicate, + step2_duplicate, + "Step 4 duplicate count should equal step 2 (baseline second import)", + ) + class ImportReimportTestUI(DojoAPITestCase, ImportReimportMixin): fixtures = ["dojo_testdata.json"] diff --git a/unittests/test_importers_performance.py b/unittests/test_importers_performance.py index 9da777ccecc..1bf1ab87795 100644 --- a/unittests/test_importers_performance.py +++ b/unittests/test_importers_performance.py @@ -33,7 +33,9 @@ STACK_HAWK_SCAN_TYPE = "StackHawk HawkScan" -class TestDojoImporterPerformance(DojoTestCase): +class TestDojoImporterPerformanceBase(DojoTestCase): + + """Base class for performance tests with shared setup and helper methods.""" def setUp(self): super().setUp() @@ -77,35 +79,53 @@ def _assertNumAsyncTask(self, num): ) logger.debug(msg) - def _import_reimport_performance(self, expected_num_queries1, expected_num_async_tasks1, expected_num_queries2, expected_num_async_tasks2, expected_num_queries3, expected_num_async_tasks3): - """ - Log output can be quite large as when the assertNumQueries fails, all queries are printed. - It could be usefule to capture the output in `less`: - ./run-unittest.sh --test-case unittests.test_importers_performance.TestDojoImporterPerformance 2>&1 | less - Then search for `expected` to find the lines where the expected number of queries is printed. 
- Or you can use `grep` to filter the output: - ./run-unittest.sh --test-case unittests.test_importers_performance.TestDojoImporterPerformance 2>&1 | grep expected -B 10 - """ + def _create_test_objects(self, product_name, engagement_name): + """Helper method to create test product, engagement, lead user, and environment.""" product_type, _created = Product_Type.objects.get_or_create(name="test") product, _created = Product.objects.get_or_create( - name="TestDojoDefaultImporter", + name=product_name, prod_type=product_type, ) engagement, _created = Engagement.objects.get_or_create( - name="Test Create Engagement", + name=engagement_name, product=product, target_start=timezone.now(), target_end=timezone.now(), ) lead, _ = User.objects.get_or_create(username="admin") environment, _ = Development_Environment.objects.get_or_create(name="Development") + return product, engagement, lead, environment + + def _import_reimport_performance( + self, + expected_num_queries1, + expected_num_async_tasks1, + expected_num_queries2, + expected_num_async_tasks2, + expected_num_queries3, + expected_num_async_tasks3, + scan_file1, + scan_file2, + scan_file3, + scan_type, + product_name, + engagement_name, + ): + """ + Test import/reimport/reimport performance with specified scan files and scan type. + Log output can be quite large as when the assertNumQueries fails, all queries are printed. + """ + _, engagement, lead, environment = self._create_test_objects( + product_name, + engagement_name, + ) - # first import the subset which missed one finding and a couple of endpoints on some of the findings + # First import with ( self.subTest("import1"), impersonate(Dojo_User.objects.get(username="admin")), self.assertNumQueries(expected_num_queries1), self._assertNumAsyncTask(expected_num_async_tasks1), - STACK_HAWK_SUBSET_FILENAME.open(encoding="utf-8") as scan, + scan_file1.open(encoding="utf-8") as scan, ): import_options = { "user": lead, @@ -116,7 +136,7 @@ def _import_reimport_performance(self, expected_num_queries1, expected_num_async "active": True, "verified": True, "sync": True, - "scan_type": STACK_HAWK_SCAN_TYPE, + "scan_type": scan_type, "engagement": engagement, "tags": ["performance-test", "tag-in-param", "go-faster"], "apply_tags_to_findings": True, @@ -124,12 +144,12 @@ def _import_reimport_performance(self, expected_num_queries1, expected_num_async importer = DefaultImporter(**import_options) test, _, _len_new_findings, _len_closed_findings, _, _, _ = importer.process_scan(scan) - # use reimport with the full report so it add a finding and some endpoints + # Second import (reimport) with ( self.subTest("reimport1"), impersonate(Dojo_User.objects.get(username="admin")), self.assertNumQueries(expected_num_queries2), self._assertNumAsyncTask(expected_num_async_tasks2), - STACK_HAWK_FILENAME.open(encoding="utf-8") as scan, + scan_file2.open(encoding="utf-8") as scan, ): reimport_options = { "test": test, @@ -140,19 +160,19 @@ def _import_reimport_performance(self, expected_num_queries1, expected_num_async "active": True, "verified": True, "sync": True, - "scan_type": STACK_HAWK_SCAN_TYPE, + "scan_type": scan_type, "tags": ["performance-test-reimport", "reimport-tag-in-param", "reimport-go-faster"], "apply_tags_to_findings": True, } reimporter = DefaultReImporter(**reimport_options) test, _, _len_new_findings, _len_closed_findings, _, _, _ = reimporter.process_scan(scan) - # use reimport with the subset again to close a finding and mitigate some endpoints + # Third import (reimport again) with ( 
self.subTest("reimport2"), impersonate(Dojo_User.objects.get(username="admin")), self.assertNumQueries(expected_num_queries3), self._assertNumAsyncTask(expected_num_async_tasks3), - STACK_HAWK_SUBSET_FILENAME.open(encoding="utf-8") as scan, + scan_file3.open(encoding="utf-8") as scan, ): reimport_options = { "test": test, @@ -163,11 +183,40 @@ def _import_reimport_performance(self, expected_num_queries1, expected_num_async "active": True, "verified": True, "sync": True, - "scan_type": STACK_HAWK_SCAN_TYPE, + "scan_type": scan_type, } reimporter = DefaultReImporter(**reimport_options) test, _, _len_new_findings, _len_closed_findings, _, _, _ = reimporter.process_scan(scan) + +class TestDojoImporterPerformanceSmall(TestDojoImporterPerformanceBase): + + """Performance tests using small sample files (StackHawk, ~6 findings).""" + + def _import_reimport_performance(self, expected_num_queries1, expected_num_async_tasks1, expected_num_queries2, expected_num_async_tasks2, expected_num_queries3, expected_num_async_tasks3): + """ + Log output can be quite large as when the assertNumQueries fails, all queries are printed. + It could be usefule to capture the output in `less`: + ./run-unittest.sh --test-case unittests.test_importers_performance.TestDojoImporterPerformanceSmall 2>&1 | less + Then search for `expected` to find the lines where the expected number of queries is printed. + Or you can use `grep` to filter the output: + ./run-unittest.sh --test-case unittests.test_importers_performance.TestDojoImporterPerformanceSmall 2>&1 | grep expected -B 10 + """ + return super()._import_reimport_performance( + expected_num_queries1, + expected_num_async_tasks1, + expected_num_queries2, + expected_num_async_tasks2, + expected_num_queries3, + expected_num_async_tasks3, + scan_file1=STACK_HAWK_SUBSET_FILENAME, + scan_file2=STACK_HAWK_FILENAME, + scan_file3=STACK_HAWK_SUBSET_FILENAME, + scan_type=STACK_HAWK_SCAN_TYPE, + product_name="TestDojoDefaultImporter", + engagement_name="Test Create Engagement", + ) + @override_settings(ENABLE_AUDITLOG=True, AUDITLOG_TYPE="django-auditlog") def test_import_reimport_reimport_performance_async(self): # Ensure django-auditlog is properly configured for this test @@ -177,9 +226,9 @@ def test_import_reimport_reimport_performance_async(self): self._import_reimport_performance( expected_num_queries1=340, expected_num_async_tasks1=7, - expected_num_queries2=288, + expected_num_queries2=274, expected_num_async_tasks2=18, - expected_num_queries3=175, + expected_num_queries3=162, expected_num_async_tasks3=17, ) @@ -195,9 +244,9 @@ def test_import_reimport_reimport_performance_pghistory_async(self): self._import_reimport_performance( expected_num_queries1=306, expected_num_async_tasks1=7, - expected_num_queries2=281, + expected_num_queries2=267, expected_num_async_tasks2=18, - expected_num_queries3=170, + expected_num_queries3=157, expected_num_async_tasks3=17, ) @@ -219,9 +268,9 @@ def test_import_reimport_reimport_performance_no_async(self): self._import_reimport_performance( expected_num_queries1=345, expected_num_async_tasks1=6, - expected_num_queries2=293, + expected_num_queries2=279, expected_num_async_tasks2=17, - expected_num_queries3=180, + expected_num_queries3=167, expected_num_async_tasks3=16, ) @@ -241,9 +290,9 @@ def test_import_reimport_reimport_performance_pghistory_no_async(self): self._import_reimport_performance( expected_num_queries1=311, expected_num_async_tasks1=6, - expected_num_queries2=286, + expected_num_queries2=272, expected_num_async_tasks2=17, - 
expected_num_queries3=175, + expected_num_queries3=162, expected_num_async_tasks3=16, ) @@ -267,9 +316,9 @@ def test_import_reimport_reimport_performance_no_async_with_product_grading(self self._import_reimport_performance( expected_num_queries1=347, expected_num_async_tasks1=8, - expected_num_queries2=295, + expected_num_queries2=281, expected_num_async_tasks2=19, - expected_num_queries3=182, + expected_num_queries3=169, expected_num_async_tasks3=18, ) @@ -290,9 +339,9 @@ def test_import_reimport_reimport_performance_pghistory_no_async_with_product_gr self._import_reimport_performance( expected_num_queries1=313, expected_num_async_tasks1=8, - expected_num_queries2=288, + expected_num_queries2=274, expected_num_async_tasks2=19, - expected_num_queries3=177, + expected_num_queries3=164, expected_num_async_tasks3=18, ) @@ -303,19 +352,10 @@ def _deduplication_performance(self, expected_num_queries1, expected_num_async_t The second import should result in all findings being marked as duplicates. This is different from reimport as we create a new test each time. """ - product_type, _created = Product_Type.objects.get_or_create(name="test") - product, _created = Product.objects.get_or_create( - name="TestDojoDeduplicationPerformance", - prod_type=product_type, - ) - engagement, _created = Engagement.objects.get_or_create( - name="Test Deduplication Performance Engagement", - product=product, - target_start=timezone.now(), - target_end=timezone.now(), + _, engagement, lead, environment = self._create_test_objects( + "TestDojoDeduplicationPerformance", + "Test Deduplication Performance Engagement", ) - lead, _ = User.objects.get_or_create(username="admin") - environment, _ = Development_Environment.objects.get_or_create(name="Development") # First import - all findings should be new with ( From c1f800470b8e2199a7ab78a4de57af45d64d33f2 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sat, 13 Dec 2025 12:38:26 +0100 Subject: [PATCH 03/26] tests: always show all mismatched counts --- unittests/test_import_reimport.py | 118 ++++++++------ unittests/test_importers_performance.py | 200 ++++++++++++++---------- 2 files changed, 189 insertions(+), 129 deletions(-) diff --git a/unittests/test_import_reimport.py b/unittests/test_import_reimport.py index dbf206b808d..2ac71c155bb 100644 --- a/unittests/test_import_reimport.py +++ b/unittests/test_import_reimport.py @@ -2086,8 +2086,13 @@ def test_batch_reimport_large_bandit_file(self): ).count() # Assert step 1 specific counts - self.assertEqual(step1_active, 213, "Step 1 active count") - self.assertEqual(step1_duplicate, 1, "Step 1 duplicate count") + # Each assertion is wrapped in its own subTest so that if one fails, the others still run. + # This allows us to see all count mismatches in a single test run, making it easier to fix + # all incorrect expected values at once rather than fixing them one at a time. + with self.subTest(step=1, metric="active"): + self.assertEqual(step1_active, 213, "Step 1 active count") + with self.subTest(step=1, metric="duplicate"): + self.assertEqual(step1_duplicate, 1, "Step 1 duplicate count") # Step 2: Reimport scan (same test) self.reimport_scan_with_params( @@ -2108,8 +2113,13 @@ def test_batch_reimport_large_bandit_file(self): ).count() # Assert step 2 specific counts - self.assertEqual(step2_active, 213, "Step 2 active count") - self.assertEqual(step2_duplicate, 1, "Step 2 duplicate count") + # Each assertion is wrapped in its own subTest so that if one fails, the others still run. 
+ # This allows us to see all count mismatches in a single test run, making it easier to fix + # all incorrect expected values at once rather than fixing them one at a time. + with self.subTest(step=2, metric="active"): + self.assertEqual(step2_active, 213, "Step 2 active count") + with self.subTest(step=2, metric="duplicate"): + self.assertEqual(step2_duplicate, 1, "Step 2 duplicate count") # Step 3: Import scan in NEW engagement with batch_size=50 with override_settings( @@ -2151,16 +2161,21 @@ def test_batch_reimport_large_bandit_file(self): duplicate=True, ).count() - self.assertEqual( - step3_active, - step1_active, - "Step 3 active count should equal step 1 (baseline import)", - ) - self.assertEqual( - step3_duplicate, - step1_duplicate, - "Step 3 duplicate count should equal step 1 (baseline import)", - ) + # Each assertion is wrapped in its own subTest so that if one fails, the others still run. + # This allows us to see all count mismatches in a single test run, making it easier to fix + # all incorrect expected values at once rather than fixing them one at a time. + with self.subTest(step=3, metric="active"): + self.assertEqual( + step3_active, + step1_active, + "Step 3 active count should equal step 1 (baseline import)", + ) + with self.subTest(step=3, metric="duplicate"): + self.assertEqual( + step3_duplicate, + step1_duplicate, + "Step 3 duplicate count should equal step 1 (baseline import)", + ) # Step 4: Reimport scan in same new engagement (batch_size=50) self.reimport_scan_with_params( @@ -2180,16 +2195,21 @@ def test_batch_reimport_large_bandit_file(self): duplicate=True, ).count() - self.assertEqual( - step4_active, - step2_active, - "Step 4 active count should equal step 2 (baseline reimport)", - ) - self.assertEqual( - step4_duplicate, - step2_duplicate, - "Step 4 duplicate count should equal step 2 (baseline reimport)", - ) + # Each assertion is wrapped in its own subTest so that if one fails, the others still run. + # This allows us to see all count mismatches in a single test run, making it easier to fix + # all incorrect expected values at once rather than fixing them one at a time. + with self.subTest(step=4, metric="active"): + self.assertEqual( + step4_active, + step2_active, + "Step 4 active count should equal step 2 (baseline reimport)", + ) + with self.subTest(step=4, metric="duplicate"): + self.assertEqual( + step4_duplicate, + step2_duplicate, + "Step 4 duplicate count should equal step 2 (baseline reimport)", + ) def test_batch_deduplication_large_bandit_file(self): """ @@ -2304,16 +2324,21 @@ def test_batch_deduplication_large_bandit_file(self): duplicate=True, ).count() - self.assertEqual( - step3_active, - step1_active, - "Step 3 active count should equal step 1 (baseline import)", - ) - self.assertEqual( - step3_duplicate, - step1_duplicate, - "Step 3 duplicate count should equal step 1 (baseline import)", - ) + # Each assertion is wrapped in its own subTest so that if one fails, the others still run. + # This allows us to see all count mismatches in a single test run, making it easier to fix + # all incorrect expected values at once rather than fixing them one at a time. 
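For readers less familiar with the pattern used below: unittest's subTest context turns each failed assertion into its own reported failure instead of aborting the whole test method at the first mismatch. A minimal, self-contained example (CountsExample and its numbers are hypothetical):

import unittest

class CountsExample(unittest.TestCase):
    def test_counts(self):
        observed = {"active": 210, "duplicate": 3}
        expected = {"active": 213, "duplicate": 1}
        for metric, value in expected.items():
            with self.subTest(metric=metric):
                # both mismatches show up in a single run, not just the first one
                self.assertEqual(observed[metric], value)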
+ with self.subTest(step=3, metric="active"): + self.assertEqual( + step3_active, + step1_active, + "Step 3 active count should equal step 1 (baseline import)", + ) + with self.subTest(step=3, metric="duplicate"): + self.assertEqual( + step3_duplicate, + step1_duplicate, + "Step 3 duplicate count should equal step 1 (baseline import)", + ) # Step 4: Import scan again in same new engagement (batch_size=50) self.import_scan_with_params( @@ -2337,16 +2362,21 @@ def test_batch_deduplication_large_bandit_file(self): duplicate=True, ).count() - self.assertEqual( - step4_active, - step2_active, - "Step 4 active count should equal step 2 (baseline second import)", - ) - self.assertEqual( - step4_duplicate, - step2_duplicate, - "Step 4 duplicate count should equal step 2 (baseline second import)", - ) + # Each assertion is wrapped in its own subTest so that if one fails, the others still run. + # This allows us to see all count mismatches in a single test run, making it easier to fix + # all incorrect expected values at once rather than fixing them one at a time. + with self.subTest(step=4, metric="active"): + self.assertEqual( + step4_active, + step2_active, + "Step 4 active count should equal step 2 (baseline second import)", + ) + with self.subTest(step=4, metric="duplicate"): + self.assertEqual( + step4_duplicate, + step2_duplicate, + "Step 4 duplicate count should equal step 2 (baseline second import)", + ) class ImportReimportTestUI(DojoAPITestCase, ImportReimportMixin): diff --git a/unittests/test_importers_performance.py b/unittests/test_importers_performance.py index 1bf1ab87795..217ed318ea2 100644 --- a/unittests/test_importers_performance.py +++ b/unittests/test_importers_performance.py @@ -121,72 +121,90 @@ def _import_reimport_performance( ) # First import - with ( + # Each assertion context manager is wrapped in its own subTest so that if one fails, the others still run. + # This allows us to see all count mismatches in a single test run, making it easier to fix + # all incorrect expected values at once rather than fixing them one at a time. + # Nested with statements are intentional - each assertion needs its own subTest wrapper. 
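Editor's illustration of why the nesting called out above matters: counting context managers such as assertNumQueries only raise when they exit, so each one gets its own enclosing subTest and a failure in the outer counter no longer hides the inner one. The assert_calls helper below is a hypothetical stand-in for assertNumQueries / _assertNumAsyncTask.

import unittest
from contextlib import contextmanager

class NestedCountersExample(unittest.TestCase):
    @contextmanager
    def assert_calls(self, calls, expected):
        yield
        self.assertEqual(len(calls), expected)  # raises on exit, like assertNumQueries

    def test_both_counters_are_reported(self):
        queries, tasks = [], []
        with self.subTest(metric="queries"):
            with self.assert_calls(queries, 2):
                with self.subTest(metric="tasks"):
                    with self.assert_calls(tasks, 1):
                        queries.append("SELECT 1")  # only one query; the outer check will fail
                        tasks.append("task")
        # the queries mismatch is recorded by its subTest while the tasks check still ran and passed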
+ with ( # noqa: SIM117 self.subTest("import1"), impersonate(Dojo_User.objects.get(username="admin")), - self.assertNumQueries(expected_num_queries1), - self._assertNumAsyncTask(expected_num_async_tasks1), scan_file1.open(encoding="utf-8") as scan, ): - import_options = { - "user": lead, - "lead": lead, - "scan_date": None, - "environment": environment, - "minimum_severity": "Info", - "active": True, - "verified": True, - "sync": True, - "scan_type": scan_type, - "engagement": engagement, - "tags": ["performance-test", "tag-in-param", "go-faster"], - "apply_tags_to_findings": True, - } - importer = DefaultImporter(**import_options) - test, _, _len_new_findings, _len_closed_findings, _, _, _ = importer.process_scan(scan) + with self.subTest(step="import1", metric="queries"): + with self.assertNumQueries(expected_num_queries1): + with self.subTest(step="import1", metric="async_tasks"): + with self._assertNumAsyncTask(expected_num_async_tasks1): + import_options = { + "user": lead, + "lead": lead, + "scan_date": None, + "environment": environment, + "minimum_severity": "Info", + "active": True, + "verified": True, + "sync": True, + "scan_type": scan_type, + "engagement": engagement, + "tags": ["performance-test", "tag-in-param", "go-faster"], + "apply_tags_to_findings": True, + } + importer = DefaultImporter(**import_options) + test, _, _len_new_findings, _len_closed_findings, _, _, _ = importer.process_scan(scan) # Second import (reimport) - with ( + # Each assertion context manager is wrapped in its own subTest so that if one fails, the others still run. + # This allows us to see all count mismatches in a single test run, making it easier to fix + # all incorrect expected values at once rather than fixing them one at a time. + # Nested with statements are intentional - each assertion needs its own subTest wrapper. + with ( # noqa: SIM117 self.subTest("reimport1"), impersonate(Dojo_User.objects.get(username="admin")), - self.assertNumQueries(expected_num_queries2), - self._assertNumAsyncTask(expected_num_async_tasks2), scan_file2.open(encoding="utf-8") as scan, ): - reimport_options = { - "test": test, - "user": lead, - "lead": lead, - "scan_date": None, - "minimum_severity": "Info", - "active": True, - "verified": True, - "sync": True, - "scan_type": scan_type, - "tags": ["performance-test-reimport", "reimport-tag-in-param", "reimport-go-faster"], - "apply_tags_to_findings": True, - } - reimporter = DefaultReImporter(**reimport_options) - test, _, _len_new_findings, _len_closed_findings, _, _, _ = reimporter.process_scan(scan) + with self.subTest(step="reimport1", metric="queries"): + with self.assertNumQueries(expected_num_queries2): + with self.subTest(step="reimport1", metric="async_tasks"): + with self._assertNumAsyncTask(expected_num_async_tasks2): + reimport_options = { + "test": test, + "user": lead, + "lead": lead, + "scan_date": None, + "minimum_severity": "Info", + "active": True, + "verified": True, + "sync": True, + "scan_type": scan_type, + "tags": ["performance-test-reimport", "reimport-tag-in-param", "reimport-go-faster"], + "apply_tags_to_findings": True, + } + reimporter = DefaultReImporter(**reimport_options) + test, _, _len_new_findings, _len_closed_findings, _, _, _ = reimporter.process_scan(scan) # Third import (reimport again) - with ( + # Each assertion context manager is wrapped in its own subTest so that if one fails, the others still run. 
+ # This allows us to see all count mismatches in a single test run, making it easier to fix + # all incorrect expected values at once rather than fixing them one at a time. + # Nested with statements are intentional - each assertion needs its own subTest wrapper. + with ( # noqa: SIM117 self.subTest("reimport2"), impersonate(Dojo_User.objects.get(username="admin")), - self.assertNumQueries(expected_num_queries3), - self._assertNumAsyncTask(expected_num_async_tasks3), scan_file3.open(encoding="utf-8") as scan, ): - reimport_options = { - "test": test, - "user": lead, - "lead": lead, - "scan_date": None, - "minimum_severity": "Info", - "active": True, - "verified": True, - "sync": True, - "scan_type": scan_type, - } - reimporter = DefaultReImporter(**reimport_options) - test, _, _len_new_findings, _len_closed_findings, _, _, _ = reimporter.process_scan(scan) + with self.subTest(step="reimport2", metric="queries"): + with self.assertNumQueries(expected_num_queries3): + with self.subTest(step="reimport2", metric="async_tasks"): + with self._assertNumAsyncTask(expected_num_async_tasks3): + reimport_options = { + "test": test, + "user": lead, + "lead": lead, + "scan_date": None, + "minimum_severity": "Info", + "active": True, + "verified": True, + "sync": True, + "scan_type": scan_type, + } + reimporter = DefaultReImporter(**reimport_options) + test, _, _len_new_findings, _len_closed_findings, _, _, _ = reimporter.process_scan(scan) class TestDojoImporterPerformanceSmall(TestDojoImporterPerformanceBase): @@ -358,46 +376,58 @@ def _deduplication_performance(self, expected_num_queries1, expected_num_async_t ) # First import - all findings should be new - with ( + # Each assertion context manager is wrapped in its own subTest so that if one fails, the others still run. + # This allows us to see all count mismatches in a single test run, making it easier to fix + # all incorrect expected values at once rather than fixing them one at a time. + # Nested with statements are intentional - each assertion needs its own subTest wrapper. + with ( # noqa: SIM117 self.subTest("first_import"), impersonate(Dojo_User.objects.get(username="admin")), - self.assertNumQueries(expected_num_queries1), - self._assertNumAsyncTask(expected_num_async_tasks1), STACK_HAWK_FILENAME.open(encoding="utf-8") as scan, ): - import_options = { - "user": lead, - "lead": lead, - "scan_date": None, - "environment": environment, - "minimum_severity": "Info", - "active": True, - "verified": True, - "scan_type": STACK_HAWK_SCAN_TYPE, - "engagement": engagement, - } - importer = DefaultImporter(**import_options) - _, _, len_new_findings1, len_closed_findings1, _, _, _ = importer.process_scan(scan) + with self.subTest(step="first_import", metric="queries"): + with self.assertNumQueries(expected_num_queries1): + with self.subTest(step="first_import", metric="async_tasks"): + with self._assertNumAsyncTask(expected_num_async_tasks1): + import_options = { + "user": lead, + "lead": lead, + "scan_date": None, + "environment": environment, + "minimum_severity": "Info", + "active": True, + "verified": True, + "scan_type": STACK_HAWK_SCAN_TYPE, + "engagement": engagement, + } + importer = DefaultImporter(**import_options) + _, _, len_new_findings1, len_closed_findings1, _, _, _ = importer.process_scan(scan) # Second import - all findings should be duplicates - with ( + # Each assertion context manager is wrapped in its own subTest so that if one fails, the others still run. 
+ # This allows us to see all count mismatches in a single test run, making it easier to fix + # all incorrect expected values at once rather than fixing them one at a time. + # Nested with statements are intentional - each assertion needs its own subTest wrapper. + with ( # noqa: SIM117 self.subTest("second_import"), impersonate(Dojo_User.objects.get(username="admin")), - self.assertNumQueries(expected_num_queries2), - self._assertNumAsyncTask(expected_num_async_tasks2), STACK_HAWK_FILENAME.open(encoding="utf-8") as scan, ): - import_options = { - "user": lead, - "lead": lead, - "scan_date": None, - "environment": environment, - "minimum_severity": "Info", - "active": True, - "verified": True, - "scan_type": STACK_HAWK_SCAN_TYPE, - "engagement": engagement, - } - importer = DefaultImporter(**import_options) - _, _, len_new_findings2, len_closed_findings2, _, _, _ = importer.process_scan(scan) + with self.subTest(step="second_import", metric="queries"): + with self.assertNumQueries(expected_num_queries2): + with self.subTest(step="second_import", metric="async_tasks"): + with self._assertNumAsyncTask(expected_num_async_tasks2): + import_options = { + "user": lead, + "lead": lead, + "scan_date": None, + "environment": environment, + "minimum_severity": "Info", + "active": True, + "verified": True, + "scan_type": STACK_HAWK_SCAN_TYPE, + "engagement": engagement, + } + importer = DefaultImporter(**import_options) + _, _, len_new_findings2, len_closed_findings2, _, _, _ = importer.process_scan(scan) # Log the results for analysis logger.debug(f"First import: {len_new_findings1} new findings, {len_closed_findings1} closed findings") From 6e7da13bf7729362715e39491655b5acdcaabe0b Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sat, 13 Dec 2025 13:05:23 +0100 Subject: [PATCH 04/26] reimport: add test for internal duplicates during matching --- unittests/test_import_reimport.py | 112 ++++++++++++++++++++++++++++++ 1 file changed, 112 insertions(+) diff --git a/unittests/test_import_reimport.py b/unittests/test_import_reimport.py index 2ac71c155bb..539f879adae 100644 --- a/unittests/test_import_reimport.py +++ b/unittests/test_import_reimport.py @@ -2378,6 +2378,118 @@ def test_batch_deduplication_large_bandit_file(self): "Step 4 duplicate count should equal step 2 (baseline second import)", ) + @override_settings( + IMPORT_REIMPORT_DEDUPE_BATCH_SIZE=50, + IMPORT_REIMPORT_MATCH_BATCH_SIZE=50, + ) + def test_reimport_with_internal_duplicates_same_batch(self): + """ + Test that duplicates within the same scan report are detected during reimport + even when they're processed in the same batch. + + This test mimics the integration test test_import_path_tests in tests/dedupe_test.py, + which tests the same scenario but uses Selenium UI testing. + + BACKGROUND: + When reimporting a scan report, the system uses batch processing to optimize + database queries. Findings are processed in batches (default: 1000), and for + each batch, candidate matches are fetched from the database once. However, this + creates a bug: if multiple duplicate findings exist within the same scan report + and they're processed in the same batch, they cannot match each other because + the candidate dictionary is built before any findings in the batch are saved. 
+ + THE BUG: + - Finding #1 in batch: No matches found → Creates new finding + - Finding #2 in batch: No matches found (Finding #1 not in DB yet) → Creates duplicate + - Finding #3 in batch: No matches found (Findings #1 and #2 not in DB yet) → Creates duplicate + Result: 3 findings created instead of 1 unique finding + + THIS TEST: + This test reproduces the bug by: + 1. Creating an empty test (no existing findings to match against) + 2. Reimporting dedupe_path_1.json which contains 3 duplicate findings: + - All have same file_path: "/opt/appsecpipeline/source/dojo/cred/views.py" + - All have same line_number: 524 + - All have same test_id: "B110" + - All should have the same hash_code and match each other + 3. Using small batch size (50) to ensure all 3 duplicates are in the same batch + 4. Verifying that only 1 finding is created (not 3) + + EXPECTED BEHAVIOR: + - Total findings: 1 (the 3 duplicates should match to 1 unique finding) + - Active findings: 1 + - Duplicate findings: 0 (duplicates within report should match, not be marked as duplicates) + + CURRENT BUGGY BEHAVIOR: + - Total findings: 3 (all duplicates are created because they can't match each other) + - Active findings: 1 (first one) + - Duplicate findings: 2 (second and third are marked as duplicates of first) + + This test should fail with the current buggy implementation and pass after + implementing intra-batch duplicate tracking. + """ + # Use the dedupe_path_1.json file from integration tests (has 3 duplicate findings) + # Note: This file is in tests/dedupe_scans, not unittests/scans + dedupe_path_file = Path(__file__).parent.parent / "tests" / "dedupe_scans" / "dedupe_path_1.json" + + # Create engagement and test + product_type, _ = Product_Type.objects.get_or_create(name="PT Bandit Internal Dupes") + product, _ = Product.objects.get_or_create(name="P Bandit Internal Dupes", prod_type=product_type) + engagement = Engagement.objects.create( + name="E Bandit Internal Dupes", + product=product, + target_start=timezone.now(), + target_end=timezone.now(), + ) + engagement.deduplication_on_engagement = True + engagement.save() + + # Create an empty test first + # Get or create the test type + test_type, _ = Test_Type.objects.get_or_create(name="Bandit Scan") + test = Test.objects.create( + engagement=engagement, + test_type=test_type, + title="Path Test 1", + target_start=timezone.now(), + target_end=timezone.now(), + ) + test_id = test.id + + # Now reimport the file with duplicates to the empty test + # This tests reimport batch matching where duplicates within the same report need to match each other + reimport_result = self.reimport_scan_with_params( + test_id, + dedupe_path_file, + scan_type="Bandit Scan", + ) + + test_id = reimport_result["test"] + + # Verify findings count + # dedupe_path_1.json has 3 findings that are duplicates (same file_path, line_number, test_id) + # They should all match each other, resulting in 1 unique finding + # However, if intra-batch matching fails, we'll get 3 findings created instead of 1 + total_findings = Finding.objects.filter(test_id=test_id).count() + active_findings = Finding.objects.filter(test_id=test_id, active=True, duplicate=False).count() + duplicate_findings = Finding.objects.filter(test_id=test_id, duplicate=True).count() + + # Expected: 1 active finding (the 3 duplicates should match each other) + # If intra-batch matching fails, we'll get 3 findings instead + with self.subTest(metric="total_findings"): + self.assertEqual(total_findings, 1, + f"Expected 1 total finding (3 
duplicates should match to 1), got {total_findings}. " + f"If this is 3, intra-batch duplicates weren't detected.") + + with self.subTest(metric="active_findings"): + self.assertEqual(active_findings, 1, + f"Expected 1 active finding, got {active_findings}. " + f"If this fails, intra-batch duplicates weren't detected.") + + with self.subTest(metric="duplicate_findings"): + self.assertEqual(duplicate_findings, 0, + f"Expected 0 duplicate findings (duplicates within report should match), got {duplicate_findings}") + class ImportReimportTestUI(DojoAPITestCase, ImportReimportMixin): fixtures = ["dojo_testdata.json"] From ae0bce2ae6acd31a231e818bc7faac57217773b5 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sat, 13 Dec 2025 16:51:10 +0100 Subject: [PATCH 05/26] reimport: fix handling of duplicates inside the same report/batch --- dojo/importers/default_reimporter.py | 29 +++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/dojo/importers/default_reimporter.py b/dojo/importers/default_reimporter.py index 2e426ba2fda..340f7c27cd0 100644 --- a/dojo/importers/default_reimporter.py +++ b/dojo/importers/default_reimporter.py @@ -189,8 +189,6 @@ def process_findings( self.reactivated_items = [] self.unchanged_items = [] self.group_names_to_findings_dict = {} - # Progressive batching for chord execution - # No chord: we dispatch per 1000 findings or on the final finding logger.debug(f"starting reimport of {len(parsed_findings) if parsed_findings else 0} items.") logger.debug("STEP 1: looping over findings from the reimported report and trying to match them to existing findings") @@ -244,9 +242,9 @@ def process_findings( deduplicationLogger.debug(f"unsaved finding's hash_code: {unsaved_finding.hash_code}") # Fetch all candidates for this batch at once (batch candidate finding) - candidates_by_hash = None - candidates_by_uid = None - candidates_by_key = None + candidates_by_hash = {} + candidates_by_uid = {} + candidates_by_key = {} if self.deduplication_algorithm == "hash_code": candidates_by_hash = find_candidates_for_deduplication_hash( @@ -266,6 +264,7 @@ def process_findings( # Process each finding in the batch using pre-fetched candidates for idx, unsaved_finding in enumerate(batch_findings): is_final = is_final_batch and idx == len(batch_findings) - 1 + # Match any findings to this new one coming in using pre-fetched candidates matched_findings = self.match_finding_for_reimport( unsaved_finding, @@ -297,6 +296,26 @@ def process_findings( ) else: finding = self.process_finding_that_was_not_matched(unsaved_finding) + + # Add newly created finding to candidates for subsequent findings in this batch + if finding: + if finding.hash_code: + candidates_by_hash.setdefault(finding.hash_code, []).append(finding) + deduplicationLogger.debug( + f"Added finding {finding.id} (hash_code: {finding.hash_code}) to candidates for next findings in this report", + ) + if finding.unique_id_from_tool: + candidates_by_uid.setdefault(finding.unique_id_from_tool, []).append(finding) + deduplicationLogger.debug( + f"Added finding {finding.id} (unique_id_from_tool: {finding.unique_id_from_tool}) to candidates for next findings in this report", + ) + if finding.title: + legacy_key = (finding.title.lower(), finding.severity) + candidates_by_key.setdefault(legacy_key, []).append(finding) + deduplicationLogger.debug( + f"Added finding {finding.id} (title: {finding.title}, severity: {finding.severity}) to candidates for next findings in this report", + ) + # This condition 
__appears__ to always be true, but am afraid to remove it if finding: # Process the rest of the items on the finding From a427db7c5b5d8a42c25b6e72dc4bf6e8f89b6a21 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sat, 13 Dec 2025 18:24:01 +0100 Subject: [PATCH 06/26] add id --- dojo/management/commands/list_top_tests.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/dojo/management/commands/list_top_tests.py b/dojo/management/commands/list_top_tests.py index 839a1dc03fd..7666d707101 100644 --- a/dojo/management/commands/list_top_tests.py +++ b/dojo/management/commands/list_top_tests.py @@ -35,6 +35,10 @@ def handle(self, *args, **options): return # Calculate column widths + max_test_id_len = max( + (len(str(test.id)) for test in tests), + default=8, + ) max_product_len = max( (len(str(test.engagement.product.name)) for test in tests), default=20, @@ -57,6 +61,7 @@ def handle(self, *args, **options): ) # Ensure minimum widths for readability + max_test_id_len = max(max_test_id_len, 8) max_product_len = max(max_product_len, 20) max_engagement_len = max(max_engagement_len, 20) max_test_len = max(max_test_len, 20) @@ -65,6 +70,7 @@ def handle(self, *args, **options): # Header header = ( + f"{'Test ID':<{max_test_id_len}} | " f"{'Product':<{max_product_len}} | " f"{'Engagement':<{max_engagement_len}} | " f"{'Test':<{max_test_len}} | " @@ -81,6 +87,7 @@ def handle(self, *args, **options): # Data rows for test in tests: + test_id = str(test.id) product_name = str(test.engagement.product.name) engagement_name = str(test.engagement.name) test_name = str(test.title or f"Test #{test.id}") @@ -91,6 +98,7 @@ def handle(self, *args, **options): duplicate = test.duplicate_findings row = ( + f"{test_id:<{max_test_id_len}} | " f"{product_name:<{max_product_len}} | " f"{engagement_name:<{max_engagement_len}} | " f"{test_name:<{max_test_len}} | " From c5c7a0d8f9a6cc78548884b6da1a73d99941a2d4 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sat, 13 Dec 2025 18:49:00 +0100 Subject: [PATCH 07/26] reimport: optimize vulnerability_id processing --- dojo/finding/deduplication.py | 2 +- dojo/importers/base_importer.py | 27 ++++++++++++++++- unittests/test_importers_performance.py | 40 ++++++++++++------------- 3 files changed, 47 insertions(+), 22 deletions(-) diff --git a/dojo/finding/deduplication.py b/dojo/finding/deduplication.py index b51c791c199..b397add4efd 100644 --- a/dojo/finding/deduplication.py +++ b/dojo/finding/deduplication.py @@ -255,7 +255,7 @@ def build_candidate_scope_queryset(test, mode="deduplication", service=None): return ( queryset .select_related("test", "test__engagement", "test__test_type") - .prefetch_related("endpoints") + .prefetch_related("endpoints", "vulnerability_id_set") ) diff --git a/dojo/importers/base_importer.py b/dojo/importers/base_importer.py index 096161db3fa..ab10f1128ca 100644 --- a/dojo/importers/base_importer.py +++ b/dojo/importers/base_importer.py @@ -793,9 +793,34 @@ def process_vulnerability_ids( ) -> Finding: """ Parse the `unsaved_vulnerability_ids` field from findings after they are parsed - to create `Vulnerability_Id` objects with the finding associated correctly + to create `Vulnerability_Id` objects with the finding associated correctly. + Only updates if vulnerability_ids have changed to avoid unnecessary database operations. 
+ + Args: + finding: The finding to process vulnerability IDs for + + Returns: + The finding object + """ if finding.unsaved_vulnerability_ids: + # Only check for changes if the finding has been saved and might have existing vulnerability_ids + # For new findings, we always need to save vulnerability_ids + if finding.pk and finding.vulnerability_id_set.exists(): + # Check if vulnerability_ids have changed before updating + # Get existing vulnerability IDs from the database + existing_vuln_ids = set(finding.vulnerability_ids) if finding.vulnerability_ids else set() + # Normalize the new vulnerability IDs (remove duplicates for comparison) + new_vuln_ids = set(finding.unsaved_vulnerability_ids) + + # Only update if vulnerability IDs have changed + if existing_vuln_ids == new_vuln_ids: + logger.debug( + f"Skipping vulnerability_ids update for finding {finding.id} - " + f"vulnerability_ids unchanged: {sorted(existing_vuln_ids)}", + ) + return finding + # Remove old vulnerability ids - keeping this call only because of flake8 Vulnerability_Id.objects.filter(finding=finding).delete() diff --git a/unittests/test_importers_performance.py b/unittests/test_importers_performance.py index 217ed318ea2..c422140a433 100644 --- a/unittests/test_importers_performance.py +++ b/unittests/test_importers_performance.py @@ -244,9 +244,9 @@ def test_import_reimport_reimport_performance_async(self): self._import_reimport_performance( expected_num_queries1=340, expected_num_async_tasks1=7, - expected_num_queries2=274, + expected_num_queries2=275, expected_num_async_tasks2=18, - expected_num_queries3=162, + expected_num_queries3=163, expected_num_async_tasks3=17, ) @@ -262,9 +262,9 @@ def test_import_reimport_reimport_performance_pghistory_async(self): self._import_reimport_performance( expected_num_queries1=306, expected_num_async_tasks1=7, - expected_num_queries2=267, + expected_num_queries2=268, expected_num_async_tasks2=18, - expected_num_queries3=157, + expected_num_queries3=158, expected_num_async_tasks3=17, ) @@ -284,11 +284,11 @@ def test_import_reimport_reimport_performance_no_async(self): testuser.usercontactinfo.block_execution = True testuser.usercontactinfo.save() self._import_reimport_performance( - expected_num_queries1=345, + expected_num_queries1=346, expected_num_async_tasks1=6, - expected_num_queries2=279, + expected_num_queries2=281, expected_num_async_tasks2=17, - expected_num_queries3=167, + expected_num_queries3=169, expected_num_async_tasks3=16, ) @@ -306,11 +306,11 @@ def test_import_reimport_reimport_performance_pghistory_no_async(self): testuser.usercontactinfo.save() self._import_reimport_performance( - expected_num_queries1=311, + expected_num_queries1=312, expected_num_async_tasks1=6, - expected_num_queries2=272, + expected_num_queries2=274, expected_num_async_tasks2=17, - expected_num_queries3=162, + expected_num_queries3=164, expected_num_async_tasks3=16, ) @@ -332,11 +332,11 @@ def test_import_reimport_reimport_performance_no_async_with_product_grading(self self.system_settings(enable_product_grade=True) self._import_reimport_performance( - expected_num_queries1=347, + expected_num_queries1=348, expected_num_async_tasks1=8, - expected_num_queries2=281, + expected_num_queries2=283, expected_num_async_tasks2=19, - expected_num_queries3=169, + expected_num_queries3=171, expected_num_async_tasks3=18, ) @@ -355,11 +355,11 @@ def test_import_reimport_reimport_performance_pghistory_no_async_with_product_gr self.system_settings(enable_product_grade=True) self._import_reimport_performance( - 
expected_num_queries1=313, + expected_num_queries1=314, expected_num_async_tasks1=8, - expected_num_queries2=274, + expected_num_queries2=276, expected_num_async_tasks2=19, - expected_num_queries3=164, + expected_num_queries3=166, expected_num_async_tasks3=18, ) @@ -519,9 +519,9 @@ def test_deduplication_performance_no_async(self): testuser.usercontactinfo.save() self._deduplication_performance( - expected_num_queries1=316, + expected_num_queries1=317, expected_num_async_tasks1=7, - expected_num_queries2=287, + expected_num_queries2=288, expected_num_async_tasks2=7, ) @@ -539,8 +539,8 @@ def test_deduplication_performance_pghistory_no_async(self): testuser.usercontactinfo.save() self._deduplication_performance( - expected_num_queries1=280, + expected_num_queries1=281, expected_num_async_tasks1=7, - expected_num_queries2=250, + expected_num_queries2=251, expected_num_async_tasks2=7, ) From 1e78c9a73430530c715aec331d040aa6385385a3 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 14 Dec 2025 00:12:15 +0100 Subject: [PATCH 08/26] vulnerability_id processing: remove duplicate delete, fix bug, add tests --- dojo/finding/helper.py | 9 +- dojo/importers/base_importer.py | 54 +++--- dojo/importers/default_reimporter.py | 17 +- ...k_all_fields_different_ids_fabricated.json | 167 ++++++++++++++++++ .../check_all_fields_no_ids_fabricated.json | 122 +++++++++++++ .../check_all_fields_with_ids_fabricated.json | 158 +++++++++++++++++ unittests/test_import_reimport.py | 75 ++++++++ unittests/test_importers_importer.py | 85 ++++++++- 8 files changed, 654 insertions(+), 33 deletions(-) create mode 100644 unittests/scans/anchore_grype/check_all_fields_different_ids_fabricated.json create mode 100644 unittests/scans/anchore_grype/check_all_fields_no_ids_fabricated.json create mode 100644 unittests/scans/anchore_grype/check_all_fields_with_ids_fabricated.json diff --git a/dojo/finding/helper.py b/dojo/finding/helper.py index 277609d3153..b48692a4ebd 100644 --- a/dojo/finding/helper.py +++ b/dojo/finding/helper.py @@ -762,12 +762,15 @@ def add_endpoints(new_finding, form): endpoint=endpoint, defaults={"date": form.cleaned_data["date"] or timezone.now()}) -def save_vulnerability_ids(finding, vulnerability_ids): +def save_vulnerability_ids(finding, vulnerability_ids, *, delete_existing: bool = True): # Remove duplicates vulnerability_ids = list(dict.fromkeys(vulnerability_ids)) - # Remove old vulnerability ids - Vulnerability_Id.objects.filter(finding=finding).delete() + # Remove old vulnerability ids if requested + # Callers can set delete_existing=False when they know there are no existing IDs + # to avoid an unnecessary delete query (e.g., for new findings) + if delete_existing: + Vulnerability_Id.objects.filter(finding=finding).delete() # Save new vulnerability ids # Using bulk create throws Django 50 warnings about unsaved models... diff --git a/dojo/importers/base_importer.py b/dojo/importers/base_importer.py index ab10f1128ca..6933a2a28e4 100644 --- a/dojo/importers/base_importer.py +++ b/dojo/importers/base_importer.py @@ -31,7 +31,6 @@ Test_Import, Test_Import_Finding_Action, Test_Type, - Vulnerability_Id, ) from dojo.notifications.helper import create_notification from dojo.tag_utils import bulk_add_tags_to_instances @@ -796,36 +795,41 @@ def process_vulnerability_ids( to create `Vulnerability_Id` objects with the finding associated correctly. Only updates if vulnerability_ids have changed to avoid unnecessary database operations. 
+ Note: The finding parameter can be: + - A new finding (from import) with unsaved_vulnerability_ids set from the parsed report + - An existing finding (from reimport) with unsaved_vulnerability_ids copied from the + finding_from_report before calling this method + Args: - finding: The finding to process vulnerability IDs for + finding: The finding to process vulnerability IDs for. + For reimports, this is the existing finding with unsaved_vulnerability_ids + set from the import/reimport report. Returns: The finding object """ - if finding.unsaved_vulnerability_ids: - # Only check for changes if the finding has been saved and might have existing vulnerability_ids - # For new findings, we always need to save vulnerability_ids - if finding.pk and finding.vulnerability_id_set.exists(): - # Check if vulnerability_ids have changed before updating - # Get existing vulnerability IDs from the database - existing_vuln_ids = set(finding.vulnerability_ids) if finding.vulnerability_ids else set() - # Normalize the new vulnerability IDs (remove duplicates for comparison) - new_vuln_ids = set(finding.unsaved_vulnerability_ids) - - # Only update if vulnerability IDs have changed - if existing_vuln_ids == new_vuln_ids: - logger.debug( - f"Skipping vulnerability_ids update for finding {finding.id} - " - f"vulnerability_ids unchanged: {sorted(existing_vuln_ids)}", - ) - return finding - - # Remove old vulnerability ids - keeping this call only because of flake8 - Vulnerability_Id.objects.filter(finding=finding).delete() - - # user the helper function - finding_helper.save_vulnerability_ids(finding, finding.unsaved_vulnerability_ids) + # Normalize to empty list if None - always process vulnerability IDs + vulnerability_ids_to_process = finding.unsaved_vulnerability_ids or [] + + # For existing findings, check if there are changes before updating + if finding.pk and len(finding.vulnerability_id_set.all()) > 0: + # Get existing vulnerability IDs from the database + existing_vuln_ids = set(finding.vulnerability_ids) if finding.vulnerability_ids else set() + # Normalize the new vulnerability IDs (remove duplicates for comparison) + new_vuln_ids = set(vulnerability_ids_to_process) + + # Only update if vulnerability IDs have changed + if existing_vuln_ids == new_vuln_ids: + logger.debug( + f"Skipping vulnerability_ids update for finding {finding.id} - " + f"vulnerability_ids unchanged: {sorted(existing_vuln_ids)}", + ) + return finding + + # Use the helper function which handles deletion and creation + # Always use delete_existing=True - it's a no-op if there are no existing IDs + finding_helper.save_vulnerability_ids(finding, vulnerability_ids_to_process, delete_existing=True) return finding diff --git a/dojo/importers/default_reimporter.py b/dojo/importers/default_reimporter.py index 340f7c27cd0..1b8747f7083 100644 --- a/dojo/importers/default_reimporter.py +++ b/dojo/importers/default_reimporter.py @@ -851,7 +851,7 @@ def finding_post_processing( self, finding: Finding, finding_from_report: Finding, - ) -> None: + ) -> Finding: """ Save all associated objects to the finding after it has been saved for the purpose of foreign key restrictions @@ -871,10 +871,19 @@ def finding_post_processing( finding.unsaved_files = finding_from_report.unsaved_files self.process_files(finding) # Process vulnerability IDs - if finding_from_report.unsaved_vulnerability_ids: - finding.unsaved_vulnerability_ids = finding_from_report.unsaved_vulnerability_ids + # Copy unsaved_vulnerability_ids from the report finding to the existing 
finding + # so process_vulnerability_ids can process them (see its docstring for details) + # Always set it (even if empty list) so we can clear existing IDs when report has none + finding.unsaved_vulnerability_ids = finding_from_report.unsaved_vulnerability_ids or [] + # Store the current cve value to check if it changes + old_cve = finding.cve # legacy cve field has already been processed/set earlier - return self.process_vulnerability_ids(finding) + finding = self.process_vulnerability_ids(finding) + # Save the finding only if the cve field was changed by save_vulnerability_ids + # This is temporary as the cve field will be phased out + if finding.cve != old_cve: + finding.save() + return finding def process_groups_for_all_findings( self, diff --git a/unittests/scans/anchore_grype/check_all_fields_different_ids_fabricated.json b/unittests/scans/anchore_grype/check_all_fields_different_ids_fabricated.json new file mode 100644 index 00000000000..8520e8828dc --- /dev/null +++ b/unittests/scans/anchore_grype/check_all_fields_different_ids_fabricated.json @@ -0,0 +1,167 @@ +{ + "matches": [ + { + "vulnerability": { + "id": "GHSA-v6rh-hp5x-86rv", + "dataSource": "https://github.com/advisories/GHSA-v6rh-hp5x-86rv", + "namespace": "github:python", + "severity": "High", + "urls": [ + "https://github.com/advisories/GHSA-v6rh-hp5x-86rv" + ], + "description": "Potential bypass of an upstream access control based on URL paths in Django", + "cvss": [], + "fix": { + "versions": [ + "3.2.10" + ], + "state": "fixed" + }, + "advisories": [] + }, + "relatedVulnerabilities": [ + { + "id": "CVE-2021-1234", + "dataSource": "https://nvd.nist.gov/vuln/detail/CVE-2021-1234", + "namespace": "nvd", + "severity": "High", + "urls": [ + "https://example.com/cve-2021-1234" + ], + "description": "A different CVE for testing vulnerability ID changes", + "cvss": [ + { + "version": "3.1", + "vector": "CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H", + "metrics": { + "baseScore": 9.8, + "exploitabilityScore": 3.9, + "impactScore": 5.9 + }, + "vendorMetadata": {} + } + ] + }, + { + "id": "CVE-2021-5678", + "dataSource": "https://nvd.nist.gov/vuln/detail/CVE-2021-5678", + "namespace": "nvd", + "severity": "Medium", + "urls": [ + "https://example.com/cve-2021-5678" + ], + "description": "Another different CVE for testing vulnerability ID changes", + "cvss": [ + { + "version": "3.1", + "vector": "CVSS:3.1/AV:N/AC:L/PR:L/UI:N/S:U/C:L/I:L/A:L", + "metrics": { + "baseScore": 6.3, + "exploitabilityScore": 3.9, + "impactScore": 2.4 + }, + "vendorMetadata": {} + } + ] + } + ], + "matchDetails": [ + { + "matcher": "python-matcher", + "searchedBy": { + "language": "python", + "namespace": "github:python" + }, + "found": { + "versionConstraint": ">=3.2,<3.2.10 (python)" + } + } + ], + "artifact": { + "name": "Django", + "version": "3.2.9", + "type": "python", + "locations": [ + { + "path": "/usr/local/lib/python3.8/site-packages/Django-3.2.9.dist-info/METADATA", + "layerID": "sha256:b1d4455cf82b15a50b006fe87bd29f694c8f9155456253eb67fdd155b5edcf4a" + } + ], + "language": "python", + "licenses": [ + "BSD-3-Clause" + ], + "cpes": [ + "cpe:2.3:a:django_software_foundation:Django:3.2.9:*:*:*:*:*:*:*" + ], + "purl": "pkg:pypi/Django@3.2.9", + "metadata": null + } + } + ], + "source": { + "type": "image", + "target": { + "userInput": "vulnerable-image:latest", + "imageID": "sha256:ce9898fd214aef9c994a42624b09056bdce3ff4a8e3f68dc242d967b80fcbeee", + "manifestDigest": "sha256:9d8825ab20ac86b40eb71495bece1608a302fb180384740697a28c2b0a5a0fc6", + 
"mediaType": "application/vnd.docker.distribution.manifest.v2+json", + "tags": [ + "vulnerable-image:latest" + ], + "imageSize": 707381791, + "layers": [] + } + }, + "distro": { + "name": "debian", + "version": "10", + "idLike": "" + }, + "descriptor": { + "name": "grype", + "version": "0.28.0", + "configuration": { + "configPath": "", + "output": "json", + "file": "", + "output-template-file": "", + "quiet": false, + "check-for-app-update": true, + "only-fixed": false, + "scope": "Squashed", + "log": { + "structured": false, + "level": "", + "file": "" + }, + "db": { + "cache-dir": "/home/user/.cache/grype/db", + "update-url": "https://toolbox-data.anchore.io/grype/databases/listing.json", + "ca-cert": "", + "auto-update": true, + "validate-by-hash-on-start": false + }, + "dev": { + "profile-cpu": false, + "profile-mem": false + }, + "fail-on-severity": "", + "registry": { + "insecure-skip-tls-verify": false, + "insecure-use-http": false, + "auth": [] + }, + "ignore": null, + "exclude": [] + }, + "db": { + "built": "2021-12-24T08:14:02Z", + "schemaVersion": 3, + "location": "/home/user/.cache/grype/db/3", + "checksum": "sha256:6c4777e1acea787e5335ccee6b5e4562cd1767b9cca138c07e0802efb2a74162", + "error": null + } + } + } + diff --git a/unittests/scans/anchore_grype/check_all_fields_no_ids_fabricated.json b/unittests/scans/anchore_grype/check_all_fields_no_ids_fabricated.json new file mode 100644 index 00000000000..d4371d3a478 --- /dev/null +++ b/unittests/scans/anchore_grype/check_all_fields_no_ids_fabricated.json @@ -0,0 +1,122 @@ +{ + "matches": [ + { + "vulnerability": { + "id": "GHSA-v6rh-hp5x-86rv", + "dataSource": "https://github.com/advisories/GHSA-v6rh-hp5x-86rv", + "namespace": "github:python", + "severity": "High", + "urls": [ + "https://github.com/advisories/GHSA-v6rh-hp5x-86rv" + ], + "description": "Potential bypass of an upstream access control based on URL paths in Django", + "cvss": [], + "fix": { + "versions": [ + "3.2.10" + ], + "state": "fixed" + }, + "advisories": [] + }, + "relatedVulnerabilities": [], + "matchDetails": [ + { + "matcher": "python-matcher", + "searchedBy": { + "language": "python", + "namespace": "github:python" + }, + "found": { + "versionConstraint": ">=3.2,<3.2.10 (python)" + } + } + ], + "artifact": { + "name": "Django", + "version": "3.2.9", + "type": "python", + "locations": [ + { + "path": "/usr/local/lib/python3.8/site-packages/Django-3.2.9.dist-info/METADATA", + "layerID": "sha256:b1d4455cf82b15a50b006fe87bd29f694c8f9155456253eb67fdd155b5edcf4a" + } + ], + "language": "python", + "licenses": [ + "BSD-3-Clause" + ], + "cpes": [ + "cpe:2.3:a:django_software_foundation:Django:3.2.9:*:*:*:*:*:*:*" + ], + "purl": "pkg:pypi/Django@3.2.9", + "metadata": null + } + } + ], + "source": { + "type": "image", + "target": { + "userInput": "vulnerable-image:latest", + "imageID": "sha256:ce9898fd214aef9c994a42624b09056bdce3ff4a8e3f68dc242d967b80fcbeee", + "manifestDigest": "sha256:9d8825ab20ac86b40eb71495bece1608a302fb180384740697a28c2b0a5a0fc6", + "mediaType": "application/vnd.docker.distribution.manifest.v2+json", + "tags": [ + "vulnerable-image:latest" + ], + "imageSize": 707381791, + "layers": [] + } + }, + "distro": { + "name": "debian", + "version": "10", + "idLike": "" + }, + "descriptor": { + "name": "grype", + "version": "0.28.0", + "configuration": { + "configPath": "", + "output": "json", + "file": "", + "output-template-file": "", + "quiet": false, + "check-for-app-update": true, + "only-fixed": false, + "scope": "Squashed", + "log": { + 
"structured": false, + "level": "", + "file": "" + }, + "db": { + "cache-dir": "/home/user/.cache/grype/db", + "update-url": "https://toolbox-data.anchore.io/grype/databases/listing.json", + "ca-cert": "", + "auto-update": true, + "validate-by-hash-on-start": false + }, + "dev": { + "profile-cpu": false, + "profile-mem": false + }, + "fail-on-severity": "", + "registry": { + "insecure-skip-tls-verify": false, + "insecure-use-http": false, + "auth": [] + }, + "ignore": null, + "exclude": [] + }, + "db": { + "built": "2021-12-24T08:14:02Z", + "schemaVersion": 3, + "location": "/home/user/.cache/grype/db/3", + "checksum": "sha256:6c4777e1acea787e5335ccee6b5e4562cd1767b9cca138c07e0802efb2a74162", + "error": null + } + } + } + diff --git a/unittests/scans/anchore_grype/check_all_fields_with_ids_fabricated.json b/unittests/scans/anchore_grype/check_all_fields_with_ids_fabricated.json new file mode 100644 index 00000000000..028bd2c7b7b --- /dev/null +++ b/unittests/scans/anchore_grype/check_all_fields_with_ids_fabricated.json @@ -0,0 +1,158 @@ +{ + "matches": [ + { + "vulnerability": { + "id": "GHSA-v6rh-hp5x-86rv", + "dataSource": "https://github.com/advisories/GHSA-v6rh-hp5x-86rv", + "namespace": "github:python", + "severity": "High", + "urls": [ + "https://github.com/advisories/GHSA-v6rh-hp5x-86rv" + ], + "description": "Potential bypass of an upstream access control based on URL paths in Django", + "cvss": [], + "fix": { + "versions": [ + "3.2.10" + ], + "state": "fixed" + }, + "advisories": [] + }, + "relatedVulnerabilities": [ + { + "id": "CVE-2021-44420", + "dataSource": "https://nvd.nist.gov/vuln/detail/CVE-2021-44420", + "namespace": "nvd", + "severity": "High", + "urls": [ + "https://docs.djangoproject.com/en/3.2/releases/security/", + "https://www.openwall.com/lists/oss-security/2021/12/07/1", + "https://www.djangoproject.com/weblog/2021/dec/07/security-releases/", + "https://groups.google.com/forum/#!forum/django-announce" + ], + "description": "In Django 2.2 before 2.2.25, 3.1 before 3.1.14, and 3.2 before 3.2.10, HTTP requests for URLs with trailing newlines could bypass upstream access control based on URL paths.", + "cvss": [ + { + "version": "2.0", + "vector": "AV:N/AC:L/Au:N/C:P/I:P/A:P", + "metrics": { + "baseScore": 7.5, + "exploitabilityScore": 10, + "impactScore": 6.4 + }, + "vendorMetadata": {} + }, + { + "version": "3.1", + "vector": "CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:L/I:L/A:L", + "metrics": { + "baseScore": 7.3, + "exploitabilityScore": 3.9, + "impactScore": 3.4 + }, + "vendorMetadata": {} + } + ] + } + ], + "matchDetails": [ + { + "matcher": "python-matcher", + "searchedBy": { + "language": "python", + "namespace": "github:python" + }, + "found": { + "versionConstraint": ">=3.2,<3.2.10 (python)" + } + } + ], + "artifact": { + "name": "Django", + "version": "3.2.9", + "type": "python", + "locations": [ + { + "path": "/usr/local/lib/python3.8/site-packages/Django-3.2.9.dist-info/METADATA", + "layerID": "sha256:b1d4455cf82b15a50b006fe87bd29f694c8f9155456253eb67fdd155b5edcf4a" + } + ], + "language": "python", + "licenses": [ + "BSD-3-Clause" + ], + "cpes": [ + "cpe:2.3:a:django_software_foundation:Django:3.2.9:*:*:*:*:*:*:*" + ], + "purl": "pkg:pypi/Django@3.2.9", + "metadata": null + } + } + ], + "source": { + "type": "image", + "target": { + "userInput": "vulnerable-image:latest", + "imageID": "sha256:ce9898fd214aef9c994a42624b09056bdce3ff4a8e3f68dc242d967b80fcbeee", + "manifestDigest": "sha256:9d8825ab20ac86b40eb71495bece1608a302fb180384740697a28c2b0a5a0fc6", + 
"mediaType": "application/vnd.docker.distribution.manifest.v2+json", + "tags": [ + "vulnerable-image:latest" + ], + "imageSize": 707381791, + "layers": [] + } + }, + "distro": { + "name": "debian", + "version": "10", + "idLike": "" + }, + "descriptor": { + "name": "grype", + "version": "0.28.0", + "configuration": { + "configPath": "", + "output": "json", + "file": "", + "output-template-file": "", + "quiet": false, + "check-for-app-update": true, + "only-fixed": false, + "scope": "Squashed", + "log": { + "structured": false, + "level": "", + "file": "" + }, + "db": { + "cache-dir": "/home/user/.cache/grype/db", + "update-url": "https://toolbox-data.anchore.io/grype/databases/listing.json", + "ca-cert": "", + "auto-update": true, + "validate-by-hash-on-start": false + }, + "dev": { + "profile-cpu": false, + "profile-mem": false + }, + "fail-on-severity": "", + "registry": { + "insecure-skip-tls-verify": false, + "insecure-use-http": false, + "auth": [] + }, + "ignore": null, + "exclude": [] + }, + "db": { + "built": "2021-12-24T08:14:02Z", + "schemaVersion": 3, + "location": "/home/user/.cache/grype/db/3", + "checksum": "sha256:6c4777e1acea787e5335ccee6b5e4562cd1767b9cca138c07e0802efb2a74162", + "error": null + } + } + } + diff --git a/unittests/test_import_reimport.py b/unittests/test_import_reimport.py index 539f879adae..ed00e015189 100644 --- a/unittests/test_import_reimport.py +++ b/unittests/test_import_reimport.py @@ -102,6 +102,9 @@ def __init__(self, *args, **kwargs): self.anchore_grype_file_name = get_unit_tests_scans_path("anchore_grype") / "check_all_fields.json" self.anchore_grype_file_name_fix_not_available = get_unit_tests_scans_path("anchore_grype") / "fix_not_available.json" self.anchore_grype_file_name_fix_available = get_unit_tests_scans_path("anchore_grype") / "fix_available.json" + self.anchore_grype_file_name_with_ids_fabricated = get_unit_tests_scans_path("anchore_grype") / "check_all_fields_with_ids_fabricated.json" + self.anchore_grype_file_name_no_ids_fabricated = get_unit_tests_scans_path("anchore_grype") / "check_all_fields_no_ids_fabricated.json" + self.anchore_grype_file_name_different_ids_fabricated = get_unit_tests_scans_path("anchore_grype") / "check_all_fields_different_ids_fabricated.json" self.anchore_grype_scan_type = "Anchore Grype" self.checkmarx_one_open_and_false_positive = get_unit_tests_scans_path("checkmarx_one") / "one-open-one-false-positive.json" @@ -1696,6 +1699,78 @@ def test_import_reimport_vulnerability_ids(self): self.assertEqual("GHSA-v6rh-hp5x-86rv", findings[3].vulnerability_ids[0]) self.assertEqual("CVE-2021-44420", findings[3].vulnerability_ids[1]) + def test_import_reimport_clear_vulnerability_ids(self): + """Test that vulnerability IDs are cleared when reimporting with no IDs""" + # Import scan with vulnerability IDs + import0 = self.import_scan_with_params(self.anchore_grype_file_name_with_ids_fabricated, scan_type=self.anchore_grype_scan_type) + + test_id = import0["test"] + test = Test.objects.get(id=test_id) + findings = Finding.objects.filter(test=test) + self.assertEqual(1, len(findings)) + finding = findings[0] + self.assertEqual("GHSA-v6rh-hp5x-86rv", finding.cve) + self.assertEqual(2, len(finding.vulnerability_ids)) + self.assertEqual("GHSA-v6rh-hp5x-86rv", finding.vulnerability_ids[0]) + self.assertEqual("CVE-2021-44420", finding.vulnerability_ids[1]) + + # Reimport with no vulnerability IDs - should clear existing IDs + test_type = Test_Type.objects.get(name=self.anchore_grype_scan_type) + reimport_test = Test( + 
engagement=test.engagement, + test_type=test_type, + scan_type=self.anchore_grype_scan_type, + target_start=datetime.now(timezone.get_current_timezone()), + target_end=datetime.now(timezone.get_current_timezone()), + ) + reimport_test.save() + + self.reimport_scan_with_params(reimport_test.id, self.anchore_grype_file_name_no_ids_fabricated, scan_type=self.anchore_grype_scan_type) + findings = Finding.objects.filter(test=reimport_test) + self.assertEqual(1, len(findings)) + finding = findings[0] + # After clearing, only the primary vuln_id should remain (GHSA-v6rh-hp5x-86rv) + # because the parser always includes the primary vulnerability.id + self.assertEqual("GHSA-v6rh-hp5x-86rv", finding.cve) + self.assertEqual(1, len(finding.vulnerability_ids)) + self.assertEqual("GHSA-v6rh-hp5x-86rv", finding.vulnerability_ids[0]) + + def test_import_reimport_change_vulnerability_ids(self): + """Test that vulnerability IDs are updated when reimporting with different IDs""" + # Import scan with initial vulnerability IDs + import0 = self.import_scan_with_params(self.anchore_grype_file_name_with_ids_fabricated, scan_type=self.anchore_grype_scan_type) + + test_id = import0["test"] + test = Test.objects.get(id=test_id) + findings = Finding.objects.filter(test=test) + self.assertEqual(1, len(findings)) + finding = findings[0] + self.assertEqual("GHSA-v6rh-hp5x-86rv", finding.cve) + self.assertEqual(2, len(finding.vulnerability_ids)) + self.assertEqual("GHSA-v6rh-hp5x-86rv", finding.vulnerability_ids[0]) + self.assertEqual("CVE-2021-44420", finding.vulnerability_ids[1]) + + # Reimport with different vulnerability IDs - should update existing IDs + test_type = Test_Type.objects.get(name=self.anchore_grype_scan_type) + reimport_test = Test( + engagement=test.engagement, + test_type=test_type, + scan_type=self.anchore_grype_scan_type, + target_start=datetime.now(timezone.get_current_timezone()), + target_end=datetime.now(timezone.get_current_timezone()), + ) + reimport_test.save() + + self.reimport_scan_with_params(reimport_test.id, self.anchore_grype_file_name_different_ids_fabricated, scan_type=self.anchore_grype_scan_type) + findings = Finding.objects.filter(test=reimport_test) + self.assertEqual(1, len(findings)) + finding = findings[0] + self.assertEqual("GHSA-v6rh-hp5x-86rv", finding.cve) + self.assertEqual(3, len(finding.vulnerability_ids)) + self.assertEqual("GHSA-v6rh-hp5x-86rv", finding.vulnerability_ids[0]) + self.assertEqual("CVE-2021-1234", finding.vulnerability_ids[1]) + self.assertEqual("CVE-2021-5678", finding.vulnerability_ids[2]) + def test_import_reimport_fix_available(self): import0 = self.import_scan_with_params(self.anchore_grype_file_name_fix_not_available, scan_type=self.anchore_grype_scan_type) test_id = import0["test"] diff --git a/unittests/test_importers_importer.py b/unittests/test_importers_importer.py index cc5fb342df7..e2c8c8fc757 100644 --- a/unittests/test_importers_importer.py +++ b/unittests/test_importers_importer.py @@ -7,7 +7,16 @@ from rest_framework.test import APIClient from dojo.importers.default_importer import DefaultImporter -from dojo.models import Development_Environment, Engagement, Finding, Product, Product_Type, Test, User +from dojo.models import ( + Development_Environment, + Engagement, + Finding, + Product, + Product_Type, + Test, + User, + Vulnerability_Id, +) from dojo.tools.gitlab_sast.parser import GitlabSastParser from dojo.tools.sarif.parser import SarifParser from dojo.utils import get_object_or_none @@ -614,3 +623,77 @@ def 
test_no_handle_vulnerability_ids_references_and_no_cve(self, mock): self.assertEqual(finding.unsaved_vulnerability_ids, None) self.assertEqual(finding.vulnerability_ids, []) finding.delete() + + def test_clear_vulnerability_ids_on_empty_list(self): + """Test that vulnerability IDs are cleared when an empty list is provided""" + # Create a finding with existing vulnerability IDs + finding = Finding() + finding.test = self.test + finding.reporter = self.testuser + finding.save() + + # Add some vulnerability IDs + Vulnerability_Id.objects.create(finding=finding, vulnerability_id="CVE-2020-1234") + Vulnerability_Id.objects.create(finding=finding, vulnerability_id="CVE-2020-5678") + finding.cve = "CVE-2020-1234" + finding.save() + + # Verify initial state + self.assertEqual(2, len(finding.vulnerability_ids)) + self.assertEqual("CVE-2020-1234", finding.cve) + + # Process with empty list - should clear all IDs + finding.unsaved_vulnerability_ids = [] + DefaultImporter(**self.importer_data).process_vulnerability_ids(finding) + # Save the finding to persist the cve=None change + finding.save() + + # Get fresh finding from database to avoid cached property issues + finding = Finding.objects.get(pk=finding.pk) + + # Verify IDs are cleared + self.assertEqual(0, len(finding.vulnerability_ids)) + self.assertEqual(None, finding.cve) + # Verify no Vulnerability_Id objects exist for this finding + self.assertEqual(0, Vulnerability_Id.objects.filter(finding=finding).count()) + finding.delete() + + def test_change_vulnerability_ids_on_reimport(self): + """Test that vulnerability IDs are updated when different IDs are provided""" + # Create a finding with existing vulnerability IDs + finding = Finding() + finding.test = self.test + finding.reporter = self.testuser + finding.save() + + # Add initial vulnerability IDs + Vulnerability_Id.objects.create(finding=finding, vulnerability_id="CVE-2020-1234") + Vulnerability_Id.objects.create(finding=finding, vulnerability_id="CVE-2020-5678") + finding.cve = "CVE-2020-1234" + finding.save() + + # Verify initial state + self.assertEqual(2, len(finding.vulnerability_ids)) + self.assertEqual("CVE-2020-1234", finding.vulnerability_ids[0]) + self.assertEqual("CVE-2020-5678", finding.vulnerability_ids[1]) + self.assertEqual("CVE-2020-1234", finding.cve) + + # Process with different IDs - should replace old IDs + new_vulnerability_ids = ["CVE-2021-9999", "GHSA-xxxx-yyyy"] + finding.unsaved_vulnerability_ids = new_vulnerability_ids + DefaultImporter(**self.importer_data).process_vulnerability_ids(finding) + # Save the finding to persist the cve change + finding.save() + + # Get fresh finding from database to avoid cached property issues + finding = Finding.objects.get(pk=finding.pk) + + # Verify old IDs are removed and new IDs are present + self.assertEqual(2, len(finding.vulnerability_ids)) + self.assertEqual("CVE-2021-9999", finding.vulnerability_ids[0]) + self.assertEqual("GHSA-xxxx-yyyy", finding.vulnerability_ids[1]) + self.assertEqual("CVE-2021-9999", finding.cve) + # Verify only new Vulnerability_Id objects exist + vuln_ids = list(Vulnerability_Id.objects.filter(finding=finding).values_list("vulnerability_id", flat=True)) + self.assertEqual(set(new_vulnerability_ids), set(vuln_ids)) + finding.delete() From 382fd216a3008f439ab4348f610311214a5d76d0 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 14 Dec 2025 01:27:41 +0100 Subject: [PATCH 09/26] vulnerability_id processing: separate import/reimport --- dojo/importers/base_importer.py | 38 
++++-------------------- dojo/importers/default_importer.py | 2 +- dojo/importers/default_reimporter.py | 39 +++++++++++++++++++++++-- unittests/test_importers_importer.py | 25 +++++++--------- unittests/test_importers_performance.py | 12 ++++---- 5 files changed, 60 insertions(+), 56 deletions(-) diff --git a/dojo/importers/base_importer.py b/dojo/importers/base_importer.py index 6933a2a28e4..b295028ef27 100644 --- a/dojo/importers/base_importer.py +++ b/dojo/importers/base_importer.py @@ -786,51 +786,23 @@ def process_cve( return finding - def process_vulnerability_ids( + def store_vulnerability_ids( self, finding: Finding, ) -> Finding: """ - Parse the `unsaved_vulnerability_ids` field from findings after they are parsed - to create `Vulnerability_Id` objects with the finding associated correctly. - Only updates if vulnerability_ids have changed to avoid unnecessary database operations. - - Note: The finding parameter can be: - - A new finding (from import) with unsaved_vulnerability_ids set from the parsed report - - An existing finding (from reimport) with unsaved_vulnerability_ids copied from the - finding_from_report before calling this method + Store vulnerability IDs for a finding. + Reads from finding.unsaved_vulnerability_ids and saves them overwriting existing ones. Args: - finding: The finding to process vulnerability IDs for. - For reimports, this is the existing finding with unsaved_vulnerability_ids - set from the import/reimport report. + finding: The finding to store vulnerability IDs for Returns: The finding object """ - # Normalize to empty list if None - always process vulnerability IDs vulnerability_ids_to_process = finding.unsaved_vulnerability_ids or [] - - # For existing findings, check if there are changes before updating - if finding.pk and len(finding.vulnerability_id_set.all()) > 0: - # Get existing vulnerability IDs from the database - existing_vuln_ids = set(finding.vulnerability_ids) if finding.vulnerability_ids else set() - # Normalize the new vulnerability IDs (remove duplicates for comparison) - new_vuln_ids = set(vulnerability_ids_to_process) - - # Only update if vulnerability IDs have changed - if existing_vuln_ids == new_vuln_ids: - logger.debug( - f"Skipping vulnerability_ids update for finding {finding.id} - " - f"vulnerability_ids unchanged: {sorted(existing_vuln_ids)}", - ) - return finding - - # Use the helper function which handles deletion and creation - # Always use delete_existing=True - it's a no-op if there are no existing IDs - finding_helper.save_vulnerability_ids(finding, vulnerability_ids_to_process, delete_existing=True) - + finding_helper.save_vulnerability_ids(finding, vulnerability_ids_to_process, delete_existing=False) return finding def process_files( diff --git a/dojo/importers/default_importer.py b/dojo/importers/default_importer.py index 63f41b8f744..e1e3f6f4556 100644 --- a/dojo/importers/default_importer.py +++ b/dojo/importers/default_importer.py @@ -234,7 +234,7 @@ def process_findings( # Process any files self.process_files(finding) # Process vulnerability IDs - finding = self.process_vulnerability_ids(finding) + finding = self.store_vulnerability_ids(finding) # Categorize this finding as a new one new_findings.append(finding) # all data is already saved on the finding, we only need to trigger post processing in batches diff --git a/dojo/importers/default_reimporter.py b/dojo/importers/default_reimporter.py index 1b8747f7083..80aad31bab0 100644 --- a/dojo/importers/default_reimporter.py +++ 
b/dojo/importers/default_reimporter.py @@ -847,6 +847,41 @@ def process_finding_that_was_not_matched( self.process_request_response_pairs(unsaved_finding) return unsaved_finding + def reconcile_vulnerability_ids( + self, + finding: Finding, + ) -> Finding: + """ + Reconcile vulnerability IDs for an existing finding. + Checks if IDs have changed before updating to avoid unnecessary database operations. + Uses prefetched data if available, otherwise fetches efficiently. + + Args: + finding: The existing finding to reconcile vulnerability IDs for. + Must have unsaved_vulnerability_ids set. + + Returns: + The finding object + + """ + vulnerability_ids_to_process = finding.unsaved_vulnerability_ids or [] + + # Use prefetched data directly without triggering queries + existing_vuln_ids = {v.vulnerability_id for v in finding.vulnerability_id_set.all()} + new_vuln_ids = set(vulnerability_ids_to_process) + + # Early exit if unchanged + if existing_vuln_ids == new_vuln_ids: + logger.debug( + f"Skipping vulnerability_ids update for finding {finding.id} - " + f"vulnerability_ids unchanged: {sorted(existing_vuln_ids)}", + ) + return finding + + # Update if changed + finding_helper.save_vulnerability_ids(finding, vulnerability_ids_to_process, delete_existing=True) + return finding + def finding_post_processing( self, finding: Finding, @@ -872,13 +907,13 @@ def finding_post_processing( self.process_files(finding) # Process vulnerability IDs # Copy unsaved_vulnerability_ids from the report finding to the existing finding - # so process_vulnerability_ids can process them (see its docstring for details) + # so reconcile_vulnerability_ids can process them # Always set it (even if empty list) so we can clear existing IDs when report has none finding.unsaved_vulnerability_ids = finding_from_report.unsaved_vulnerability_ids or [] # Store the current cve value to check if it changes old_cve = finding.cve # legacy cve field has already been processed/set earlier - finding = self.process_vulnerability_ids(finding) + finding = self.reconcile_vulnerability_ids(finding) # Save the finding only if the cve field was changed by save_vulnerability_ids # This is temporary as the cve field will be phased out if finding.cve != old_cve: diff --git a/unittests/test_importers_importer.py b/unittests/test_importers_importer.py index e2c8c8fc757..6e526ec0d92 100644 --- a/unittests/test_importers_importer.py +++ b/unittests/test_importers_importer.py @@ -7,6 +7,7 @@ from rest_framework.test import APIClient from dojo.importers.default_importer import DefaultImporter +from dojo.importers.default_reimporter import DefaultReImporter from dojo.models import ( Development_Environment, Engagement, @@ -562,8 +563,7 @@ def create_default_data(self): "scan_type": NPM_AUDIT_SCAN_TYPE, } - @patch("dojo.importers.base_importer.Vulnerability_Id", autospec=True) - def test_handle_vulnerability_ids_references_and_cve(self, mock): + def test_handle_vulnerability_ids_references_and_cve(self): # Why doesn't this test use the test db and query for one? 
vulnerability_ids = ["CVE", "REF-1", "REF-2"] finding = Finding() @@ -571,7 +571,7 @@ def test_handle_vulnerability_ids_references_and_cve(self, mock): finding.test = self.test finding.reporter = self.testuser finding.save() - DefaultImporter(**self.importer_data).process_vulnerability_ids(finding) + DefaultImporter(**self.importer_data).store_vulnerability_ids(finding) self.assertEqual("CVE", finding.vulnerability_ids[0]) self.assertEqual("CVE", finding.cve) @@ -580,8 +580,7 @@ def test_handle_vulnerability_ids_references_and_cve(self, mock): self.assertEqual("REF-2", finding.vulnerability_ids[2]) finding.delete() - @patch("dojo.importers.base_importer.Vulnerability_Id", autospec=True) - def test_handle_no_vulnerability_ids_references_and_cve(self, mock): + def test_handle_no_vulnerability_ids_references_and_cve(self): vulnerability_ids = ["CVE"] finding = Finding() finding.test = self.test @@ -589,22 +588,21 @@ def test_handle_no_vulnerability_ids_references_and_cve(self, mock): finding.save() finding.unsaved_vulnerability_ids = vulnerability_ids - DefaultImporter(**self.importer_data).process_vulnerability_ids(finding) + DefaultImporter(**self.importer_data).store_vulnerability_ids(finding) self.assertEqual("CVE", finding.vulnerability_ids[0]) self.assertEqual("CVE", finding.cve) self.assertEqual(vulnerability_ids, finding.unsaved_vulnerability_ids) finding.delete() - @patch("dojo.importers.base_importer.Vulnerability_Id", autospec=True) - def test_handle_vulnerability_ids_references_and_no_cve(self, mock): + def test_handle_vulnerability_ids_references_and_no_cve(self): vulnerability_ids = ["REF-1", "REF-2"] finding = Finding() finding.test = self.test finding.reporter = self.testuser finding.save() finding.unsaved_vulnerability_ids = vulnerability_ids - DefaultImporter(**self.importer_data).process_vulnerability_ids(finding) + DefaultImporter(**self.importer_data).store_vulnerability_ids(finding) self.assertEqual("REF-1", finding.vulnerability_ids[0]) self.assertEqual("REF-1", finding.cve) @@ -612,13 +610,12 @@ def test_handle_vulnerability_ids_references_and_no_cve(self, mock): self.assertEqual("REF-2", finding.vulnerability_ids[1]) finding.delete() - @patch("dojo.importers.base_importer.Vulnerability_Id", autospec=True) - def test_no_handle_vulnerability_ids_references_and_no_cve(self, mock): + def test_no_handle_vulnerability_ids_references_and_no_cve(self): finding = Finding() finding.test = self.test finding.reporter = self.testuser finding.save() - DefaultImporter(**self.importer_data).process_vulnerability_ids(finding) + DefaultImporter(**self.importer_data).store_vulnerability_ids(finding) self.assertEqual(finding.cve, None) self.assertEqual(finding.unsaved_vulnerability_ids, None) self.assertEqual(finding.vulnerability_ids, []) @@ -644,7 +641,7 @@ def test_clear_vulnerability_ids_on_empty_list(self): # Process with empty list - should clear all IDs finding.unsaved_vulnerability_ids = [] - DefaultImporter(**self.importer_data).process_vulnerability_ids(finding) + DefaultReImporter(test=self.test, environment=self.importer_data["environment"], scan_type=self.importer_data["scan_type"]).reconcile_vulnerability_ids(finding) # Save the finding to persist the cve=None change finding.save() @@ -681,7 +678,7 @@ def test_change_vulnerability_ids_on_reimport(self): # Process with different IDs - should replace old IDs new_vulnerability_ids = ["CVE-2021-9999", "GHSA-xxxx-yyyy"] finding.unsaved_vulnerability_ids = new_vulnerability_ids - 
DefaultImporter(**self.importer_data).process_vulnerability_ids(finding) + DefaultReImporter(test=self.test, environment=self.importer_data["environment"], scan_type=self.importer_data["scan_type"]).reconcile_vulnerability_ids(finding) # Save the finding to persist the cve change finding.save() diff --git a/unittests/test_importers_performance.py b/unittests/test_importers_performance.py index c422140a433..1667ac7c04d 100644 --- a/unittests/test_importers_performance.py +++ b/unittests/test_importers_performance.py @@ -244,7 +244,7 @@ def test_import_reimport_reimport_performance_async(self): self._import_reimport_performance( expected_num_queries1=340, expected_num_async_tasks1=7, - expected_num_queries2=275, + expected_num_queries2=276, expected_num_async_tasks2=18, expected_num_queries3=163, expected_num_async_tasks3=17, @@ -262,7 +262,7 @@ def test_import_reimport_reimport_performance_pghistory_async(self): self._import_reimport_performance( expected_num_queries1=306, expected_num_async_tasks1=7, - expected_num_queries2=268, + expected_num_queries2=269, expected_num_async_tasks2=18, expected_num_queries3=158, expected_num_async_tasks3=17, @@ -286,7 +286,7 @@ def test_import_reimport_reimport_performance_no_async(self): self._import_reimport_performance( expected_num_queries1=346, expected_num_async_tasks1=6, - expected_num_queries2=281, + expected_num_queries2=282, expected_num_async_tasks2=17, expected_num_queries3=169, expected_num_async_tasks3=16, @@ -308,7 +308,7 @@ def test_import_reimport_reimport_performance_pghistory_no_async(self): self._import_reimport_performance( expected_num_queries1=312, expected_num_async_tasks1=6, - expected_num_queries2=274, + expected_num_queries2=275, expected_num_async_tasks2=17, expected_num_queries3=164, expected_num_async_tasks3=16, @@ -334,7 +334,7 @@ def test_import_reimport_reimport_performance_no_async_with_product_grading(self self._import_reimport_performance( expected_num_queries1=348, expected_num_async_tasks1=8, - expected_num_queries2=283, + expected_num_queries2=284, expected_num_async_tasks2=19, expected_num_queries3=171, expected_num_async_tasks3=18, @@ -357,7 +357,7 @@ def test_import_reimport_reimport_performance_pghistory_no_async_with_product_gr self._import_reimport_performance( expected_num_queries1=314, expected_num_async_tasks1=8, - expected_num_queries2=276, + expected_num_queries2=277, expected_num_async_tasks2=19, expected_num_queries3=166, expected_num_async_tasks3=18, From 406b69869239954162452e9de58c918f8a0f5f8e Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sat, 13 Dec 2025 19:38:54 +0100 Subject: [PATCH 10/26] reimport: add management command to reimport sample scans --- .../commands/reimport_unittest_scan.py | 228 ++++++++++++++++++ 1 file changed, 228 insertions(+) create mode 100644 dojo/management/commands/reimport_unittest_scan.py diff --git a/dojo/management/commands/reimport_unittest_scan.py b/dojo/management/commands/reimport_unittest_scan.py new file mode 100644 index 00000000000..8480962b3bb --- /dev/null +++ b/dojo/management/commands/reimport_unittest_scan.py @@ -0,0 +1,228 @@ +import json +import logging +import time +from importlib import import_module +from importlib.util import find_spec +from inspect import isclass +from pathlib import Path + +from django.core.management.base import BaseCommand, CommandError +from django.urls import reverse +from rest_framework.authtoken.models import Token +from rest_framework.test import APIClient + +from unittests.test_dashboard import User + +logger = 
logging.getLogger(__name__) + + +class Command(BaseCommand): + + help = ( + "Reimport a specific unittest scan by filename into an existing test. " + "Automatically deduces scan type from path." + ) + + def add_arguments(self, parser): + parser.add_argument( + "test_id", + type=int, + help="ID of the test to reimport into", + ) + parser.add_argument( + "scan_file", + type=str, + help="Path to scan file relative to unittests/scans/ (e.g., 'jfrog_xray_unified/very_many_vulns.json')", + ) + parser.add_argument( + "--minimum-severity", + type=str, + default="Low", + choices=["Critical", "High", "Medium", "Low", "Info"], + help="Minimum severity to import (default: Low)", + ) + parser.add_argument( + "--active", + action="store_true", + default=True, + help="Mark findings as active (default: True)", + ) + parser.add_argument( + "--verified", + action="store_true", + default=False, + help="Mark findings as verified (default: False)", + ) + parser.add_argument( + "--close-old-findings", + action="store_true", + default=False, + help="Close findings not present in the new scan (default: False)", + ) + parser.add_argument( + "--tags", + action="append", + default=[], + help=( + "Tag(s) to apply to the test (repeat --tags to add multiple). " + "Example: --tags perf --tags jfrog" + ), + ) + + def get_test_admin(self): + return User.objects.get(username="admin") + + def reimport_scan(self, payload, expected_http_status_code=201): + testuser = self.get_test_admin() + token = Token.objects.get(user=testuser) + client = APIClient() + client.credentials(HTTP_AUTHORIZATION="Token " + token.key) + + response = client.post(reverse("reimportscan-list"), payload) + if expected_http_status_code != response.status_code: + msg = f"Expected HTTP status code {expected_http_status_code}, got {response.status_code}: {response.content[:1000]}" + raise CommandError(msg) + return json.loads(response.content) + + def deduce_scan_type_from_path(self, scan_file_path): + """ + Deduce the scan type from the file path by finding the corresponding parser. 
+ + Args: + scan_file_path: Path like 'zap/zap_sample.json' or 'jfrog_xray_unified/very_many_vulns.json' + + Returns: + tuple: (scan_type, parser_class) or raises CommandError if not found + + """ + # Extract the directory name (parser module name) + path_parts = Path(scan_file_path).parts + if len(path_parts) < 2: + msg = f"Scan file path must include directory: {scan_file_path}" + raise CommandError(msg) + + module_name = path_parts[0] + + # Try to find and load the parser module + try: + if not find_spec(f"dojo.tools.{module_name}.parser"): + msg = f"No parser module found for '{module_name}'" + raise CommandError(msg) + + module = import_module(f"dojo.tools.{module_name}.parser") + + # Find the parser class + parser_class = None + expected_class_name = module_name.replace("_", "") + "parser" + + for attribute_name in dir(module): + attribute = getattr(module, attribute_name) + if isclass(attribute) and attribute_name.lower() == expected_class_name: + parser_class = attribute + break + + if not parser_class: + msg = f"No parser class found in module '{module_name}'" + raise CommandError(msg) + + # Get the scan type from the parser + parser_instance = parser_class() + scan_types = parser_instance.get_scan_types() + + if not scan_types: + msg = f"Parser '{module_name}' has no scan types" + raise CommandError(msg) + + return scan_types[0], parser_class + + except ImportError as e: + msg = f"Failed to import parser module '{module_name}': {e}" + raise CommandError(msg) + + def reimport_unittest_scan(self, test_id, scan_file, minimum_severity, active, verified, close_old_findings, tags): + """ + Reimport a specific unittest scan file into an existing test. + + Args: + test_id: ID of the test to reimport into + scan_file: Path to scan file relative to unittests/scans/ + minimum_severity: Minimum severity level + active: Whether findings should be active + verified: Whether findings should be verified + close_old_findings: Whether to close findings not in the new scan + tags: List of tags to apply + + """ + # Validate scan file exists + scan_path = Path("unittests/scans") / scan_file + if not scan_path.exists(): + msg = f"Scan file not found: {scan_path}" + raise CommandError(msg) + + # Deduce scan type from path + scan_type, _parser_class = self.deduce_scan_type_from_path(scan_file) + + logger.info(f"Reimporting scan '{scan_file}' using scan type '{scan_type}'") + logger.info(f"Target: Test ID {test_id}") + + # Reimport the scan + with scan_path.open(encoding="utf-8") as testfile: + payload = { + "test": test_id, + "minimum_severity": minimum_severity, + "scan_type": scan_type, + "file": testfile, + "version": "1.0.1", + "active": active, + "verified": verified, + "close_old_findings": close_old_findings, + } + + if tags: + payload["tags"] = tags + + result = self.reimport_scan(payload) + + logger.info(f"Successfully reimported scan. 
Test ID: {result.get('test')}") + logger.info(f"Reimport summary: {result.get('scan_save_message', 'No summary available')}") + + return result + + def handle(self, *args, **options): + test_id = options["test_id"] + scan_file = options["scan_file"] + minimum_severity = options["minimum_severity"] + active = options["active"] + verified = options["verified"] + close_old_findings = options["close_old_findings"] + tags = options["tags"] + + start_time = time.time() + + try: + self.reimport_unittest_scan( + test_id=test_id, + scan_file=scan_file, + minimum_severity=minimum_severity, + active=active, + verified=verified, + close_old_findings=close_old_findings, + tags=tags, + ) + + end_time = time.time() + duration = end_time - start_time + + self.stdout.write( + self.style.SUCCESS( + f"Successfully reimported '{scan_file}' into test ID {test_id} " + f"(took {duration:.2f} seconds)", + ), + ) + + except Exception as e: + end_time = time.time() + duration = end_time - start_time + logger.exception(f"Failed to reimport scan '{scan_file}' after {duration:.2f} seconds") + msg = f"Reimport failed after {duration:.2f} seconds: {e}" + raise CommandError(msg) From 9cef33add90bbff155d495167917b00817d5e853 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 14 Dec 2025 02:10:54 +0100 Subject: [PATCH 11/26] reimport: prefetch endpoint status --- dojo/finding/deduplication.py | 12 +++++++++++- unittests/test_importers_performance.py | 24 ++++++++++++------------ 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/dojo/finding/deduplication.py b/dojo/finding/deduplication.py index b397add4efd..10c5bf9e10f 100644 --- a/dojo/finding/deduplication.py +++ b/dojo/finding/deduplication.py @@ -252,10 +252,20 @@ def build_candidate_scope_queryset(test, mode="deduplication", service=None): ) queryset = Finding.objects.filter(scope_q) + # Base prefetches for both modes + prefetch_list = ["endpoints", "vulnerability_id_set"] + + # Additional prefetches for reimport mode + if mode == "reimport": + prefetch_list.extend([ + "status_finding", + "status_finding__endpoint", + ]) + return ( queryset .select_related("test", "test__engagement", "test__test_type") - .prefetch_related("endpoints", "vulnerability_id_set") + .prefetch_related(*prefetch_list) ) diff --git a/unittests/test_importers_performance.py b/unittests/test_importers_performance.py index 1667ac7c04d..b94cd0d2277 100644 --- a/unittests/test_importers_performance.py +++ b/unittests/test_importers_performance.py @@ -244,9 +244,9 @@ def test_import_reimport_reimport_performance_async(self): self._import_reimport_performance( expected_num_queries1=340, expected_num_async_tasks1=7, - expected_num_queries2=276, + expected_num_queries2=238, expected_num_async_tasks2=18, - expected_num_queries3=163, + expected_num_queries3=120, expected_num_async_tasks3=17, ) @@ -262,9 +262,9 @@ def test_import_reimport_reimport_performance_pghistory_async(self): self._import_reimport_performance( expected_num_queries1=306, expected_num_async_tasks1=7, - expected_num_queries2=269, + expected_num_queries2=231, expected_num_async_tasks2=18, - expected_num_queries3=158, + expected_num_queries3=115, expected_num_async_tasks3=17, ) @@ -286,9 +286,9 @@ def test_import_reimport_reimport_performance_no_async(self): self._import_reimport_performance( expected_num_queries1=346, expected_num_async_tasks1=6, - expected_num_queries2=282, + expected_num_queries2=244, expected_num_async_tasks2=17, - expected_num_queries3=169, + expected_num_queries3=126, 
expected_num_async_tasks3=16, ) @@ -308,9 +308,9 @@ def test_import_reimport_reimport_performance_pghistory_no_async(self): self._import_reimport_performance( expected_num_queries1=312, expected_num_async_tasks1=6, - expected_num_queries2=275, + expected_num_queries2=237, expected_num_async_tasks2=17, - expected_num_queries3=164, + expected_num_queries3=121, expected_num_async_tasks3=16, ) @@ -334,9 +334,9 @@ def test_import_reimport_reimport_performance_no_async_with_product_grading(self self._import_reimport_performance( expected_num_queries1=348, expected_num_async_tasks1=8, - expected_num_queries2=284, + expected_num_queries2=246, expected_num_async_tasks2=19, - expected_num_queries3=171, + expected_num_queries3=128, expected_num_async_tasks3=18, ) @@ -357,9 +357,9 @@ def test_import_reimport_reimport_performance_pghistory_no_async_with_product_gr self._import_reimport_performance( expected_num_queries1=314, expected_num_async_tasks1=8, - expected_num_queries2=277, + expected_num_queries2=239, expected_num_async_tasks2=19, - expected_num_queries3=166, + expected_num_queries3=123, expected_num_async_tasks3=18, ) From 9464ce46b385d01e6a76a7fb635d151da21b7c73 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 14 Dec 2025 10:40:15 +0100 Subject: [PATCH 12/26] reimport: bullk update endpoint statuses --- dojo/importers/endpoint_manager.py | 43 +++++++++++++++++-------- unittests/test_importers_performance.py | 12 +++---- 2 files changed, 36 insertions(+), 19 deletions(-) diff --git a/dojo/importers/endpoint_manager.py b/dojo/importers/endpoint_manager.py index ccfff345c40..f4b277d49fa 100644 --- a/dojo/importers/endpoint_manager.py +++ b/dojo/importers/endpoint_manager.py @@ -68,6 +68,7 @@ def mitigate_endpoint_status( ) -> None: """Mitigates all endpoint status objects that are supplied""" now = timezone.now() + to_update = [] for endpoint_status in endpoint_status_list: # Only mitigate endpoints that are actually active if endpoint_status.mitigated is False: @@ -75,7 +76,14 @@ def mitigate_endpoint_status( endpoint_status.last_modified = now endpoint_status.mitigated_by = user endpoint_status.mitigated = True - endpoint_status.save() + to_update.append(endpoint_status) + + if to_update: + Endpoint_Status.objects.bulk_update( + to_update, + ["mitigated", "mitigated_time", "last_modified", "mitigated_by"], + batch_size=1000, + ) @dojo_async_task @app.task() @@ -85,6 +93,8 @@ def reactivate_endpoint_status( **kwargs: dict, ) -> None: """Reactivate all endpoint status objects that are supplied""" + now = timezone.now() + to_update = [] for endpoint_status in endpoint_status_list: # Only reactivate endpoints that are actually mitigated if endpoint_status.mitigated: @@ -92,8 +102,15 @@ def reactivate_endpoint_status( endpoint_status.mitigated_by = None endpoint_status.mitigated_time = None endpoint_status.mitigated = False - endpoint_status.last_modified = timezone.now() - endpoint_status.save() + endpoint_status.last_modified = now + to_update.append(endpoint_status) + + if to_update: + Endpoint_Status.objects.bulk_update( + to_update, + ["mitigated", "mitigated_time", "mitigated_by", "last_modified"], + batch_size=1000, + ) def chunk_endpoints_and_disperse( self, @@ -149,17 +166,17 @@ def update_endpoint_status( # New finding is mitigated, so mitigate all old endpoints endpoint_status_to_mitigate = existing_finding_endpoint_status_list else: + # Convert to set for O(1) lookups instead of O(n) linear search + new_finding_endpoints_set = set(new_finding_endpoints_list) # Mitigate any 
endpoints in the old finding not found in the new finding - endpoint_status_to_mitigate = list( - filter( - lambda existing_finding_endpoint_status: existing_finding_endpoint_status.endpoint not in new_finding_endpoints_list, - existing_finding_endpoint_status_list), - ) + endpoint_status_to_mitigate = [ + eps for eps in existing_finding_endpoint_status_list + if eps.endpoint not in new_finding_endpoints_set + ] # Re-activate any endpoints in the old finding that are in the new finding - endpoint_status_to_reactivate = list( - filter( - lambda existing_finding_endpoint_status: existing_finding_endpoint_status.endpoint in new_finding_endpoints_list, - existing_finding_endpoint_status_list), - ) + endpoint_status_to_reactivate = [ + eps for eps in existing_finding_endpoint_status_list + if eps.endpoint in new_finding_endpoints_set + ] self.chunk_endpoints_and_reactivate(endpoint_status_to_reactivate) self.chunk_endpoints_and_mitigate(endpoint_status_to_mitigate, user) diff --git a/unittests/test_importers_performance.py b/unittests/test_importers_performance.py index b94cd0d2277..5f1af63f1ab 100644 --- a/unittests/test_importers_performance.py +++ b/unittests/test_importers_performance.py @@ -246,7 +246,7 @@ def test_import_reimport_reimport_performance_async(self): expected_num_async_tasks1=7, expected_num_queries2=238, expected_num_async_tasks2=18, - expected_num_queries3=120, + expected_num_queries3=118, expected_num_async_tasks3=17, ) @@ -264,7 +264,7 @@ def test_import_reimport_reimport_performance_pghistory_async(self): expected_num_async_tasks1=7, expected_num_queries2=231, expected_num_async_tasks2=18, - expected_num_queries3=115, + expected_num_queries3=113, expected_num_async_tasks3=17, ) @@ -288,7 +288,7 @@ def test_import_reimport_reimport_performance_no_async(self): expected_num_async_tasks1=6, expected_num_queries2=244, expected_num_async_tasks2=17, - expected_num_queries3=126, + expected_num_queries3=124, expected_num_async_tasks3=16, ) @@ -310,7 +310,7 @@ def test_import_reimport_reimport_performance_pghistory_no_async(self): expected_num_async_tasks1=6, expected_num_queries2=237, expected_num_async_tasks2=17, - expected_num_queries3=121, + expected_num_queries3=119, expected_num_async_tasks3=16, ) @@ -336,7 +336,7 @@ def test_import_reimport_reimport_performance_no_async_with_product_grading(self expected_num_async_tasks1=8, expected_num_queries2=246, expected_num_async_tasks2=19, - expected_num_queries3=128, + expected_num_queries3=126, expected_num_async_tasks3=18, ) @@ -359,7 +359,7 @@ def test_import_reimport_reimport_performance_pghistory_no_async_with_product_gr expected_num_async_tasks1=8, expected_num_queries2=239, expected_num_async_tasks2=19, - expected_num_queries3=123, + expected_num_queries3=121, expected_num_async_tasks3=18, ) From e81d1decb56cfc3424a3840f3a9deb146042d65c Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 14 Dec 2025 11:21:47 +0100 Subject: [PATCH 13/26] add script to update performance test counts easily --- scripts/update_performance_test_counts.py | 324 ++++++++++++++++++++++ unittests/test_importers_performance.py | 20 ++ 2 files changed, 344 insertions(+) create mode 100644 scripts/update_performance_test_counts.py diff --git a/scripts/update_performance_test_counts.py b/scripts/update_performance_test_counts.py new file mode 100644 index 00000000000..f63e040fb3a --- /dev/null +++ b/scripts/update_performance_test_counts.py @@ -0,0 +1,324 @@ +#!/usr/bin/env python3 +""" +Script to update performance test query counts. 
+I tried to implement this as management command, but it would become very slow on parsing the test output. + +This script: +1. Runs all performance tests and captures actual query counts +2. Compares them with expected counts +3. Generates a report and optionally updates the test file +4. Provides a verification run + +How to run: + + # Step 1: Run tests and generate report (uses TestDojoImporterPerformanceSmall by default) + python3 scripts/update_performance_test_counts.py --report-only + + # Or specify a different test class: + python3 scripts/update_performance_test_counts.py --test-class TestDojoImporterPerformanceSmall --report-only + + # Step 2: Review the report, then update the file + python3 scripts/update_performance_test_counts.py --update + + # Step 3: Verify all tests pass + python3 scripts/update_performance_test_counts.py --verify + +The script defaults to TestDojoImporterPerformanceSmall if --test-class is not provided. +""" + +import argparse +import re +import subprocess +import sys +from pathlib import Path + +# Path to the test file +TEST_FILE = Path(__file__).parent.parent / "unittests" / "test_importers_performance.py" + + +class TestCount: + + """Represents a test's expected and actual counts.""" + + def __init__(self, test_name: str, step: str, metric: str): + self.test_name = test_name + self.step = step + self.metric = metric + self.expected = None + self.actual = None + self.difference = None + + def __repr__(self): + return ( + f"TestCount({self.test_name}, {self.step}, {self.metric}, " + f"expected={self.expected}, actual={self.actual})" + ) + + +def run_tests(test_class: str) -> str: + """Run the performance tests and return the output.""" + print(f"Running tests for {test_class}...") + cmd = [ + "./run-unittest.sh", + "--test-case", + f"unittests.test_importers_performance.{test_class}", + ] + result = subprocess.run( + cmd, check=False, capture_output=True, text=True, cwd=Path(__file__).parent.parent, + ) + return result.stdout + result.stderr + + +def parse_test_output(output: str) -> list[TestCount]: + """Parse test output to extract actual vs expected counts.""" + counts = [] + + # Pattern to match AssertionError with query counts + # Format: AssertionError: 118 != 120 : 118 queries executed, 120 expected + assertion_pattern = re.compile( + r"AssertionError: (\d+) != (\d+) : (\d+) queries executed, (\d+) expected", + ) + + # Pattern to match test name and step + # Format: (step='reimport2', metric='queries') + step_pattern = re.compile(r"step='(\w+)', metric='(\w+)'") + + # Pattern to match test name + # Format: FAIL: test_import_reimport_reimport_performance_async + test_pattern = re.compile(r"FAIL: (test_\w+)") + + lines = output.split("\n") + current_test = None + current_step = None + current_metric = None + + for line in lines: + # Match test name + test_match = test_pattern.search(line) + if test_match: + current_test = test_match.group(1) + + # Match step and metric + step_match = step_pattern.search(line) + if step_match: + current_step = step_match.group(1) + current_metric = step_match.group(2) + + # Match assertion error + assertion_match = assertion_pattern.search(line) + if assertion_match and current_test and current_step and current_metric: + actual = int(assertion_match.group(1)) + expected = int(assertion_match.group(2)) + count = TestCount(current_test, current_step, current_metric) + count.actual = actual + count.expected = expected + count.difference = expected - actual + counts.append(count) + + return counts + + +def 
extract_expected_counts_from_file(test_class: str) -> dict[str, dict[str, int]]: + """Extract expected counts from the test file.""" + if not TEST_FILE.exists(): + msg = f"Test file not found: {TEST_FILE}" + raise FileNotFoundError(msg) + + content = TEST_FILE.read_text() + + # Pattern to match test method calls with expected counts + # Format: self._import_reimport_performance( + # expected_num_queries1=340, + # expected_num_async_tasks1=7, + # expected_num_queries2=238, + # expected_num_async_tasks2=18, + # expected_num_queries3=120, + # expected_num_async_tasks3=17, + # ) + # More flexible pattern that handles whitespace variations + pattern = re.compile( + r"def (test_\w+)\([^)]*\):.*?" + r"self\._import_reimport_performance\(\s*" + r"expected_num_queries1\s*=\s*(\d+)\s*,\s*" + r"expected_num_async_tasks1\s*=\s*(\d+)\s*,\s*" + r"expected_num_queries2\s*=\s*(\d+)\s*,\s*" + r"expected_num_async_tasks2\s*=\s*(\d+)\s*,\s*" + r"expected_num_queries3\s*=\s*(\d+)\s*,\s*" + r"expected_num_async_tasks3\s*=\s*(\d+)\s*,", + re.DOTALL, + ) + + expected_counts = {} + for match in pattern.finditer(content): + test_name = match.group(1) + expected_counts[test_name] = { + "import1_queries": int(match.group(2)), + "import1_async_tasks": int(match.group(3)), + "reimport1_queries": int(match.group(4)), + "reimport1_async_tasks": int(match.group(5)), + "reimport2_queries": int(match.group(6)), + "reimport2_async_tasks": int(match.group(7)), + } + + return expected_counts + + +def generate_report(counts: list[TestCount], expected_counts: dict[str, dict[str, int]]): + """Generate a report of differences.""" + if not counts: + print("✅ All tests passed! No count differences found.") + return + + print("\n" + "=" * 80) + print("PERFORMANCE TEST COUNT DIFFERENCES REPORT") + print("=" * 80 + "\n") + + # Group by test name + by_test = {} + for count in counts: + if count.test_name not in by_test: + by_test[count.test_name] = [] + by_test[count.test_name].append(count) + + for test_name, test_counts in sorted(by_test.items()): + print(f"Test: {test_name}") + print("-" * 80) + for count in sorted(test_counts, key=lambda x: (x.step, x.metric)): + print( + f" {count.step:12} {count.metric:15} " + f"Expected: {count.expected:4} → Actual: {count.actual:4} " + f"(Difference: {count.difference:+3})", + ) + print() + + print("=" * 80) + print("\nTo update the test file, run:") + print(f" python scripts/update_performance_test_counts.py --test-class {test_name.split('_')[0]} --update") + print() + + +def update_test_file(counts: list[TestCount]): + """Update the test file with new expected counts.""" + if not counts: + print("No counts to update.") + return + + content = TEST_FILE.read_text() + + # Create a mapping of test_name -> step_metric -> new_value + updates = {} + for count in counts: + if count.test_name not in updates: + updates[count.test_name] = {} + step_metric = f"{count.step}_{count.metric}" + updates[count.test_name][step_metric] = count.actual + + # Update each test method + for test_name, test_updates in updates.items(): + # Find the test method and update expected_num_queries/async_tasks values + # Map step_metric to parameter name + param_map = { + "import1_queries": "expected_num_queries1", + "import1_async_tasks": "expected_num_async_tasks1", + "reimport1_queries": "expected_num_queries2", + "reimport1_async_tasks": "expected_num_async_tasks2", + "reimport2_queries": "expected_num_queries3", + "reimport2_async_tasks": "expected_num_async_tasks3", + } + + for step_metric, new_value in 
test_updates.items(): + param_name = param_map.get(step_metric) + if param_name: + # Pattern to match: expected_num_queries3=120, (with flexible whitespace) + pattern = re.compile( + rf"(def {re.escape(test_name)}\([^)]*\):.*?{re.escape(param_name)}\s*=\s*)(\d+)(\s*,)", + re.DOTALL, + ) + + def replace(match): + return f"{match.group(1)}{new_value}{match.group(3)}" # noqa: B023 + + content = pattern.sub(replace, content) + + # Write back to file + TEST_FILE.write_text(content) + print(f"✅ Updated {TEST_FILE}") + print(f" Updated {len(counts)} count(s) across {len(updates)} test(s)") + + +def verify_tests(test_class: str) -> bool: + """Run tests to verify they all pass.""" + print(f"Verifying tests for {test_class}...") + output = run_tests(test_class) + counts = parse_test_output(output) + + if counts: + print("❌ Some tests still have count mismatches:") + for count in counts: + print(f" {count.test_name} - {count.step} {count.metric}: " + f"expected {count.expected}, got {count.actual}") + return False + else: # noqa: RET505 + print("✅ All tests pass!") + return True + + +def main(): + parser = argparse.ArgumentParser( + description="Update performance test query counts", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + parser.add_argument( + "--test-class", + required=False, + default="TestDojoImporterPerformanceSmall", + help="Test class name (e.g., TestDojoImporterPerformanceSmall). Defaults to TestDojoImporterPerformanceSmall if not provided.", + ) + parser.add_argument( + "--report-only", + action="store_true", + help="Only generate a report, don't update the file", + ) + parser.add_argument( + "--update", + action="store_true", + help="Update the test file with new counts", + ) + parser.add_argument( + "--verify", + action="store_true", + help="Run tests to verify they pass", + ) + + args = parser.parse_args() + + if args.report_only: + # Step 1: Run tests and generate report + output = run_tests(args.test_class) + counts = parse_test_output(output) + expected_counts = extract_expected_counts_from_file(args.test_class) + generate_report(counts, expected_counts) + + elif args.update: + # Step 2: Update the file + output = run_tests(args.test_class) + counts = parse_test_output(output) + if counts: + update_test_file(counts) + print("\nNext step: Run --verify to ensure all tests pass") + else: + print("No differences found. All tests are already up to date.") + + elif args.verify: + # Step 3: Verify + success = verify_tests(args.test_class) + sys.exit(0 if success else 1) + + else: + parser.print_help() + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/unittests/test_importers_performance.py b/unittests/test_importers_performance.py index 5f1af63f1ab..4b94d132bf6 100644 --- a/unittests/test_importers_performance.py +++ b/unittests/test_importers_performance.py @@ -1,3 +1,23 @@ +""" +Performance tests for importers. + +These tests verify that import and reimport operations maintain acceptable query counts +and async task counts to prevent performance regressions. + +Counts can be updated via the Python script at scripts/update_performance_test_counts.py. +However, counts must be verified to ensure no implicit performance regressions are introduced. 
+When counts change, review the differences carefully to determine if they represent: +- Legitimate optimizations (counts decreasing) +- Acceptable changes due to feature additions (counts increasing with justification) +- Unintended performance regressions (counts increasing without clear reason) + +Always verify updated counts by: +1. Running the update script to see the differences +2. Reviewing the changes to understand why counts changed +3. Running the verification step to ensure all tests pass +4. Investigating any unexpected increases in query or task counts +""" + import logging from contextlib import contextmanager From 8a57c38c87615300c616f4115a66bee22a1836b2 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 14 Dec 2025 11:22:45 +0100 Subject: [PATCH 14/26] add script to update performance test counts easily --- ruff.toml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/ruff.toml b/ruff.toml index 670a95b7b99..cfd67b5a501 100644 --- a/ruff.toml +++ b/ruff.toml @@ -84,7 +84,7 @@ select = [ "PGH", "PLC", "PLE", - "PLR01", "PLR02", "PLR04", "PLR0915", "PLR1711", "PLR1704", "PLR1714", "PLR1716", "PLR172", "PLR173", "PLR2044", "PLR5", "PLR6104", "PLR6201", + "PLR01", "PLR02", "PLR04", "PLR0915", "PLR1711", "PLR1704", "PLR1714", "PLR1716", "PLR172", "PLR173", "PLR2044", "PLR5", "PLR6104", "PLR6201", "PLW", "UP", "FURB", @@ -118,6 +118,14 @@ preview = true "dojo/filters.py" = [ "A003", # ruff upgrade to 0.13.3 ] +"scripts/update_performance_test_counts.py" = [ + "S603", # subprocess.run without shell=True is safe for this script + "S604", # subprocess.run without shell=True is safe for this script + "S605", # subprocess.run without shell=True is safe for this script + "S606", # subprocess.run without shell=True is safe for this script + "S607", # subprocess.run without shell=True is safe for this script + "T201", # print statements are fine for console output scripts +] [lint.flake8-boolean-trap] extend-allowed-calls = ["dojo.utils.get_system_setting"] From 3c42df18b1e4f0cdfa6bce5431edcdf3abfb5056 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 14 Dec 2025 11:37:07 +0100 Subject: [PATCH 15/26] 8a57c38c87615300c616f4115a66bee22a1836b2 --- ruff.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/ruff.toml b/ruff.toml index cfd67b5a501..942bd47a00a 100644 --- a/ruff.toml +++ b/ruff.toml @@ -125,6 +125,7 @@ preview = true "S606", # subprocess.run without shell=True is safe for this script "S607", # subprocess.run without shell=True is safe for this script "T201", # print statements are fine for console output scripts + "EXE001", # git won't commit the executable flag I don't know why ] [lint.flake8-boolean-trap] From 943faf8d95cebd89be6f455075f8a026d24f5b5d Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 14 Dec 2025 11:41:17 +0100 Subject: [PATCH 16/26] add upgrade notes --- docs/content/en/open_source/upgrading/2.54.md | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/docs/content/en/open_source/upgrading/2.54.md b/docs/content/en/open_source/upgrading/2.54.md index 3286e7ee149..06741d7d9f2 100644 --- a/docs/content/en/open_source/upgrading/2.54.md +++ b/docs/content/en/open_source/upgrading/2.54.md @@ -5,7 +5,20 @@ weight: -20250804 description: Dropped support for DD_PARSER_EXCLUDE --- +## Breaking change: `DD_PARSER_EXCLUDE` removed + To simplify the management of the DefectDojo application, parser exclusions are no longer controlled via the environment variable DD_PARSER_EXCLUDE or 
application settings. This variable is now unsupported. From now on, you should use the active flag in the Test_Type model to enable or disable parsers. Only parsers associated with active Test_Type entries will be available for use. -There are other instructions for upgrading to 2.54.x. Check the Release Notes for the contents of the release. +## Import/reimport performance improvements + +DefectDojo 2.54.x includes performance improvements for importing and reimporting scan results, especially for large scans: + +- **Faster imports and reimports** due to fewer database queries and more bulk operations. +- **Reduced database load** during reimport matching and post-processing (helps avoid slowdowns/timeouts under heavy scan volume). +- **More efficient endpoint status updates** during reimport of dynamic findings. +- **Less churn when updating vulnerability IDs**, avoiding unnecessary deletes/writes when nothing changed. + +No action is required after upgrading. (Optional tuning knobs exist via `DD_IMPORT_REIMPORT_MATCH_BATCH_SIZE` and `DD_IMPORT_REIMPORT_DEDUPE_BATCH_SIZE`.) + +There are other instructions for upgrading to 2.54.x. Check the Release Notes for the contents of the release: `https://github.com/DefectDojo/django-DefectDojo/releases/tag/2.54.0` From bb40290764b80bd4166532c4302484471ad65523 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 14 Dec 2025 11:41:50 +0100 Subject: [PATCH 17/26] add upgrade notes --- docs/content/en/open_source/upgrading/2.54.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/content/en/open_source/upgrading/2.54.md b/docs/content/en/open_source/upgrading/2.54.md index 06741d7d9f2..82f52ede847 100644 --- a/docs/content/en/open_source/upgrading/2.54.md +++ b/docs/content/en/open_source/upgrading/2.54.md @@ -2,7 +2,7 @@ title: 'Upgrading to DefectDojo Version 2.54.x' toc_hide: true weight: -20250804 -description: Dropped support for DD_PARSER_EXCLUDE +description: Dropped support for DD_PARSER_EXCLUDE & Reimport performance improvements --- ## Breaking change: `DD_PARSER_EXCLUDE` removed From 5ea1cb14ab7ba6fe0bcd4a144f12886e796441de Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 14 Dec 2025 16:50:26 +0100 Subject: [PATCH 18/26] remove obsolete method --- dojo/finding/deduplication.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/dojo/finding/deduplication.py b/dojo/finding/deduplication.py index 10c5bf9e10f..7c4b766e8f2 100644 --- a/dojo/finding/deduplication.py +++ b/dojo/finding/deduplication.py @@ -269,15 +269,6 @@ def build_candidate_scope_queryset(test, mode="deduplication", service=None): ) -# TODO: remove this if that doesn't affect Pro -def build_dedupe_scope_queryset(test): - """ - Legacy function name for backward compatibility. - Use build_candidate_scope_queryset() instead. - """ - return build_candidate_scope_queryset(test, mode="deduplication") - - def find_candidates_for_deduplication_hash(test, findings, mode="deduplication", service=None): """ Find candidates by hash_code. Works for both deduplication and reimport. 
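The hunk above drops the `build_dedupe_scope_queryset()` wrapper in favor of `build_candidate_scope_queryset()`. For downstream code that still imports the old name (the TODO above mentions Pro), a minimal compatibility sketch, assuming only the `build_candidate_scope_queryset(test, mode=...)` signature introduced earlier in this series; the shim itself is hypothetical and not part of these patches:

    # Hypothetical shim for out-of-tree callers of the removed helper; not part of these patches.
    # It simply forwards to the replacement helper introduced earlier in this series.
    from dojo.finding.deduplication import build_candidate_scope_queryset


    def build_dedupe_scope_queryset(test):
        # The removed wrapper returned the deduplication-mode candidate scope queryset.
        return build_candidate_scope_queryset(test, mode="deduplication")

Callers that need reimport-scoped candidates instead pass mode="reimport", which restricts matching to the current test as described in the docstrings above.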
From 58d2abd48007645057d156701131a9dec3669751 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 14 Dec 2025 17:23:48 +0100 Subject: [PATCH 19/26] remove obsolete method --- dojo/finding/deduplication.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dojo/finding/deduplication.py b/dojo/finding/deduplication.py index 7c4b766e8f2..dd075be1c08 100644 --- a/dojo/finding/deduplication.py +++ b/dojo/finding/deduplication.py @@ -376,7 +376,7 @@ def find_candidates_for_deduplication_uid_or_hash(test, findings, mode="deduplic def find_candidates_for_deduplication_legacy(test, findings): - base_queryset = build_dedupe_scope_queryset(test) + base_queryset = build_candidate_scope_queryset(test, mode="deduplication") titles = {f.title for f in findings if getattr(f, "title", None)} cwes = {f.cwe for f in findings if getattr(f, "cwe", 0)} cwes.discard(0) From daac9c86c8b08a70cd3581f67d9846ab5eefae38 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 14 Dec 2025 18:32:08 +0100 Subject: [PATCH 20/26] reimport: extract method to get candidates --- dojo/importers/default_reimporter.py | 63 ++++++++++++++++++++-------- 1 file changed, 45 insertions(+), 18 deletions(-) diff --git a/dojo/importers/default_reimporter.py b/dojo/importers/default_reimporter.py index 80aad31bab0..97b7959d56d 100644 --- a/dojo/importers/default_reimporter.py +++ b/dojo/importers/default_reimporter.py @@ -159,6 +159,48 @@ def process_scan( test_import_history, ) + def get_reimport_match_candidates_for_batch( + self, + batch_findings: list[Finding], + ) -> tuple[dict, dict, dict]: + """ + Fetch candidate matches for a batch of *unsaved* findings during reimport. + + This is intentionally a separate method so downstream editions (e.g. Dojo Pro) + can override candidate retrieval without copying the full `process_findings()` + implementation. 
+ + Returns: + (candidates_by_hash, candidates_by_uid, candidates_by_key) + + """ + candidates_by_hash: dict = {} + candidates_by_uid: dict = {} + candidates_by_key: dict = {} + + if self.deduplication_algorithm == "hash_code": + candidates_by_hash = find_candidates_for_deduplication_hash( + self.test, + batch_findings, + mode="reimport", + ) + elif self.deduplication_algorithm == "unique_id_from_tool": + candidates_by_uid = find_candidates_for_deduplication_unique_id( + self.test, + batch_findings, + mode="reimport", + ) + elif self.deduplication_algorithm == "unique_id_from_tool_or_hash_code": + candidates_by_uid, candidates_by_hash = find_candidates_for_deduplication_uid_or_hash( + self.test, + batch_findings, + mode="reimport", + ) + elif self.deduplication_algorithm == "legacy": + candidates_by_key = find_candidates_for_reimport_legacy(self.test, batch_findings) + + return candidates_by_hash, candidates_by_uid, candidates_by_key + def process_findings( self, parsed_findings: list[Finding], @@ -242,24 +284,9 @@ def process_findings( deduplicationLogger.debug(f"unsaved finding's hash_code: {unsaved_finding.hash_code}") # Fetch all candidates for this batch at once (batch candidate finding) - candidates_by_hash = {} - candidates_by_uid = {} - candidates_by_key = {} - - if self.deduplication_algorithm == "hash_code": - candidates_by_hash = find_candidates_for_deduplication_hash( - self.test, batch_findings, mode="reimport", - ) - elif self.deduplication_algorithm == "unique_id_from_tool": - candidates_by_uid = find_candidates_for_deduplication_unique_id( - self.test, batch_findings, mode="reimport", - ) - elif self.deduplication_algorithm == "unique_id_from_tool_or_hash_code": - candidates_by_uid, candidates_by_hash = find_candidates_for_deduplication_uid_or_hash( - self.test, batch_findings, mode="reimport", - ) - elif self.deduplication_algorithm == "legacy": - candidates_by_key = find_candidates_for_reimport_legacy(self.test, batch_findings) + candidates_by_hash, candidates_by_uid, candidates_by_key = self.get_reimport_match_candidates_for_batch( + batch_findings, + ) # Process each finding in the batch using pre-fetched candidates for idx, unsaved_finding in enumerate(batch_findings): From edc34bb46cd96296e293dbd2f9239b9e83db63dc Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 14 Dec 2025 23:18:24 +0100 Subject: [PATCH 21/26] reimport: remove fallback to non-batch --- dojo/importers/default_reimporter.py | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/dojo/importers/default_reimporter.py b/dojo/importers/default_reimporter.py index 97b7959d56d..1abc9dadf77 100644 --- a/dojo/importers/default_reimporter.py +++ b/dojo/importers/default_reimporter.py @@ -550,27 +550,23 @@ def match_finding_for_reimport( deduplicationLogger.debug("matching finding for reimport using algorithm: %s", self.deduplication_algorithm) if self.deduplication_algorithm == "hash_code": - if candidates_by_hash is None: - # Fallback to individual query if candidates not provided - return self.match_new_finding_to_existing_finding(unsaved_finding) - if unsaved_finding.hash_code is None: + if candidates_by_hash is None or unsaved_finding.hash_code is None: return [] matches = candidates_by_hash.get(unsaved_finding.hash_code, []) return sorted(matches, key=lambda f: f.id) if self.deduplication_algorithm == "unique_id_from_tool": - if candidates_by_uid is None: - # Fallback to individual query if candidates not provided - return 
self.match_new_finding_to_existing_finding(unsaved_finding) - if unsaved_finding.unique_id_from_tool is None: + if candidates_by_uid is None or unsaved_finding.unique_id_from_tool is None: return [] matches = candidates_by_uid.get(unsaved_finding.unique_id_from_tool, []) return sorted(matches, key=lambda f: f.id) if self.deduplication_algorithm == "unique_id_from_tool_or_hash_code": - if candidates_by_hash is None or candidates_by_uid is None: - # Fallback to individual query if candidates not provided - return self.match_new_finding_to_existing_finding(unsaved_finding) + if candidates_by_hash is None and candidates_by_uid is None: + return [] + + if unsaved_finding.hash_code is None and unsaved_finding.unique_id_from_tool is None: + return [] # Collect matches from both hash_code and unique_id_from_tool matches_by_id = {} @@ -589,10 +585,7 @@ def match_finding_for_reimport( return sorted(matches, key=lambda f: f.id) if self.deduplication_algorithm == "legacy": - if candidates_by_key is None: - # Fallback to individual query if candidates not provided - return self.match_new_finding_to_existing_finding(unsaved_finding) - if not unsaved_finding.title: + if candidates_by_key is None or not unsaved_finding.title: return [] key = (unsaved_finding.title.lower(), unsaved_finding.severity) matches = candidates_by_key.get(key, []) From 3c15ebc29dd4d671c5287fd025550cb5001c2243 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 14 Dec 2025 23:18:42 +0100 Subject: [PATCH 22/26] reimport: prep for Pro overrides --- dojo/importers/base_importer.py | 1 + dojo/importers/default_reimporter.py | 117 +++++++++++++-------------- 2 files changed, 56 insertions(+), 62 deletions(-) diff --git a/dojo/importers/base_importer.py b/dojo/importers/base_importer.py index b295028ef27..5007004c635 100644 --- a/dojo/importers/base_importer.py +++ b/dojo/importers/base_importer.py @@ -271,6 +271,7 @@ def determine_process_method( def determine_deduplication_algorithm(self) -> str: """ Determines what dedupe algorithm to use for the Test being processed. + Overridden in Pro. :return: A string representing the dedupe algorithm to use. """ return self.test.deduplication_algorithm diff --git a/dojo/importers/default_reimporter.py b/dojo/importers/default_reimporter.py index 1abc9dadf77..40d1120d12c 100644 --- a/dojo/importers/default_reimporter.py +++ b/dojo/importers/default_reimporter.py @@ -170,6 +170,8 @@ def get_reimport_match_candidates_for_batch( can override candidate retrieval without copying the full `process_findings()` implementation. + Is overridden in Pro. + Returns: (candidates_by_hash, candidates_by_uid, candidates_by_key) @@ -201,6 +203,51 @@ def get_reimport_match_candidates_for_batch( return candidates_by_hash, candidates_by_uid, candidates_by_key + def add_new_finding_to_candidates( + self, + finding: Finding, + candidates_by_hash: dict, + candidates_by_uid: dict, + candidates_by_key: dict, + ) -> None: + """ + Add a newly created finding to candidate dictionaries for subsequent findings in the same batch. + + This allows duplicates within the same scan report to be detected even when they're processed + in the same batch. When a new finding is created (no match found), it is added to the candidate + dictionaries so that subsequent findings in the same batch can match against it. + + This is intentionally a separate method so downstream editions (e.g. Dojo Pro) + can override candidate addition logic without copying the full `process_findings()` + implementation. 
+ + Args: + finding: The newly created finding to add to candidates + candidates_by_hash: Dictionary mapping hash_code to list of findings (modified in-place) + candidates_by_uid: Dictionary mapping unique_id_from_tool to list of findings (modified in-place) + candidates_by_key: Dictionary mapping (title_lower, severity) to list of findings (modified in-place) + + """ + if not finding: + return + + if finding.hash_code: + candidates_by_hash.setdefault(finding.hash_code, []).append(finding) + deduplicationLogger.debug( + f"Added finding {finding.id} (hash_code: {finding.hash_code}) to candidates for next findings in this report", + ) + if finding.unique_id_from_tool: + candidates_by_uid.setdefault(finding.unique_id_from_tool, []).append(finding) + deduplicationLogger.debug( + f"Added finding {finding.id} (unique_id_from_tool: {finding.unique_id_from_tool}) to candidates for next findings in this report", + ) + if finding.title: + legacy_key = (finding.title.lower(), finding.severity) + candidates_by_key.setdefault(legacy_key, []).append(finding) + deduplicationLogger.debug( + f"Added finding {finding.id} (title: {finding.title}, severity: {finding.severity}) to candidates for next findings in this report", + ) + def process_findings( self, parsed_findings: list[Finding], @@ -293,7 +340,7 @@ def process_findings( is_final = is_final_batch and idx == len(batch_findings) - 1 # Match any findings to this new one coming in using pre-fetched candidates - matched_findings = self.match_finding_for_reimport( + matched_findings = self.match_finding_to_candidate_reimport( unsaved_finding, candidates_by_hash=candidates_by_hash, candidates_by_uid=candidates_by_uid, @@ -325,23 +372,12 @@ def process_findings( finding = self.process_finding_that_was_not_matched(unsaved_finding) # Add newly created finding to candidates for subsequent findings in this batch - if finding: - if finding.hash_code: - candidates_by_hash.setdefault(finding.hash_code, []).append(finding) - deduplicationLogger.debug( - f"Added finding {finding.id} (hash_code: {finding.hash_code}) to candidates for next findings in this report", - ) - if finding.unique_id_from_tool: - candidates_by_uid.setdefault(finding.unique_id_from_tool, []).append(finding) - deduplicationLogger.debug( - f"Added finding {finding.id} (unique_id_from_tool: {finding.unique_id_from_tool}) to candidates for next findings in this report", - ) - if finding.title: - legacy_key = (finding.title.lower(), finding.severity) - candidates_by_key.setdefault(legacy_key, []).append(finding) - deduplicationLogger.debug( - f"Added finding {finding.id} (title: {finding.title}, severity: {finding.severity}) to candidates for next findings in this report", - ) + self.add_new_finding_to_candidates( + finding, + candidates_by_hash, + candidates_by_uid, + candidates_by_key, + ) # This condition __appears__ to always be true, but am afraid to remove it if finding: @@ -483,50 +519,7 @@ def parse_findings_dynamic_test_type( logger.debug("REIMPORT_SCAN parser v2: Create parse findings") return super().parse_findings_dynamic_test_type(scan, parser) - def match_new_finding_to_existing_finding( - self, - unsaved_finding: Finding, - ) -> list[Finding]: - """Matches a single new finding to N existing findings and then returns those matches""" - # This code should match the logic used for deduplication out of the re-import feature. 
- # See utils.py deduplicate_* functions - deduplicationLogger.debug("return findings bases on algorithm: %s", self.deduplication_algorithm) - if self.deduplication_algorithm == "hash_code": - return Finding.objects.filter( - test=self.test, - hash_code=unsaved_finding.hash_code, - ).exclude(hash_code=None).order_by("id") - if self.deduplication_algorithm == "unique_id_from_tool": - deduplicationLogger.debug(f"unique_id_from_tool: {unsaved_finding.unique_id_from_tool}") - return Finding.objects.filter( - test=self.test, - unique_id_from_tool=unsaved_finding.unique_id_from_tool, - ).exclude(unique_id_from_tool=None).order_by("id") - if self.deduplication_algorithm == "unique_id_from_tool_or_hash_code": - deduplicationLogger.debug(f"unique_id_from_tool: {unsaved_finding.unique_id_from_tool}") - deduplicationLogger.debug(f"hash_code: {unsaved_finding.hash_code}") - query = Finding.objects.filter( - Q(test=self.test), - (Q(hash_code__isnull=False) & Q(hash_code=unsaved_finding.hash_code)) - | (Q(unique_id_from_tool__isnull=False) & Q(unique_id_from_tool=unsaved_finding.unique_id_from_tool)), - ).order_by("id") - deduplicationLogger.debug(query.query) - return query - if self.deduplication_algorithm == "legacy": - # This is the legacy reimport behavior. Although it's pretty flawed and doesn't match the legacy algorithm for deduplication, - # this is left as is for simplicity. - # Re-writing the legacy deduplication here would be complicated and counter-productive. - # If you have use cases going through this section, you're advised to create a deduplication configuration for your parser - logger.warning("Legacy reimport. In case of issue, you're advised to create a deduplication configuration in order not to go through this section") - return Finding.objects.filter( - title__iexact=unsaved_finding.title, - test=self.test, - severity=unsaved_finding.severity, - numerical_severity=Finding.get_numerical_severity(unsaved_finding.severity)).order_by("id") - logger.error(f'Internal error: unexpected deduplication_algorithm: "{self.deduplication_algorithm}"') - return None - - def match_finding_for_reimport( + def match_finding_to_candidate_reimport( self, unsaved_finding: Finding, candidates_by_hash: dict | None = None, From 0d99393b836d941edc491fbd75a94d22d53abbf9 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 14 Dec 2025 23:22:53 +0100 Subject: [PATCH 23/26] add comment --- dojo/importers/default_reimporter.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dojo/importers/default_reimporter.py b/dojo/importers/default_reimporter.py index 40d1120d12c..9546156f578 100644 --- a/dojo/importers/default_reimporter.py +++ b/dojo/importers/default_reimporter.py @@ -221,6 +221,8 @@ def add_new_finding_to_candidates( can override candidate addition logic without copying the full `process_findings()` implementation. 
+ Is overriden in Pro + Args: finding: The newly created finding to add to candidates candidates_by_hash: Dictionary mapping hash_code to list of findings (modified in-place) From 197fe43bc5d7ab7e4f9e4da05da2a715aa58f08c Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 14 Dec 2025 23:23:22 +0100 Subject: [PATCH 24/26] add comment --- dojo/importers/default_reimporter.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/dojo/importers/default_reimporter.py b/dojo/importers/default_reimporter.py index 9546156f578..14a7da7e21d 100644 --- a/dojo/importers/default_reimporter.py +++ b/dojo/importers/default_reimporter.py @@ -217,10 +217,6 @@ def add_new_finding_to_candidates( in the same batch. When a new finding is created (no match found), it is added to the candidate dictionaries so that subsequent findings in the same batch can match against it. - This is intentionally a separate method so downstream editions (e.g. Dojo Pro) - can override candidate addition logic without copying the full `process_findings()` - implementation. - Is overriden in Pro Args: From e8f60991cc0c57414ce304286262e569d81c0703 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Wed, 17 Dec 2025 14:46:14 +0100 Subject: [PATCH 25/26] update counts and script --- scripts/update_performance_test_counts.py | 130 +++++++++++++++++----- unittests/test_importers_performance.py | 16 +-- 2 files changed, 110 insertions(+), 36 deletions(-) diff --git a/scripts/update_performance_test_counts.py b/scripts/update_performance_test_counts.py index f63e040fb3a..867c01bb280 100644 --- a/scripts/update_performance_test_counts.py +++ b/scripts/update_performance_test_counts.py @@ -11,19 +11,20 @@ How to run: - # Step 1: Run tests and generate report (uses TestDojoImporterPerformanceSmall by default) - python3 scripts/update_performance_test_counts.py --report-only + # Default: Update the test file (uses TestDojoImporterPerformanceSmall by default) + python3 scripts/update_performance_test_counts.py # Or specify a different test class: - python3 scripts/update_performance_test_counts.py --test-class TestDojoImporterPerformanceSmall --report-only + python3 scripts/update_performance_test_counts.py --test-class TestDojoImporterPerformanceSmall - # Step 2: Review the report, then update the file - python3 scripts/update_performance_test_counts.py --update + # Step 1: Run tests and generate report only (without updating) + python3 scripts/update_performance_test_counts.py --report-only - # Step 3: Verify all tests pass + # Step 2: Verify all tests pass python3 scripts/update_performance_test_counts.py --verify The script defaults to TestDojoImporterPerformanceSmall if --test-class is not provided. +The script defaults to --update behavior if no action flag is provided. 
""" import argparse @@ -55,18 +56,80 @@ def __repr__(self): ) -def run_tests(test_class: str) -> str: - """Run the performance tests and return the output.""" +def run_tests(test_class: str) -> tuple[str, int]: + """Run the performance tests and return the output and return code.""" print(f"Running tests for {test_class}...") cmd = [ "./run-unittest.sh", "--test-case", f"unittests.test_importers_performance.{test_class}", ] - result = subprocess.run( - cmd, check=False, capture_output=True, text=True, cwd=Path(__file__).parent.parent, + + # Run with real-time output streaming + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + bufsize=1, + cwd=Path(__file__).parent.parent, ) - return result.stdout + result.stderr + + output_lines = [] + for line in process.stdout: + print(line, end="") # Print in real-time + output_lines.append(line) + + process.wait() + output = "".join(output_lines) + + return output, process.returncode + + +def check_test_execution_success(output: str, return_code: int) -> tuple[bool, str]: + """Check if tests executed successfully or failed due to other reasons.""" + # Check for migration errors + migration_error_patterns = [ + r"django\.db\.migrations\.exceptions\.", + r"Migration.*failed", + r"django\.core\.management\.base\.CommandError", + r"OperationalError", + r"ProgrammingError", + r"relation.*does not exist", + r"no such table", + ] + + for pattern in migration_error_patterns: + if re.search(pattern, output, re.IGNORECASE): + return False, f"Migration or database error detected: {pattern}" + + # Check if any tests actually ran + test_run_patterns = [ + r"Ran \d+ test", + r"OK", + r"FAILED", + r"FAIL:", + r"test_\w+", + ] + + tests_ran = any(re.search(pattern, output) for pattern in test_run_patterns) + + if not tests_ran and return_code != 0: + return False, "Tests did not run successfully. Check the output above for errors." 
+ + # Check for other critical errors + critical_error_patterns = [ + r"ImportError", + r"ModuleNotFoundError", + r"SyntaxError", + r"IndentationError", + ] + + for pattern in critical_error_patterns: + if re.search(pattern, output): + return False, f"Critical error detected: {pattern}" + + return True, "" def parse_test_output(output: str) -> list[TestCount]: @@ -249,17 +312,23 @@ def replace(match): def verify_tests(test_class: str) -> bool: """Run tests to verify they all pass.""" print(f"Verifying tests for {test_class}...") - output = run_tests(test_class) + output, return_code = run_tests(test_class) + + success, error_msg = check_test_execution_success(output, return_code) + if not success: + print(f"\n❌ Test execution failed: {error_msg}") + return False + counts = parse_test_output(output) if counts: - print("❌ Some tests still have count mismatches:") + print("\n❌ Some tests still have count mismatches:") for count in counts: print(f" {count.test_name} - {count.step} {count.metric}: " f"expected {count.expected}, got {count.actual}") return False else: # noqa: RET505 - print("✅ All tests pass!") + print("\n✅ All tests pass!") return True @@ -283,7 +352,7 @@ def main(): parser.add_argument( "--update", action="store_true", - help="Update the test file with new counts", + help="Update the test file with new counts (default behavior if no action flag is provided)", ) parser.add_argument( "--verify", @@ -295,29 +364,34 @@ def main(): if args.report_only: # Step 1: Run tests and generate report - output = run_tests(args.test_class) + output, return_code = run_tests(args.test_class) + success, error_msg = check_test_execution_success(output, return_code) + if not success: + print(f"\n❌ Test execution failed: {error_msg}") + sys.exit(1) counts = parse_test_output(output) expected_counts = extract_expected_counts_from_file(args.test_class) generate_report(counts, expected_counts) - elif args.update: - # Step 2: Update the file - output = run_tests(args.test_class) - counts = parse_test_output(output) - if counts: - update_test_file(counts) - print("\nNext step: Run --verify to ensure all tests pass") - else: - print("No differences found. All tests are already up to date.") - elif args.verify: # Step 3: Verify success = verify_tests(args.test_class) sys.exit(0 if success else 1) else: - parser.print_help() - sys.exit(1) + # Default: Update the file (--update is the default behavior) + output, return_code = run_tests(args.test_class) + success, error_msg = check_test_execution_success(output, return_code) + if not success: + print(f"\n❌ Test execution failed: {error_msg}") + print("Cannot update counts because tests did not run successfully.") + sys.exit(1) + counts = parse_test_output(output) + if counts: + update_test_file(counts) + print("\nNext step: Run --verify to ensure all tests pass") + else: + print("\nNo differences found. 
All tests are already up to date.") if __name__ == "__main__": diff --git a/unittests/test_importers_performance.py b/unittests/test_importers_performance.py index 81cc0935594..b2dc1c07192 100644 --- a/unittests/test_importers_performance.py +++ b/unittests/test_importers_performance.py @@ -267,9 +267,9 @@ def test_import_reimport_reimport_performance_pghistory_async(self): self._import_reimport_performance( expected_num_queries1=306, expected_num_async_tasks1=7, - expected_num_queries2=231, + expected_num_queries2=232, expected_num_async_tasks2=18, - expected_num_queries3=113, + expected_num_queries3=114, expected_num_async_tasks3=17, ) @@ -287,11 +287,11 @@ def test_import_reimport_reimport_performance_pghistory_no_async(self): testuser.usercontactinfo.save() self._import_reimport_performance( - expected_num_queries1=312, + expected_num_queries1=313, expected_num_async_tasks1=6, - expected_num_queries2=287, + expected_num_queries2=239, expected_num_async_tasks2=17, - expected_num_queries3=176, + expected_num_queries3=121, expected_num_async_tasks3=16, ) @@ -310,11 +310,11 @@ def test_import_reimport_reimport_performance_pghistory_no_async_with_product_gr self.system_settings(enable_product_grade=True) self._import_reimport_performance( - expected_num_queries1=314, + expected_num_queries1=315, expected_num_async_tasks1=8, - expected_num_queries2=289, + expected_num_queries2=241, expected_num_async_tasks2=19, - expected_num_queries3=178, + expected_num_queries3=123, expected_num_async_tasks3=18, ) From 71d2e9192528704675237ca2ef172126e8386dce Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Wed, 17 Dec 2025 15:13:56 +0100 Subject: [PATCH 26/26] update counts and script --- scripts/update_performance_test_counts.py | 421 ++++++++++++++++++---- unittests/test_importers_performance.py | 5 +- 2 files changed, 348 insertions(+), 78 deletions(-) diff --git a/scripts/update_performance_test_counts.py b/scripts/update_performance_test_counts.py index 867c01bb280..f7cfaae2859 100644 --- a/scripts/update_performance_test_counts.py +++ b/scripts/update_performance_test_counts.py @@ -56,8 +56,62 @@ def __repr__(self): ) +def extract_test_methods(test_class: str) -> list[str]: + """Extract all test method names from the test class.""" + if not TEST_FILE.exists(): + msg = f"Test file not found: {TEST_FILE}" + raise FileNotFoundError(msg) + + content = TEST_FILE.read_text() + + # Find the test class definition + class_pattern = re.compile( + rf"class {re.escape(test_class)}.*?(?=class |\Z)", + re.DOTALL, + ) + class_match = class_pattern.search(content) + if not class_match: + return [] + + class_content = class_match.group(0) + + # Find all test methods in this class + test_method_pattern = re.compile(r"def (test_\w+)\(") + return test_method_pattern.findall(class_content) + + +def run_test_method(test_class: str, test_method: str) -> tuple[str, int]: + """Run a specific test method and return the output and return code.""" + print(f"Running {test_class}.{test_method}...") + cmd = [ + "./run-unittest.sh", + "--test-case", + f"unittests.test_importers_performance.{test_class}.{test_method}", + ] + + # Run with real-time output streaming + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + bufsize=1, + cwd=Path(__file__).parent.parent, + ) + + output_lines = [] + for line in process.stdout: + print(line, end="") # Print in real-time + output_lines.append(line) + + process.wait() + output = "".join(output_lines) + + return output, 
process.returncode + + def run_tests(test_class: str) -> tuple[str, int]: - """Run the performance tests and return the output and return code.""" + """Run all tests in a test class and return the output and return code.""" print(f"Running tests for {test_class}...") cmd = [ "./run-unittest.sh", @@ -136,47 +190,103 @@ def parse_test_output(output: str) -> list[TestCount]: """Parse test output to extract actual vs expected counts.""" counts = [] - # Pattern to match AssertionError with query counts - # Format: AssertionError: 118 != 120 : 118 queries executed, 120 expected - assertion_pattern = re.compile( - r"AssertionError: (\d+) != (\d+) : (\d+) queries executed, (\d+) expected", + # Debug: Save a sample of the output to help diagnose parsing issues + if "FAIL:" in output: + # Extract failure sections for debugging + fail_sections = [] + lines = output.split("\n") + in_fail_section = False + fail_section = [] + for line in lines: + if "FAIL:" in line: + if fail_section: + fail_sections.append("\n".join(fail_section)) + fail_section = [line] + in_fail_section = True + elif in_fail_section: + fail_section.append(line) + # Stop collecting after AssertionError line or after 5 more lines + if "AssertionError:" in line or len(fail_section) > 6: + fail_sections.append("\n".join(fail_section)) + fail_section = [] + in_fail_section = False + if fail_section: + fail_sections.append("\n".join(fail_section)) + + if fail_sections: + print(f"\n🔍 Found {len(fail_sections)} failure section(s) in output") + + # The test output format is: + # FAIL: test_name (step='import1', metric='queries') + # AssertionError: 118 != 120 : 118 queries executed, 120 expected + # OR for async tasks: + # FAIL: test_name (step='import1', metric='async_tasks') + # AssertionError: 7 != 8 : 7 async tasks executed, 8 expected + + # Pattern to match the full failure block: + # FAIL: test_name (full.path.to.test) (step='...', metric='...') + # AssertionError: actual != expected : actual ... 
executed, expected expected + # The test name may include the full path in parentheses, so we extract just the method name + failure_pattern = re.compile( + r"FAIL:\s+(test_\w+)\s+\([^)]+\)\s+\(step=['\"](\w+)['\"],\s*metric=['\"](\w+)['\"]\)\s*\n" + r".*?AssertionError:\s+(\d+)\s+!=\s+(\d+)\s+:\s+\d+\s+(?:queries|async tasks?)\s+executed,\s+\d+\s+expected", + re.MULTILINE | re.DOTALL, ) - # Pattern to match test name and step - # Format: (step='reimport2', metric='queries') - step_pattern = re.compile(r"step='(\w+)', metric='(\w+)'") - - # Pattern to match test name - # Format: FAIL: test_import_reimport_reimport_performance_async - test_pattern = re.compile(r"FAIL: (test_\w+)") - - lines = output.split("\n") - current_test = None - current_step = None - current_metric = None - - for line in lines: - # Match test name - test_match = test_pattern.search(line) - if test_match: - current_test = test_match.group(1) - - # Match step and metric - step_match = step_pattern.search(line) - if step_match: - current_step = step_match.group(1) - current_metric = step_match.group(2) - - # Match assertion error - assertion_match = assertion_pattern.search(line) - if assertion_match and current_test and current_step and current_metric: - actual = int(assertion_match.group(1)) - expected = int(assertion_match.group(2)) - count = TestCount(current_test, current_step, current_metric) - count.actual = actual - count.expected = expected - count.difference = expected - actual - counts.append(count) + for match in failure_pattern.finditer(output): + test_name = match.group(1) + step = match.group(2) + metric = match.group(3) + actual = int(match.group(4)) + expected = int(match.group(5)) + + count = TestCount(test_name, step, metric) + count.actual = actual + count.expected = expected + count.difference = expected - actual + counts.append(count) + + # Also try a simpler pattern in case the format is slightly different + if not counts: + # Look for lines with step/metric followed by AssertionError on nearby lines + lines = output.split("\n") + i = 0 + while i < len(lines): + line = lines[i] + + # Look for FAIL: test_name (may include full path in parentheses) + # Format: FAIL: test_name (full.path) (step='...', metric='...') + fail_match = re.search(r"FAIL:\s+(test_\w+)\s+\([^)]+\)\s+\(step=['\"](\w+)['\"],\s*metric=['\"](\w+)['\"]\)", line) + if fail_match: + test_name = fail_match.group(1) + step = fail_match.group(2) + metric = fail_match.group(3) + # Look ahead for AssertionError + for j in range(i, min(i + 15, len(lines))): + assertion_match = re.search( + r"AssertionError:\s+(\d+)\s+!=\s+(\d+)\s+:\s+\d+\s+(?:queries|async tasks?)\s+executed,\s+\d+\s+expected", + lines[j], + ) + + if assertion_match: + actual = int(assertion_match.group(1)) + expected = int(assertion_match.group(2)) + + count = TestCount(test_name, step, metric) + count.actual = actual + count.expected = expected + count.difference = expected - actual + counts.append(count) + break + i += 1 + + if counts: + print(f"\n📊 Parsed {len(counts)} count mismatch(es) from test output:") + for count in counts: + print(f" {count.test_name} - {count.step} {count.metric}: {count.actual} != {count.expected}") + elif "FAIL:" in output: + print("\n⚠️ WARNING: Found FAIL in output but couldn't parse any count mismatches!") + print("This might indicate a parsing issue. 
Check the output above.") return counts @@ -276,32 +386,137 @@ def update_test_file(counts: list[TestCount]): step_metric = f"{count.step}_{count.metric}" updates[count.test_name][step_metric] = count.actual + # Map step_metric to parameter name for different methods + param_map_import_reimport = { + "import1_queries": "expected_num_queries1", + "import1_async_tasks": "expected_num_async_tasks1", + "reimport1_queries": "expected_num_queries2", + "reimport1_async_tasks": "expected_num_async_tasks2", + "reimport2_queries": "expected_num_queries3", + "reimport2_async_tasks": "expected_num_async_tasks3", + } + param_map_deduplication = { + "first_import_queries": "expected_num_queries1", + "first_import_async_tasks": "expected_num_async_tasks1", + "second_import_queries": "expected_num_queries2", + "second_import_async_tasks": "expected_num_async_tasks2", + } + # Update each test method for test_name, test_updates in updates.items(): - # Find the test method and update expected_num_queries/async_tasks values - # Map step_metric to parameter name - param_map = { - "import1_queries": "expected_num_queries1", - "import1_async_tasks": "expected_num_async_tasks1", - "reimport1_queries": "expected_num_queries2", - "reimport1_async_tasks": "expected_num_async_tasks2", - "reimport2_queries": "expected_num_queries3", - "reimport2_async_tasks": "expected_num_async_tasks3", - } + print(f" Updating {test_name}...") + # Find the test method boundaries + test_method_pattern = re.compile( + rf"(def {re.escape(test_name)}\([^)]*\):.*?)(?=def test_|\Z)", + re.DOTALL, + ) + test_match = test_method_pattern.search(content) + if not test_match: + print(f"⚠️ Warning: Could not find test method {test_name}") + continue + + test_method_content = test_match.group(1) + test_method_start = test_match.start() + test_method_end = test_match.end() + + # Try to find _import_reimport_performance call first + perf_call_pattern_import_reimport = re.compile( + r"(self\._import_reimport_performance\s*\(\s*)" + r"expected_num_queries1\s*=\s*(\d+)\s*,\s*" + r"expected_num_async_tasks1\s*=\s*(\d+)\s*,\s*" + r"expected_num_queries2\s*=\s*(\d+)\s*,\s*" + r"expected_num_async_tasks2\s*=\s*(\d+)\s*,\s*" + r"expected_num_queries3\s*=\s*(\d+)\s*,\s*" + r"expected_num_async_tasks3\s*=\s*(\d+)\s*," + r"(\s*\))", + re.DOTALL, + ) - for step_metric, new_value in test_updates.items(): - param_name = param_map.get(step_metric) - if param_name: - # Pattern to match: expected_num_queries3=120, (with flexible whitespace) - pattern = re.compile( - rf"(def {re.escape(test_name)}\([^)]*\):.*?{re.escape(param_name)}\s*=\s*)(\d+)(\s*,)", - re.DOTALL, - ) + # Try to find _deduplication_performance call + perf_call_pattern_deduplication = re.compile( + r"(self\._deduplication_performance\s*\(\s*)" + r"expected_num_queries1\s*=\s*(\d+)\s*,\s*" + r"expected_num_async_tasks1\s*=\s*(\d+)\s*,\s*" + r"expected_num_queries2\s*=\s*(\d+)\s*,\s*" + r"expected_num_async_tasks2\s*=\s*(\d+)\s*," + r"(\s*\))", + re.DOTALL, + ) - def replace(match): - return f"{match.group(1)}{new_value}{match.group(3)}" # noqa: B023 + perf_match = perf_call_pattern_import_reimport.search(test_method_content) + method_type = "import_reimport" + param_map = param_map_import_reimport + param_order = [ + "import1_queries", + "import1_async_tasks", + "reimport1_queries", + "reimport1_async_tasks", + "reimport2_queries", + "reimport2_async_tasks", + ] + + if not perf_match: + perf_match = perf_call_pattern_deduplication.search(test_method_content) + if perf_match: + method_type = 
"deduplication" + param_map = param_map_deduplication + param_order = [ + "first_import_queries", + "first_import_async_tasks", + "second_import_queries", + "second_import_async_tasks", + ] + else: + print(f"⚠️ Warning: Could not find _import_reimport_performance or _deduplication_performance call in {test_name}") + continue + + # Get the indentation from the original call (first line after opening paren) + call_lines = test_method_content[perf_match.start():perf_match.end()].split("\n") + indent = "" + for line in call_lines: + if "expected_num_queries1" in line: + # Extract indentation (spaces before the parameter) + indent_match = re.match(r"(\s*)expected_num_queries1", line) + if indent_match: + indent = indent_match.group(1) + break + + # If we couldn't find indentation, use a default + if not indent: + indent = " " # 12 spaces default + + replacement_parts = [perf_match.group(1)] # Opening: "self._import_reimport_performance(" + updated_params = [] + for i, step_metric in enumerate(param_order): + param_name = param_map[step_metric] + old_value = int(perf_match.group(i + 2)) # +2 because group 1 is the opening + if step_metric in test_updates: + new_value = test_updates[step_metric] + if old_value != new_value: + updated_params.append(f"{param_name}: {old_value} → {new_value}") + else: + # Keep the existing value + new_value = old_value + + replacement_parts.append(f"{indent}{param_name}={new_value},") + + # Closing parenthesis - group number depends on method type + closing_group = 8 if method_type == "import_reimport" else 6 + replacement_parts.append(perf_match.group(closing_group)) # Closing parenthesis + replacement = "\n".join(replacement_parts) + + if updated_params: + print(f" Updated: {', '.join(updated_params)}") + + # Replace the method call within the test method content + updated_method_content = ( + test_method_content[: perf_match.start()] + + replacement + + test_method_content[perf_match.end() :] + ) - content = pattern.sub(replace, content) + # Replace the entire test method in the original content + content = content[:test_method_start] + updated_method_content + content[test_method_end:] # Write back to file TEST_FILE.write_text(content) @@ -364,14 +579,31 @@ def main(): if args.report_only: # Step 1: Run tests and generate report - output, return_code = run_tests(args.test_class) - success, error_msg = check_test_execution_success(output, return_code) - if not success: - print(f"\n❌ Test execution failed: {error_msg}") + # Run each test method individually + test_methods = extract_test_methods(args.test_class) + if not test_methods: + print(f"⚠️ No test methods found in {args.test_class}") sys.exit(1) - counts = parse_test_output(output) + + print(f"\nFound {len(test_methods)} test method(s) in {args.test_class}") + print("=" * 80) + + all_counts = [] + for test_method in test_methods: + print(f"\n{'=' * 80}") + output, return_code = run_test_method(args.test_class, test_method) + success, error_msg = check_test_execution_success(output, return_code) + if not success: + print(f"\n⚠️ Test execution failed for {test_method}: {error_msg}") + print("Skipping this test method...") + continue + + counts = parse_test_output(output) + if counts: + all_counts.extend(counts) + expected_counts = extract_expected_counts_from_file(args.test_class) - generate_report(counts, expected_counts) + generate_report(all_counts, expected_counts) elif args.verify: # Step 3: Verify @@ -380,18 +612,55 @@ def main(): else: # Default: Update the file (--update is the default behavior) - 
output, return_code = run_tests(args.test_class) - success, error_msg = check_test_execution_success(output, return_code) - if not success: - print(f"\n❌ Test execution failed: {error_msg}") - print("Cannot update counts because tests did not run successfully.") + # Run each test method individually + test_methods = extract_test_methods(args.test_class) + if not test_methods: + print(f"⚠️ No test methods found in {args.test_class}") sys.exit(1) - counts = parse_test_output(output) - if counts: - update_test_file(counts) + + print(f"\nFound {len(test_methods)} test method(s) in {args.test_class}") + print("=" * 80) + + all_counts = [] + for test_method in test_methods: + print(f"\n{'=' * 80}") + output, return_code = run_test_method(args.test_class, test_method) + success, error_msg = check_test_execution_success(output, return_code) + if not success: + print(f"\n⚠️ Test execution failed for {test_method}: {error_msg}") + print("Skipping this test method...") + continue + + counts = parse_test_output(output) + + # Check if test actually passed + test_passed = "OK" in output or ("Ran" in output and "FAILED" not in output and return_code == 0) + + if counts: + all_counts.extend(counts) + # Update immediately after each test + update_test_file(counts) + print(f"⚠️ {test_method}: Found {len(counts)} count mismatch(es) - updated file") + elif test_passed: + print(f"✅ {test_method}: Test passed, all counts match") + elif return_code != 0: + # Test might have failed for other reasons + print(f"⚠️ {test_method}: Test failed (exit code {return_code}) but no count mismatches parsed") + print(" This might indicate a parsing issue or a different type of failure") + # Show a snippet of the output to help debug + fail_lines = [line for line in output.split("\n") if "FAIL" in line or "Error" in line or "Exception" in line] + if fail_lines: + print(" Relevant error lines:") + for line in fail_lines[:5]: + print(f" {line}") + + if all_counts: + print(f"\n{'=' * 80}") + print(f"✅ Updated {len(all_counts)} count(s) across {len({c.test_name for c in all_counts})} test(s)") print("\nNext step: Run --verify to ensure all tests pass") else: - print("\nNo differences found. All tests are already up to date.") + print(f"\n{'=' * 80}") + print("\n✅ No differences found. All tests are already up to date.") if __name__ == "__main__": diff --git a/unittests/test_importers_performance.py b/unittests/test_importers_performance.py index b2dc1c07192..1e7b05d8fe5 100644 --- a/unittests/test_importers_performance.py +++ b/unittests/test_importers_performance.py @@ -454,8 +454,9 @@ def test_deduplication_performance_pghistory_no_async(self): testuser.usercontactinfo.save() self._deduplication_performance( - expected_num_queries1=281, + expected_num_queries1=282, expected_num_async_tasks1=7, - expected_num_queries2=245, + expected_num_queries2=246, expected_num_async_tasks2=7, + )