From 69549dfcb7f8ff11bcf730922be03720aa33d24e Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Fri, 12 Dec 2025 17:03:47 +0100 Subject: [PATCH 01/26] fix logger NoneType --- dojo/models.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dojo/models.py b/dojo/models.py index 282a8c4d667..e547940dd0e 100644 --- a/dojo/models.py +++ b/dojo/models.py @@ -3505,11 +3505,12 @@ def set_hash_code(self, dedupe_option): # Finding.save is called once from serializers.py with dedupe_option=False because the finding is not ready yet, for example the endpoints are not built # It is then called a second time with dedupe_option defaulted to true; now we can compute the hash_code and run the deduplication elif dedupe_option: + finding_id = self.id if self.id is not None else "unsaved" if self.hash_code is not None: - deduplicationLogger.debug("Hash_code already computed for finding %i", self.id) + deduplicationLogger.debug("Hash_code already computed for finding: %s", finding_id) else: self.hash_code = self.compute_hash_code() - deduplicationLogger.debug("Hash_code computed for finding %i: %s", self.id, self.hash_code) + deduplicationLogger.debug("Hash_code computed for finding: %s: %s", finding_id, self.hash_code) class FindingAdmin(admin.ModelAdmin): From 0ab7a50fd93b192a59f798913401f6ca672c67a8 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sat, 13 Dec 2025 12:18:11 +0100 Subject: [PATCH 02/26] reimport: use batch matching --- dojo/finding/deduplication.py | 175 ++++++++++-- dojo/importers/default_reimporter.py | 287 ++++++++++++++----- dojo/management/commands/list_top_tests.py | 111 ++++++++ dojo/settings/settings.dist.py | 3 + unittests/test_import_reimport.py | 317 ++++++++++++++++++++- unittests/test_importers_performance.py | 130 ++++++--- 6 files changed, 872 insertions(+), 151 deletions(-) create mode 100644 dojo/management/commands/list_top_tests.py diff --git a/dojo/finding/deduplication.py b/dojo/finding/deduplication.py index 7f334236dbf..b51c791c199 100644 --- a/dojo/finding/deduplication.py +++ b/dojo/finding/deduplication.py @@ -225,59 +225,122 @@ def are_endpoints_duplicates(new_finding, to_duplicate_finding): return False -def build_dedupe_scope_queryset(test): - scope_on_engagement = test.engagement.deduplication_on_engagement - if scope_on_engagement: - scope_q = Q(test__engagement=test.engagement) - else: - # Product scope limited to current product, but exclude engagements that opted into engagement-scoped dedupe - scope_q = Q(test__engagement__product=test.engagement.product) & ( - Q(test__engagement=test.engagement) - | Q(test__engagement__deduplication_on_engagement=False) - ) +def build_candidate_scope_queryset(test, mode="deduplication", service=None): + """ + Build a queryset for candidate finding. + + Args: + test: The test to scope from + mode: "deduplication" (can match across tests) or "reimport" (same test only) + service: Optional service filter (for deduplication mode, not used for reimport since service is in hash) + + """ + if mode == "reimport": + # For reimport, only filter by test. Service filtering is not needed because + # service is included in hash_code calculation (HASH_CODE_FIELDS_ALWAYS = ["service"]), + # so matching by hash_code automatically ensures correct service match. 
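An editor's aside to make the comment above concrete: because the service value feeds into the hash_code, two findings that differ only in service can never share a hash, so a hash_code lookup already separates services. The toy_hash_code helper below is purely illustrative and is not DefectDojo's real compute_hash_code.

import hashlib

def toy_hash_code(title: str, service: str | None) -> str:
    # fold the service into the hash input, as HASH_CODE_FIELDS_ALWAYS does for real findings
    return hashlib.sha256(f"{title}|{service or ''}".encode()).hexdigest()

# same title, different service -> different hash, so matching by hash never crosses services
assert toy_hash_code("SQL injection in /login", "billing") != toy_hash_code("SQL injection in /login", "payments")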
+ queryset = Finding.objects.filter(test=test) + else: # deduplication mode + scope_on_engagement = test.engagement.deduplication_on_engagement + if scope_on_engagement: + scope_q = Q(test__engagement=test.engagement) + else: + # Product scope limited to current product, but exclude engagements that opted into engagement-scoped dedupe + scope_q = Q(test__engagement__product=test.engagement.product) & ( + Q(test__engagement=test.engagement) + | Q(test__engagement__deduplication_on_engagement=False) + ) + queryset = Finding.objects.filter(scope_q) return ( - Finding.objects.filter(scope_q) + queryset .select_related("test", "test__engagement", "test__test_type") .prefetch_related("endpoints") ) -def find_candidates_for_deduplication_hash(test, findings): - base_queryset = build_dedupe_scope_queryset(test) +# TODO: remove this if that doesn't affect Pro +def build_dedupe_scope_queryset(test): + """ + Legacy function name for backward compatibility. + Use build_candidate_scope_queryset() instead. + """ + return build_candidate_scope_queryset(test, mode="deduplication") + + +def find_candidates_for_deduplication_hash(test, findings, mode="deduplication", service=None): + """ + Find candidates by hash_code. Works for both deduplication and reimport. + + Args: + test: The test to scope from + findings: List of findings to find candidates for + mode: "deduplication" or "reimport" + service: Optional service filter (for deduplication mode, not used for reimport since service is in hash) + + """ + base_queryset = build_candidate_scope_queryset(test, mode=mode, service=service) hash_codes = {f.hash_code for f in findings if getattr(f, "hash_code", None) is not None} if not hash_codes: return {} - existing_qs = ( - base_queryset.filter(hash_code__in=hash_codes) - .exclude(hash_code=None) - .exclude(duplicate=True) - .order_by("id") - ) + + existing_qs = base_queryset.filter(hash_code__in=hash_codes).exclude(hash_code=None) + if mode == "deduplication": + existing_qs = existing_qs.exclude(duplicate=True) + existing_qs = existing_qs.order_by("id") + existing_by_hash = {} for ef in existing_qs: existing_by_hash.setdefault(ef.hash_code, []).append(ef) - deduplicationLogger.debug(f"Found {len(existing_by_hash)} existing findings by hash codes") + + log_msg = "for reimport" if mode == "reimport" else "" + deduplicationLogger.debug(f"Found {len(existing_by_hash)} existing findings by hash codes {log_msg}") return existing_by_hash -def find_candidates_for_deduplication_unique_id(test, findings): - base_queryset = build_dedupe_scope_queryset(test) +def find_candidates_for_deduplication_unique_id(test, findings, mode="deduplication", service=None): + """ + Find candidates by unique_id_from_tool. Works for both deduplication and reimport. 
+ + Args: + test: The test to scope from + findings: List of findings to find candidates for + mode: "deduplication" or "reimport" + service: Optional service filter (for deduplication mode, not used for reimport since service is in hash) + + """ + base_queryset = build_candidate_scope_queryset(test, mode=mode, service=service) unique_ids = {f.unique_id_from_tool for f in findings if getattr(f, "unique_id_from_tool", None) is not None} if not unique_ids: return {} - existing_qs = base_queryset.filter(unique_id_from_tool__in=unique_ids).exclude(unique_id_from_tool=None).exclude(duplicate=True).order_by("id") + + existing_qs = base_queryset.filter(unique_id_from_tool__in=unique_ids).exclude(unique_id_from_tool=None) + if mode == "deduplication": + existing_qs = existing_qs.exclude(duplicate=True) # unique_id_from_tool can only apply to the same test_type because it is parser dependent - existing_qs = existing_qs.filter(test__test_type=test.test_type) + existing_qs = existing_qs.filter(test__test_type=test.test_type).order_by("id") + existing_by_uid = {} for ef in existing_qs: existing_by_uid.setdefault(ef.unique_id_from_tool, []).append(ef) - deduplicationLogger.debug(f"Found {len(existing_by_uid)} existing findings by unique IDs") + + log_msg = "for reimport" if mode == "reimport" else "" + deduplicationLogger.debug(f"Found {len(existing_by_uid)} existing findings by unique IDs {log_msg}") return existing_by_uid -def find_candidates_for_deduplication_uid_or_hash(test, findings): - base_queryset = build_dedupe_scope_queryset(test) +def find_candidates_for_deduplication_uid_or_hash(test, findings, mode="deduplication", service=None): + """ + Find candidates by unique_id_from_tool or hash_code. Works for both deduplication and reimport. + + Args: + test: The test to scope from + findings: List of findings to find candidates for + mode: "deduplication" or "reimport" + service: Optional service filter (for deduplication mode, not used for reimport since service is in hash) + + """ + base_queryset = build_candidate_scope_queryset(test, mode=mode, service=service) hash_codes = {f.hash_code for f in findings if getattr(f, "hash_code", None) is not None} unique_ids = {f.unique_id_from_tool for f in findings if getattr(f, "unique_id_from_tool", None) is not None} if not hash_codes and not unique_ids: @@ -291,7 +354,11 @@ def find_candidates_for_deduplication_uid_or_hash(test, findings): uid_q = Q(unique_id_from_tool__isnull=False, unique_id_from_tool__in=unique_ids) & Q(test__test_type=test.test_type) cond |= uid_q - existing_qs = base_queryset.filter(cond).exclude(duplicate=True).order_by("id") + existing_qs = base_queryset.filter(cond) + if mode == "deduplication": + # reimport matching will match against duplicates, import/deduplication doesn't. 
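Editor's sketch of the behavioural difference this mode flag encodes, using a throwaway Candidate dataclass rather than the real Finding model: during import/deduplication a finding already marked as duplicate must not become a match target, while reimport may legitimately re-match it.

from dataclasses import dataclass

@dataclass
class Candidate:
    id: int
    duplicate: bool

def eligible_candidates(candidates, mode):
    if mode == "deduplication":
        # never chain a new duplicate onto an existing duplicate
        return [c for c in candidates if not c.duplicate]
    return list(candidates)  # reimport: duplicates stay eligible as matches

pool = [Candidate(1, duplicate=True), Candidate(2, duplicate=False)]
assert eligible_candidates(pool, "deduplication") == [Candidate(2, duplicate=False)]
assert len(eligible_candidates(pool, "reimport")) == 2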
+ existing_qs = existing_qs.exclude(duplicate=True) + existing_qs = existing_qs.order_by("id") existing_by_hash = {} existing_by_uid = {} @@ -300,8 +367,10 @@ def find_candidates_for_deduplication_uid_or_hash(test, findings): existing_by_hash.setdefault(ef.hash_code, []).append(ef) if ef.unique_id_from_tool is not None: existing_by_uid.setdefault(ef.unique_id_from_tool, []).append(ef) - deduplicationLogger.debug(f"Found {len(existing_by_uid)} existing findings by unique IDs") - deduplicationLogger.debug(f"Found {len(existing_by_hash)} existing findings by hash codes") + + log_msg = "for reimport" if mode == "reimport" else "" + deduplicationLogger.debug(f"Found {len(existing_by_uid)} existing findings by unique IDs {log_msg}") + deduplicationLogger.debug(f"Found {len(existing_by_hash)} existing findings by hash codes {log_msg}") return existing_by_uid, existing_by_hash @@ -328,6 +397,52 @@ def find_candidates_for_deduplication_legacy(test, findings): return by_title, by_cwe +# TODO: should we align this with deduplication? +def find_candidates_for_reimport_legacy(test, findings, service=None): + """ + Find all existing findings in the test that match any of the given findings by title and severity. + Used for batch reimport to avoid 1+N query problem. + Legacy reimport matches by title (case-insensitive), severity, and numerical_severity. + Note: This function is kept separate because legacy reimport has fundamentally different matching logic + than legacy deduplication (title+severity vs title+CWE). + Note: service parameter is kept for backward compatibility but not used since service is in hash_code. + """ + base_queryset = build_candidate_scope_queryset(test, mode="reimport", service=None) + + # Collect all unique title/severity combinations + title_severity_pairs = set() + for finding in findings: + if finding.title: + title_severity_pairs.add(( + finding.title.lower(), # Case-insensitive matching + finding.severity, + Finding.get_numerical_severity(finding.severity), + )) + + if not title_severity_pairs: + return {} + + # Build query to find all matching findings + conditions = Q() + for title_lower, severity, numerical_severity in title_severity_pairs: + conditions |= ( + Q(title__iexact=title_lower) & + Q(severity=severity) & + Q(numerical_severity=numerical_severity) + ) + + existing_qs = base_queryset.filter(conditions).order_by("id") + + # Build dictionary keyed by (title_lower, severity) for quick lookup + existing_by_key = {} + for ef in existing_qs: + key = (ef.title.lower(), ef.severity) + existing_by_key.setdefault(key, []).append(ef) + + deduplicationLogger.debug(f"Found {sum(len(v) for v in existing_by_key.values())} existing findings by legacy matching for reimport") + return existing_by_key + + def _is_candidate_older(new_finding, candidate): # Ensure the newer finding is marked as duplicate of the older finding is_older = candidate.id < new_finding.id diff --git a/dojo/importers/default_reimporter.py b/dojo/importers/default_reimporter.py index 10b3ac7148a..2e426ba2fda 100644 --- a/dojo/importers/default_reimporter.py +++ b/dojo/importers/default_reimporter.py @@ -1,5 +1,6 @@ import logging +from django.conf import settings from django.core.files.uploadedfile import TemporaryUploadedFile from django.core.serializers import serialize from django.db.models.query_utils import Q @@ -7,6 +8,12 @@ import dojo.finding.helper as finding_helper import dojo.jira_link.helper as jira_helper from dojo.decorators import we_want_async +from dojo.finding.deduplication import ( + 
find_candidates_for_deduplication_hash, + find_candidates_for_deduplication_uid_or_hash, + find_candidates_for_deduplication_unique_id, + find_candidates_for_reimport_legacy, +) from dojo.importers.base_importer import BaseImporter, Parser from dojo.importers.options import ImporterOptions from dojo.models import ( @@ -204,86 +211,141 @@ def process_findings( cleaned_findings.append(sanitized) batch_finding_ids: list[int] = [] - batch_max_size = 1000 - - for idx, unsaved_finding in enumerate(cleaned_findings): - is_final = idx == len(cleaned_findings) - 1 - # Some parsers provide "mitigated" field but do not set timezone (because they are probably not available in the report) - # Finding.mitigated is DateTimeField and it requires timezone - if unsaved_finding.mitigated and not unsaved_finding.mitigated.tzinfo: - unsaved_finding.mitigated = unsaved_finding.mitigated.replace(tzinfo=self.now.tzinfo) - # Override the test if needed - if not hasattr(unsaved_finding, "test"): - unsaved_finding.test = self.test - # Set the service supplied at import time - if self.service is not None: - unsaved_finding.service = self.service - # Clean any endpoints that are on the finding - self.endpoint_manager.clean_unsaved_endpoints(unsaved_finding.unsaved_endpoints) - # Calculate the hash code to be used to identify duplicates - unsaved_finding.hash_code = self.calculate_unsaved_finding_hash_code(unsaved_finding) - deduplicationLogger.debug(f"unsaved finding's hash_code: {unsaved_finding.hash_code}") - # Match any findings to this new one coming in - matched_findings = self.match_new_finding_to_existing_finding(unsaved_finding) - deduplicationLogger.debug(f"found {len(matched_findings)} findings matching with current new finding") - # Determine how to proceed based on whether matches were found or not - if matched_findings: - existing_finding = matched_findings[0] - finding, force_continue = self.process_matched_finding( + # Batch size for deduplication/post-processing (only new findings) + dedupe_batch_max_size = getattr(settings, "IMPORT_REIMPORT_DEDUPE_BATCH_SIZE", 1000) + # Batch size for candidate matching (all findings, before matching) + match_batch_max_size = getattr(settings, "IMPORT_REIMPORT_MATCH_BATCH_SIZE", 1000) + + # Process findings in batches to enable batch candidate fetching + # This avoids the 1+N query problem by fetching all candidates for a batch at once + for batch_start in range(0, len(cleaned_findings), match_batch_max_size): + batch_end = min(batch_start + match_batch_max_size, len(cleaned_findings)) + batch_findings = cleaned_findings[batch_start:batch_end] + is_final_batch = batch_end == len(cleaned_findings) + + logger.debug(f"Processing reimport batch {batch_start}-{batch_end} of {len(cleaned_findings)} findings") + + # Prepare findings in batch: set test, service, calculate hash codes + for unsaved_finding in batch_findings: + # Some parsers provide "mitigated" field but do not set timezone (because they are probably not available in the report) + # Finding.mitigated is DateTimeField and it requires timezone + if unsaved_finding.mitigated and not unsaved_finding.mitigated.tzinfo: + unsaved_finding.mitigated = unsaved_finding.mitigated.replace(tzinfo=self.now.tzinfo) + # Override the test if needed + if not hasattr(unsaved_finding, "test"): + unsaved_finding.test = self.test + # Set the service supplied at import time + if self.service is not None: + unsaved_finding.service = self.service + # Clean any endpoints that are on the finding + 
self.endpoint_manager.clean_unsaved_endpoints(unsaved_finding.unsaved_endpoints) + # Calculate the hash code to be used to identify duplicates + unsaved_finding.hash_code = self.calculate_unsaved_finding_hash_code(unsaved_finding) + deduplicationLogger.debug(f"unsaved finding's hash_code: {unsaved_finding.hash_code}") + + # Fetch all candidates for this batch at once (batch candidate finding) + candidates_by_hash = None + candidates_by_uid = None + candidates_by_key = None + + if self.deduplication_algorithm == "hash_code": + candidates_by_hash = find_candidates_for_deduplication_hash( + self.test, batch_findings, mode="reimport", + ) + elif self.deduplication_algorithm == "unique_id_from_tool": + candidates_by_uid = find_candidates_for_deduplication_unique_id( + self.test, batch_findings, mode="reimport", + ) + elif self.deduplication_algorithm == "unique_id_from_tool_or_hash_code": + candidates_by_uid, candidates_by_hash = find_candidates_for_deduplication_uid_or_hash( + self.test, batch_findings, mode="reimport", + ) + elif self.deduplication_algorithm == "legacy": + candidates_by_key = find_candidates_for_reimport_legacy(self.test, batch_findings) + + # Process each finding in the batch using pre-fetched candidates + for idx, unsaved_finding in enumerate(batch_findings): + is_final = is_final_batch and idx == len(batch_findings) - 1 + # Match any findings to this new one coming in using pre-fetched candidates + matched_findings = self.match_finding_for_reimport( unsaved_finding, - existing_finding, + candidates_by_hash=candidates_by_hash, + candidates_by_uid=candidates_by_uid, + candidates_by_key=candidates_by_key, ) - # Determine if we should skip the rest of the loop - if force_continue: - continue - # Update endpoints on the existing finding with those on the new finding - if finding.dynamic_finding: - logger.debug( - "Re-import found an existing dynamic finding for this new " - "finding. 
Checking the status of endpoints", - ) - self.endpoint_manager.update_endpoint_status( - existing_finding, + deduplicationLogger.debug(f"found {len(matched_findings)} findings matching with current new finding") + # Determine how to proceed based on whether matches were found or not + if matched_findings: + existing_finding = matched_findings[0] + finding, force_continue = self.process_matched_finding( unsaved_finding, - self.user, + existing_finding, ) - else: - finding = self.process_finding_that_was_not_matched(unsaved_finding) - # This condition __appears__ to always be true, but am afraid to remove it - if finding: - # Process the rest of the items on the finding - finding = self.finding_post_processing( - finding, - unsaved_finding, - ) - # all data is already saved on the finding, we only need to trigger post processing in batches - push_to_jira = self.push_to_jira and (not self.findings_groups_enabled or not self.group_by) - batch_finding_ids.append(finding.id) - - # If batch is full or we're at the end, dispatch one batched task - if len(batch_finding_ids) >= batch_max_size or is_final: - finding_ids_batch = list(batch_finding_ids) - batch_finding_ids.clear() - if we_want_async(async_user=self.user): - finding_helper.post_process_findings_batch_signature( - finding_ids_batch, - dedupe_option=True, - rules_option=True, - product_grading_option=True, - issue_updater_option=True, - push_to_jira=push_to_jira, - )() - else: - finding_helper.post_process_findings_batch( - finding_ids_batch, - dedupe_option=True, - rules_option=True, - product_grading_option=True, - issue_updater_option=True, - push_to_jira=push_to_jira, + # Determine if we should skip the rest of the loop + if force_continue: + continue + # Update endpoints on the existing finding with those on the new finding + if finding.dynamic_finding: + logger.debug( + "Re-import found an existing dynamic finding for this new " + "finding. Checking the status of endpoints", ) - - # No chord: tasks are dispatched immediately above per batch + self.endpoint_manager.update_endpoint_status( + existing_finding, + unsaved_finding, + self.user, + ) + else: + finding = self.process_finding_that_was_not_matched(unsaved_finding) + # This condition __appears__ to always be true, but am afraid to remove it + if finding: + # Process the rest of the items on the finding + finding = self.finding_post_processing( + finding, + unsaved_finding, + ) + # all data is already saved on the finding, we only need to trigger post processing in batches + push_to_jira = self.push_to_jira and (not self.findings_groups_enabled or not self.group_by) + batch_finding_ids.append(finding.id) + + # Post-processing batches (deduplication, rules, etc.) are separate from matching batches. + # These batches only contain "new" findings that were saved (not matched to existing findings). + # In reimport scenarios, typically most findings match existing ones, so only a small fraction + # of findings in each matching batch become new findings that need deduplication. + # + # We accumulate finding IDs across matching batches rather than dispatching at the end of each + # matching batch. This ensures deduplication batches stay close to the intended batch size + # (e.g., 1000 findings) for optimal bulk operation efficiency, even when only ~10% of findings + # in matching batches are new. If we dispatched at the end of each matching batch, we would + # end up with many small deduplication batches (e.g., ~100 findings each), reducing efficiency. 
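To make the two-level batching described above concrete, here is an editor's standalone sketch of the accumulation pattern (process_in_two_batch_levels and dispatch are illustrative names, not the importer's real API): candidate fetching happens once per matching batch, while post-processing IDs keep accumulating across matching batches and are only flushed once the dedupe batch size is reached or the input is exhausted.

def process_in_two_batch_levels(findings, match_batch_size=1000, dedupe_batch_size=1000, dispatch=print):
    pending_ids = []  # accumulates across matching batches
    for start in range(0, len(findings), match_batch_size):
        batch = findings[start:start + match_batch_size]
        is_final = start + match_batch_size >= len(findings)
        # one candidate query per matching batch instead of one per finding (omitted here)
        pending_ids.extend(f["id"] for f in batch if f["is_new"])
        if len(pending_ids) >= dedupe_batch_size or is_final:
            dispatch(list(pending_ids))  # post-processing task receives a reasonably full batch
            pending_ids.clear()

# 2500 findings of which ~10% are new: one dispatch of 250 ids instead of three tiny batches
process_in_two_batch_levels(
    [{"id": i, "is_new": i % 10 == 0} for i in range(2500)],
    dispatch=lambda ids: print(len(ids)),
)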
+ # + # The two batch types serve different purposes: + # - Matching batches: optimize candidate fetching (solve 1+N query problem) + # - Deduplication batches: optimize bulk operations (larger batches = fewer queries) + # They don't need to be aligned since they optimize different operations. + if len(batch_finding_ids) >= dedupe_batch_max_size or is_final: + finding_ids_batch = list(batch_finding_ids) + batch_finding_ids.clear() + if we_want_async(async_user=self.user): + finding_helper.post_process_findings_batch_signature( + finding_ids_batch, + dedupe_option=True, + rules_option=True, + product_grading_option=True, + issue_updater_option=True, + push_to_jira=push_to_jira, + )() + else: + finding_helper.post_process_findings_batch( + finding_ids_batch, + dedupe_option=True, + rules_option=True, + product_grading_option=True, + issue_updater_option=True, + push_to_jira=push_to_jira, + ) + + # No chord: tasks are dispatched immediately above per batch self.to_mitigate = (set(self.original_items) - set(self.reactivated_items) - set(self.unchanged_items)) # due to #3958 we can have duplicates inside the same report @@ -418,6 +480,81 @@ def match_new_finding_to_existing_finding( logger.error(f'Internal error: unexpected deduplication_algorithm: "{self.deduplication_algorithm}"') return None + def match_finding_for_reimport( + self, + unsaved_finding: Finding, + candidates_by_hash: dict | None = None, + candidates_by_uid: dict | None = None, + candidates_by_key: dict | None = None, + ) -> list[Finding]: + """ + Matches a single new finding to existing findings using pre-fetched candidate dictionaries. + This avoids individual database queries by using batch-fetched candidates. + + Args: + unsaved_finding: The finding to match + candidates_by_hash: Dictionary mapping hash_code to list of findings (for hash_code algorithm) + candidates_by_uid: Dictionary mapping unique_id_from_tool to list of findings (for unique_id algorithms) + candidates_by_key: Dictionary mapping (title_lower, severity) to list of findings (for legacy algorithm) + + Returns: + List of matching findings, ordered by id + + """ + deduplicationLogger.debug("matching finding for reimport using algorithm: %s", self.deduplication_algorithm) + + if self.deduplication_algorithm == "hash_code": + if candidates_by_hash is None: + # Fallback to individual query if candidates not provided + return self.match_new_finding_to_existing_finding(unsaved_finding) + if unsaved_finding.hash_code is None: + return [] + matches = candidates_by_hash.get(unsaved_finding.hash_code, []) + return sorted(matches, key=lambda f: f.id) + + if self.deduplication_algorithm == "unique_id_from_tool": + if candidates_by_uid is None: + # Fallback to individual query if candidates not provided + return self.match_new_finding_to_existing_finding(unsaved_finding) + if unsaved_finding.unique_id_from_tool is None: + return [] + matches = candidates_by_uid.get(unsaved_finding.unique_id_from_tool, []) + return sorted(matches, key=lambda f: f.id) + + if self.deduplication_algorithm == "unique_id_from_tool_or_hash_code": + if candidates_by_hash is None or candidates_by_uid is None: + # Fallback to individual query if candidates not provided + return self.match_new_finding_to_existing_finding(unsaved_finding) + + # Collect matches from both hash_code and unique_id_from_tool + matches_by_id = {} + + if unsaved_finding.hash_code is not None: + hash_matches = candidates_by_hash.get(unsaved_finding.hash_code, []) + for match in hash_matches: + matches_by_id[match.id] = match 
+ + if unsaved_finding.unique_id_from_tool is not None: + uid_matches = candidates_by_uid.get(unsaved_finding.unique_id_from_tool, []) + for match in uid_matches: + matches_by_id[match.id] = match + + matches = list(matches_by_id.values()) + return sorted(matches, key=lambda f: f.id) + + if self.deduplication_algorithm == "legacy": + if candidates_by_key is None: + # Fallback to individual query if candidates not provided + return self.match_new_finding_to_existing_finding(unsaved_finding) + if not unsaved_finding.title: + return [] + key = (unsaved_finding.title.lower(), unsaved_finding.severity) + matches = candidates_by_key.get(key, []) + return sorted(matches, key=lambda f: f.id) + + logger.error(f'Internal error: unexpected deduplication_algorithm: "{self.deduplication_algorithm}"') + return [] + def process_matched_finding( self, unsaved_finding: Finding, diff --git a/dojo/management/commands/list_top_tests.py b/dojo/management/commands/list_top_tests.py new file mode 100644 index 00000000000..839a1dc03fd --- /dev/null +++ b/dojo/management/commands/list_top_tests.py @@ -0,0 +1,111 @@ +from django.core.management.base import BaseCommand +from django.db.models import Count, Q + +from dojo.models import Test + + +class Command(BaseCommand): + help = "List the top 25 tests with the most findings" + + def add_arguments(self, parser): + parser.add_argument( + "--limit", + type=int, + default=25, + help="Number of tests to display (default: 25)", + ) + + def handle(self, *args, **options): + limit = options["limit"] + + # Annotate tests with finding counts + tests = ( + Test.objects.annotate( + total_findings=Count("finding", distinct=True), + active_findings=Count("finding", filter=Q(finding__active=True), distinct=True), + duplicate_findings=Count("finding", filter=Q(finding__duplicate=True), distinct=True), + ) + .filter(total_findings__gt=0) + .select_related("engagement", "engagement__product", "test_type") + .order_by("-total_findings")[:limit] + ) + + if not tests: + self.stdout.write(self.style.WARNING("No tests with findings found.")) + return + + # Calculate column widths + max_product_len = max( + (len(str(test.engagement.product.name)) for test in tests), + default=20, + ) + max_engagement_len = max( + (len(str(test.engagement.name)) for test in tests), + default=20, + ) + max_test_len = max( + (len(str(test.title or test.id)) for test in tests), + default=20, + ) + max_test_type_len = max( + (len(str(test.test_type.name)) for test in tests), + default=20, + ) + max_dedup_algo_len = max( + (len(str(test.deduplication_algorithm)) for test in tests), + default=20, + ) + + # Ensure minimum widths for readability + max_product_len = max(max_product_len, 20) + max_engagement_len = max(max_engagement_len, 20) + max_test_len = max(max_test_len, 20) + max_test_type_len = max(max_test_type_len, 20) + max_dedup_algo_len = max(max_dedup_algo_len, 20) + + # Header + header = ( + f"{'Product':<{max_product_len}} | " + f"{'Engagement':<{max_engagement_len}} | " + f"{'Test':<{max_test_len}} | " + f"{'Test Type':<{max_test_type_len}} | " + f"{'Dedup Algorithm':<{max_dedup_algo_len}} | " + f"{'Total':>8} | " + f"{'Active':>8} | " + f"{'Duplicate':>10}" + ) + separator = "-" * len(header) + + self.stdout.write(self.style.SUCCESS(header)) + self.stdout.write(separator) + + # Data rows + for test in tests: + product_name = str(test.engagement.product.name) + engagement_name = str(test.engagement.name) + test_name = str(test.title or f"Test #{test.id}") + test_type_name = str(test.test_type.name) + 
dedup_algo = str(test.deduplication_algorithm) + total = test.total_findings + active = test.active_findings + duplicate = test.duplicate_findings + + row = ( + f"{product_name:<{max_product_len}} | " + f"{engagement_name:<{max_engagement_len}} | " + f"{test_name:<{max_test_len}} | " + f"{test_type_name:<{max_test_type_len}} | " + f"{dedup_algo:<{max_dedup_algo_len}} | " + f"{total:>8} | " + f"{active:>8} | " + f"{duplicate:>10}" + ) + self.stdout.write(row) + + # Summary + self.stdout.write(separator) + self.stdout.write( + self.style.SUCCESS( + f"\nDisplayed top {len(tests)} tests with findings.", + ), + ) diff --git a/dojo/settings/settings.dist.py b/dojo/settings/settings.dist.py index 2ef1835b08e..ae96077f07a 100644 --- a/dojo/settings/settings.dist.py +++ b/dojo/settings/settings.dist.py @@ -283,6 +283,8 @@ DD_EDITABLE_MITIGATED_DATA=(bool, False), # new feature that tracks history across multiple reimports for the same test DD_TRACK_IMPORT_HISTORY=(bool, True), + # Batch size for reimport candidate matching (finding existing findings) + DD_IMPORT_REIMPORT_MATCH_BATCH_SIZE=(int, 1000), # Batch size for import/reimport deduplication processing DD_IMPORT_REIMPORT_DEDUPE_BATCH_SIZE=(int, 1000), # Delete Auditlogs older than x month; -1 to keep all logs @@ -1710,6 +1712,7 @@ def saml2_attrib_map_format(din): DISABLE_FINDING_MERGE = env("DD_DISABLE_FINDING_MERGE") TRACK_IMPORT_HISTORY = env("DD_TRACK_IMPORT_HISTORY") +IMPORT_REIMPORT_MATCH_BATCH_SIZE = env("DD_IMPORT_REIMPORT_MATCH_BATCH_SIZE") IMPORT_REIMPORT_DEDUPE_BATCH_SIZE = env("DD_IMPORT_REIMPORT_DEDUPE_BATCH_SIZE") # ------------------------------------------------------------------------------ diff --git a/unittests/test_import_reimport.py b/unittests/test_import_reimport.py index 2f71c720e02..dbf206b808d 100644 --- a/unittests/test_import_reimport.py +++ b/unittests/test_import_reimport.py @@ -11,7 +11,7 @@ from django.urls import reverse from django.utils import timezone -from dojo.models import Finding, Test, Test_Type, User +from dojo.models import Engagement, Finding, Product, Product_Type, Test, Test_Type, User from .dojo_test_case import DojoAPITestCase, get_unit_tests_scans_path from .test_utils import assertTestImportModelsCreated @@ -108,6 +108,9 @@ def __init__(self, *args, **kwargs): self.checkmarx_one_two_false_positive = get_unit_tests_scans_path("checkmarx_one") / "two-false-positive.json" self.scan_type_checkmarx_one = "Checkmarx One Scan" + self.bandit_large_file = get_unit_tests_scans_path("bandit") / "many_vulns.json" + self.scan_type_bandit = "Bandit Scan" + # import zap scan, testing: # - import # - active/verifed = True @@ -2033,6 +2036,318 @@ def test_reimport_set_scan_date_parser_sets_date(self): date = findings["results"][0]["date"] self.assertEqual(date, "2006-12-26") + @override_settings( + IMPORT_REIMPORT_DEDUPE_BATCH_SIZE=200, + IMPORT_REIMPORT_MATCH_BATCH_SIZE=200, + ) + def test_batch_reimport_large_bandit_file(self): + """ + Test that batch reimport produces identical results to non-batch mode (which we simulate via large max batch size setting). 
+ + Step 1: Import scan (baseline), assess active count, duplicate count + Step 2: Reimport scan (same test), assess active count, duplicate count + Step 3: Import scan in NEW engagement with batch_size=50, assess active and duplicate equal to step 1 + Step 4: Reimport scan in same new engagement (batch_size=50), assess active and duplicate equal to step 2 + """ + # Step 1: Baseline import (default batch size) + # Create engagement first and set deduplication_on_engagement before import + product_type1, _ = Product_Type.objects.get_or_create(name="PT Bandit Baseline") + product1, _ = Product.objects.get_or_create(name="P Bandit Baseline", prod_type=product_type1) + engagement1 = Engagement.objects.create( + name="E Bandit Baseline", + product=product1, + target_start=timezone.now(), + target_end=timezone.now(), + ) + engagement1.deduplication_on_engagement = True + engagement1.save() + engagement1_id = engagement1.id + + import1 = self.import_scan_with_params( + self.bandit_large_file, + scan_type=self.scan_type_bandit, + engagement=engagement1_id, + product_type_name=None, + product_name=None, + engagement_name=None, + auto_create_context=False, + ) + test1_id = import1["test"] + + # Assess step 1: active count, duplicate count + step1_active = Finding.objects.filter( + test__engagement_id=engagement1_id, + active=True, + duplicate=False, + ).count() + step1_duplicate = Finding.objects.filter( + test__engagement_id=engagement1_id, + duplicate=True, + ).count() + + # Assert step 1 specific counts + self.assertEqual(step1_active, 213, "Step 1 active count") + self.assertEqual(step1_duplicate, 1, "Step 1 duplicate count") + + # Step 2: Reimport scan (same test) + self.reimport_scan_with_params( + test1_id, + self.bandit_large_file, + scan_type=self.scan_type_bandit, + ) + + # Assess step 2: active count, duplicate count + step2_active = Finding.objects.filter( + test__engagement_id=engagement1_id, + active=True, + duplicate=False, + ).count() + step2_duplicate = Finding.objects.filter( + test__engagement_id=engagement1_id, + duplicate=True, + ).count() + + # Assert step 2 specific counts + self.assertEqual(step2_active, 213, "Step 2 active count") + self.assertEqual(step2_duplicate, 1, "Step 2 duplicate count") + + # Step 3: Import scan in NEW engagement with batch_size=50 + with override_settings( + IMPORT_REIMPORT_DEDUPE_BATCH_SIZE=50, + IMPORT_REIMPORT_MATCH_BATCH_SIZE=50, + ): + # Create engagement first and set deduplication_on_engagement before import + product_type2, _ = Product_Type.objects.get_or_create(name="PT Bandit Batch") + product2, _ = Product.objects.get_or_create(name="P Bandit Batch", prod_type=product_type2) + engagement2 = Engagement.objects.create( + name="E Bandit Batch", + product=product2, + target_start=timezone.now(), + target_end=timezone.now(), + ) + engagement2.deduplication_on_engagement = True + engagement2.save() + engagement2_id = engagement2.id + + import2 = self.import_scan_with_params( + self.bandit_large_file, + scan_type=self.scan_type_bandit, + engagement=engagement2_id, + product_type_name=None, + product_name=None, + engagement_name=None, + auto_create_context=False, + ) + test2_id = import2["test"] + + # Assess step 3: active and duplicate should equal step 1 + step3_active = Finding.objects.filter( + test__engagement_id=engagement2_id, + active=True, + duplicate=False, + ).count() + step3_duplicate = Finding.objects.filter( + test__engagement_id=engagement2_id, + duplicate=True, + ).count() + + self.assertEqual( + step3_active, + step1_active, + 
"Step 3 active count should equal step 1 (baseline import)", + ) + self.assertEqual( + step3_duplicate, + step1_duplicate, + "Step 3 duplicate count should equal step 1 (baseline import)", + ) + + # Step 4: Reimport scan in same new engagement (batch_size=50) + self.reimport_scan_with_params( + test2_id, + self.bandit_large_file, + scan_type=self.scan_type_bandit, + ) + + # Assess step 4: active and duplicate should equal step 2 + step4_active = Finding.objects.filter( + test__engagement_id=engagement2_id, + active=True, + duplicate=False, + ).count() + step4_duplicate = Finding.objects.filter( + test__engagement_id=engagement2_id, + duplicate=True, + ).count() + + self.assertEqual( + step4_active, + step2_active, + "Step 4 active count should equal step 2 (baseline reimport)", + ) + self.assertEqual( + step4_duplicate, + step2_duplicate, + "Step 4 duplicate count should equal step 2 (baseline reimport)", + ) + + def test_batch_deduplication_large_bandit_file(self): + """ + Test that batch deduplication produces identical results to non-batch mode (which we simulate via large max batch size setting). + + Step 1: Import scan (baseline), assess active count, duplicate count + Step 2: Import scan again (same engagement), assess active count, duplicate count + Step 3: Import scan in NEW engagement with batch_size=50, assess active and duplicate equal to step 1 + Step 4: Import scan again in same new engagement (batch_size=50), assess active and duplicate equal to step 2 + """ + # Step 1: Baseline import (default batch size) + # Create engagement first and set deduplication_on_engagement before import + product_type1, _ = Product_Type.objects.get_or_create(name="PT Bandit Baseline Dedupe") + product1, _ = Product.objects.get_or_create(name="P Bandit Baseline Dedupe", prod_type=product_type1) + engagement1 = Engagement.objects.create( + name="E Bandit Baseline Dedupe", + product=product1, + target_start=timezone.now(), + target_end=timezone.now(), + ) + engagement1.deduplication_on_engagement = True + engagement1.save() + engagement1_id = engagement1.id + + self.import_scan_with_params( + self.bandit_large_file, + scan_type=self.scan_type_bandit, + engagement=engagement1_id, + product_type_name=None, + product_name=None, + engagement_name=None, + auto_create_context=False, + ) + + # Assess step 1: active count, duplicate count + step1_active = Finding.objects.filter( + test__engagement_id=engagement1_id, + active=True, + duplicate=False, + ).count() + step1_duplicate = Finding.objects.filter( + test__engagement_id=engagement1_id, + duplicate=True, + ).count() + + # Assert step 1 specific counts + self.assertEqual(step1_active, 213, "Step 1 active count") + self.assertEqual(step1_duplicate, 1, "Step 1 duplicate count") + + # Step 2: Import scan again (same engagement) + self.import_scan_with_params( + self.bandit_large_file, + scan_type=self.scan_type_bandit, + engagement=engagement1_id, + product_type_name=None, + product_name=None, + engagement_name=None, + auto_create_context=False, + ) + + # Assess step 2: active count, duplicate count + step2_active = Finding.objects.filter( + test__engagement_id=engagement1_id, + active=True, + duplicate=False, + ).count() + step2_duplicate = Finding.objects.filter( + test__engagement_id=engagement1_id, + duplicate=True, + ).count() + + # Assert step 2 specific counts + self.assertEqual(step2_active, 213, "Step 2 active count") + self.assertEqual(step2_duplicate, 215, "Step 2 duplicate count") + + # Step 3: Import scan in NEW engagement with batch_size=50 + 
with override_settings( + IMPORT_REIMPORT_DEDUPE_BATCH_SIZE=50, + IMPORT_REIMPORT_MATCH_BATCH_SIZE=50, + ): + # Create engagement first and set deduplication_on_engagement before import + product_type2, _ = Product_Type.objects.get_or_create(name="PT Bandit Batch Dedupe") + product2, _ = Product.objects.get_or_create(name="P Bandit Batch Dedupe", prod_type=product_type2) + engagement2 = Engagement.objects.create( + name="E Bandit Batch Dedupe", + product=product2, + target_start=timezone.now(), + target_end=timezone.now(), + ) + engagement2.deduplication_on_engagement = True + engagement2.save() + engagement2_id = engagement2.id + + self.import_scan_with_params( + self.bandit_large_file, + scan_type=self.scan_type_bandit, + engagement=engagement2_id, + product_type_name=None, + product_name=None, + engagement_name=None, + auto_create_context=False, + ) + + # Assess step 3: active and duplicate should equal step 1 + step3_active = Finding.objects.filter( + test__engagement_id=engagement2_id, + active=True, + duplicate=False, + ).count() + step3_duplicate = Finding.objects.filter( + test__engagement_id=engagement2_id, + duplicate=True, + ).count() + + self.assertEqual( + step3_active, + step1_active, + "Step 3 active count should equal step 1 (baseline import)", + ) + self.assertEqual( + step3_duplicate, + step1_duplicate, + "Step 3 duplicate count should equal step 1 (baseline import)", + ) + + # Step 4: Import scan again in same new engagement (batch_size=50) + self.import_scan_with_params( + self.bandit_large_file, + scan_type=self.scan_type_bandit, + engagement=engagement2_id, + product_type_name=None, + product_name=None, + engagement_name=None, + auto_create_context=False, + ) + + # Assess step 4: active and duplicate should equal step 2 + step4_active = Finding.objects.filter( + test__engagement_id=engagement2_id, + active=True, + duplicate=False, + ).count() + step4_duplicate = Finding.objects.filter( + test__engagement_id=engagement2_id, + duplicate=True, + ).count() + + self.assertEqual( + step4_active, + step2_active, + "Step 4 active count should equal step 2 (baseline second import)", + ) + self.assertEqual( + step4_duplicate, + step2_duplicate, + "Step 4 duplicate count should equal step 2 (baseline second import)", + ) + class ImportReimportTestUI(DojoAPITestCase, ImportReimportMixin): fixtures = ["dojo_testdata.json"] diff --git a/unittests/test_importers_performance.py b/unittests/test_importers_performance.py index 9da777ccecc..1bf1ab87795 100644 --- a/unittests/test_importers_performance.py +++ b/unittests/test_importers_performance.py @@ -33,7 +33,9 @@ STACK_HAWK_SCAN_TYPE = "StackHawk HawkScan" -class TestDojoImporterPerformance(DojoTestCase): +class TestDojoImporterPerformanceBase(DojoTestCase): + + """Base class for performance tests with shared setup and helper methods.""" def setUp(self): super().setUp() @@ -77,35 +79,53 @@ def _assertNumAsyncTask(self, num): ) logger.debug(msg) - def _import_reimport_performance(self, expected_num_queries1, expected_num_async_tasks1, expected_num_queries2, expected_num_async_tasks2, expected_num_queries3, expected_num_async_tasks3): - """ - Log output can be quite large as when the assertNumQueries fails, all queries are printed. - It could be usefule to capture the output in `less`: - ./run-unittest.sh --test-case unittests.test_importers_performance.TestDojoImporterPerformance 2>&1 | less - Then search for `expected` to find the lines where the expected number of queries is printed. 
- Or you can use `grep` to filter the output: - ./run-unittest.sh --test-case unittests.test_importers_performance.TestDojoImporterPerformance 2>&1 | grep expected -B 10 - """ + def _create_test_objects(self, product_name, engagement_name): + """Helper method to create test product, engagement, lead user, and environment.""" product_type, _created = Product_Type.objects.get_or_create(name="test") product, _created = Product.objects.get_or_create( - name="TestDojoDefaultImporter", + name=product_name, prod_type=product_type, ) engagement, _created = Engagement.objects.get_or_create( - name="Test Create Engagement", + name=engagement_name, product=product, target_start=timezone.now(), target_end=timezone.now(), ) lead, _ = User.objects.get_or_create(username="admin") environment, _ = Development_Environment.objects.get_or_create(name="Development") + return product, engagement, lead, environment + + def _import_reimport_performance( + self, + expected_num_queries1, + expected_num_async_tasks1, + expected_num_queries2, + expected_num_async_tasks2, + expected_num_queries3, + expected_num_async_tasks3, + scan_file1, + scan_file2, + scan_file3, + scan_type, + product_name, + engagement_name, + ): + """ + Test import/reimport/reimport performance with specified scan files and scan type. + Log output can be quite large as when the assertNumQueries fails, all queries are printed. + """ + _, engagement, lead, environment = self._create_test_objects( + product_name, + engagement_name, + ) - # first import the subset which missed one finding and a couple of endpoints on some of the findings + # First import with ( self.subTest("import1"), impersonate(Dojo_User.objects.get(username="admin")), self.assertNumQueries(expected_num_queries1), self._assertNumAsyncTask(expected_num_async_tasks1), - STACK_HAWK_SUBSET_FILENAME.open(encoding="utf-8") as scan, + scan_file1.open(encoding="utf-8") as scan, ): import_options = { "user": lead, @@ -116,7 +136,7 @@ def _import_reimport_performance(self, expected_num_queries1, expected_num_async "active": True, "verified": True, "sync": True, - "scan_type": STACK_HAWK_SCAN_TYPE, + "scan_type": scan_type, "engagement": engagement, "tags": ["performance-test", "tag-in-param", "go-faster"], "apply_tags_to_findings": True, @@ -124,12 +144,12 @@ def _import_reimport_performance(self, expected_num_queries1, expected_num_async importer = DefaultImporter(**import_options) test, _, _len_new_findings, _len_closed_findings, _, _, _ = importer.process_scan(scan) - # use reimport with the full report so it add a finding and some endpoints + # Second import (reimport) with ( self.subTest("reimport1"), impersonate(Dojo_User.objects.get(username="admin")), self.assertNumQueries(expected_num_queries2), self._assertNumAsyncTask(expected_num_async_tasks2), - STACK_HAWK_FILENAME.open(encoding="utf-8") as scan, + scan_file2.open(encoding="utf-8") as scan, ): reimport_options = { "test": test, @@ -140,19 +160,19 @@ def _import_reimport_performance(self, expected_num_queries1, expected_num_async "active": True, "verified": True, "sync": True, - "scan_type": STACK_HAWK_SCAN_TYPE, + "scan_type": scan_type, "tags": ["performance-test-reimport", "reimport-tag-in-param", "reimport-go-faster"], "apply_tags_to_findings": True, } reimporter = DefaultReImporter(**reimport_options) test, _, _len_new_findings, _len_closed_findings, _, _, _ = reimporter.process_scan(scan) - # use reimport with the subset again to close a finding and mitigate some endpoints + # Third import (reimport again) with ( 
self.subTest("reimport2"), impersonate(Dojo_User.objects.get(username="admin")), self.assertNumQueries(expected_num_queries3), self._assertNumAsyncTask(expected_num_async_tasks3), - STACK_HAWK_SUBSET_FILENAME.open(encoding="utf-8") as scan, + scan_file3.open(encoding="utf-8") as scan, ): reimport_options = { "test": test, @@ -163,11 +183,40 @@ def _import_reimport_performance(self, expected_num_queries1, expected_num_async "active": True, "verified": True, "sync": True, - "scan_type": STACK_HAWK_SCAN_TYPE, + "scan_type": scan_type, } reimporter = DefaultReImporter(**reimport_options) test, _, _len_new_findings, _len_closed_findings, _, _, _ = reimporter.process_scan(scan) + +class TestDojoImporterPerformanceSmall(TestDojoImporterPerformanceBase): + + """Performance tests using small sample files (StackHawk, ~6 findings).""" + + def _import_reimport_performance(self, expected_num_queries1, expected_num_async_tasks1, expected_num_queries2, expected_num_async_tasks2, expected_num_queries3, expected_num_async_tasks3): + """ + Log output can be quite large as when the assertNumQueries fails, all queries are printed. + It could be usefule to capture the output in `less`: + ./run-unittest.sh --test-case unittests.test_importers_performance.TestDojoImporterPerformanceSmall 2>&1 | less + Then search for `expected` to find the lines where the expected number of queries is printed. + Or you can use `grep` to filter the output: + ./run-unittest.sh --test-case unittests.test_importers_performance.TestDojoImporterPerformanceSmall 2>&1 | grep expected -B 10 + """ + return super()._import_reimport_performance( + expected_num_queries1, + expected_num_async_tasks1, + expected_num_queries2, + expected_num_async_tasks2, + expected_num_queries3, + expected_num_async_tasks3, + scan_file1=STACK_HAWK_SUBSET_FILENAME, + scan_file2=STACK_HAWK_FILENAME, + scan_file3=STACK_HAWK_SUBSET_FILENAME, + scan_type=STACK_HAWK_SCAN_TYPE, + product_name="TestDojoDefaultImporter", + engagement_name="Test Create Engagement", + ) + @override_settings(ENABLE_AUDITLOG=True, AUDITLOG_TYPE="django-auditlog") def test_import_reimport_reimport_performance_async(self): # Ensure django-auditlog is properly configured for this test @@ -177,9 +226,9 @@ def test_import_reimport_reimport_performance_async(self): self._import_reimport_performance( expected_num_queries1=340, expected_num_async_tasks1=7, - expected_num_queries2=288, + expected_num_queries2=274, expected_num_async_tasks2=18, - expected_num_queries3=175, + expected_num_queries3=162, expected_num_async_tasks3=17, ) @@ -195,9 +244,9 @@ def test_import_reimport_reimport_performance_pghistory_async(self): self._import_reimport_performance( expected_num_queries1=306, expected_num_async_tasks1=7, - expected_num_queries2=281, + expected_num_queries2=267, expected_num_async_tasks2=18, - expected_num_queries3=170, + expected_num_queries3=157, expected_num_async_tasks3=17, ) @@ -219,9 +268,9 @@ def test_import_reimport_reimport_performance_no_async(self): self._import_reimport_performance( expected_num_queries1=345, expected_num_async_tasks1=6, - expected_num_queries2=293, + expected_num_queries2=279, expected_num_async_tasks2=17, - expected_num_queries3=180, + expected_num_queries3=167, expected_num_async_tasks3=16, ) @@ -241,9 +290,9 @@ def test_import_reimport_reimport_performance_pghistory_no_async(self): self._import_reimport_performance( expected_num_queries1=311, expected_num_async_tasks1=6, - expected_num_queries2=286, + expected_num_queries2=272, expected_num_async_tasks2=17, - 
expected_num_queries3=175, + expected_num_queries3=162, expected_num_async_tasks3=16, ) @@ -267,9 +316,9 @@ def test_import_reimport_reimport_performance_no_async_with_product_grading(self self._import_reimport_performance( expected_num_queries1=347, expected_num_async_tasks1=8, - expected_num_queries2=295, + expected_num_queries2=281, expected_num_async_tasks2=19, - expected_num_queries3=182, + expected_num_queries3=169, expected_num_async_tasks3=18, ) @@ -290,9 +339,9 @@ def test_import_reimport_reimport_performance_pghistory_no_async_with_product_gr self._import_reimport_performance( expected_num_queries1=313, expected_num_async_tasks1=8, - expected_num_queries2=288, + expected_num_queries2=274, expected_num_async_tasks2=19, - expected_num_queries3=177, + expected_num_queries3=164, expected_num_async_tasks3=18, ) @@ -303,19 +352,10 @@ def _deduplication_performance(self, expected_num_queries1, expected_num_async_t The second import should result in all findings being marked as duplicates. This is different from reimport as we create a new test each time. """ - product_type, _created = Product_Type.objects.get_or_create(name="test") - product, _created = Product.objects.get_or_create( - name="TestDojoDeduplicationPerformance", - prod_type=product_type, - ) - engagement, _created = Engagement.objects.get_or_create( - name="Test Deduplication Performance Engagement", - product=product, - target_start=timezone.now(), - target_end=timezone.now(), + _, engagement, lead, environment = self._create_test_objects( + "TestDojoDeduplicationPerformance", + "Test Deduplication Performance Engagement", ) - lead, _ = User.objects.get_or_create(username="admin") - environment, _ = Development_Environment.objects.get_or_create(name="Development") # First import - all findings should be new with ( From c1f800470b8e2199a7ab78a4de57af45d64d33f2 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sat, 13 Dec 2025 12:38:26 +0100 Subject: [PATCH 03/26] tests: always show all mismatched counts --- unittests/test_import_reimport.py | 118 ++++++++------ unittests/test_importers_performance.py | 200 ++++++++++++++---------- 2 files changed, 189 insertions(+), 129 deletions(-) diff --git a/unittests/test_import_reimport.py b/unittests/test_import_reimport.py index dbf206b808d..2ac71c155bb 100644 --- a/unittests/test_import_reimport.py +++ b/unittests/test_import_reimport.py @@ -2086,8 +2086,13 @@ def test_batch_reimport_large_bandit_file(self): ).count() # Assert step 1 specific counts - self.assertEqual(step1_active, 213, "Step 1 active count") - self.assertEqual(step1_duplicate, 1, "Step 1 duplicate count") + # Each assertion is wrapped in its own subTest so that if one fails, the others still run. + # This allows us to see all count mismatches in a single test run, making it easier to fix + # all incorrect expected values at once rather than fixing them one at a time. + with self.subTest(step=1, metric="active"): + self.assertEqual(step1_active, 213, "Step 1 active count") + with self.subTest(step=1, metric="duplicate"): + self.assertEqual(step1_duplicate, 1, "Step 1 duplicate count") # Step 2: Reimport scan (same test) self.reimport_scan_with_params( @@ -2108,8 +2113,13 @@ def test_batch_reimport_large_bandit_file(self): ).count() # Assert step 2 specific counts - self.assertEqual(step2_active, 213, "Step 2 active count") - self.assertEqual(step2_duplicate, 1, "Step 2 duplicate count") + # Each assertion is wrapped in its own subTest so that if one fails, the others still run. 
+ # This allows us to see all count mismatches in a single test run, making it easier to fix + # all incorrect expected values at once rather than fixing them one at a time. + with self.subTest(step=2, metric="active"): + self.assertEqual(step2_active, 213, "Step 2 active count") + with self.subTest(step=2, metric="duplicate"): + self.assertEqual(step2_duplicate, 1, "Step 2 duplicate count") # Step 3: Import scan in NEW engagement with batch_size=50 with override_settings( @@ -2151,16 +2161,21 @@ def test_batch_reimport_large_bandit_file(self): duplicate=True, ).count() - self.assertEqual( - step3_active, - step1_active, - "Step 3 active count should equal step 1 (baseline import)", - ) - self.assertEqual( - step3_duplicate, - step1_duplicate, - "Step 3 duplicate count should equal step 1 (baseline import)", - ) + # Each assertion is wrapped in its own subTest so that if one fails, the others still run. + # This allows us to see all count mismatches in a single test run, making it easier to fix + # all incorrect expected values at once rather than fixing them one at a time. + with self.subTest(step=3, metric="active"): + self.assertEqual( + step3_active, + step1_active, + "Step 3 active count should equal step 1 (baseline import)", + ) + with self.subTest(step=3, metric="duplicate"): + self.assertEqual( + step3_duplicate, + step1_duplicate, + "Step 3 duplicate count should equal step 1 (baseline import)", + ) # Step 4: Reimport scan in same new engagement (batch_size=50) self.reimport_scan_with_params( @@ -2180,16 +2195,21 @@ def test_batch_reimport_large_bandit_file(self): duplicate=True, ).count() - self.assertEqual( - step4_active, - step2_active, - "Step 4 active count should equal step 2 (baseline reimport)", - ) - self.assertEqual( - step4_duplicate, - step2_duplicate, - "Step 4 duplicate count should equal step 2 (baseline reimport)", - ) + # Each assertion is wrapped in its own subTest so that if one fails, the others still run. + # This allows us to see all count mismatches in a single test run, making it easier to fix + # all incorrect expected values at once rather than fixing them one at a time. + with self.subTest(step=4, metric="active"): + self.assertEqual( + step4_active, + step2_active, + "Step 4 active count should equal step 2 (baseline reimport)", + ) + with self.subTest(step=4, metric="duplicate"): + self.assertEqual( + step4_duplicate, + step2_duplicate, + "Step 4 duplicate count should equal step 2 (baseline reimport)", + ) def test_batch_deduplication_large_bandit_file(self): """ @@ -2304,16 +2324,21 @@ def test_batch_deduplication_large_bandit_file(self): duplicate=True, ).count() - self.assertEqual( - step3_active, - step1_active, - "Step 3 active count should equal step 1 (baseline import)", - ) - self.assertEqual( - step3_duplicate, - step1_duplicate, - "Step 3 duplicate count should equal step 1 (baseline import)", - ) + # Each assertion is wrapped in its own subTest so that if one fails, the others still run. + # This allows us to see all count mismatches in a single test run, making it easier to fix + # all incorrect expected values at once rather than fixing them one at a time. 
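For readers less familiar with the pattern used below: unittest's subTest context turns each failed assertion into its own reported failure instead of aborting the whole test method at the first mismatch. A minimal, self-contained example (CountsExample and its numbers are hypothetical):

import unittest

class CountsExample(unittest.TestCase):
    def test_counts(self):
        observed = {"active": 210, "duplicate": 3}
        expected = {"active": 213, "duplicate": 1}
        for metric, value in expected.items():
            with self.subTest(metric=metric):
                # both mismatches show up in a single run, not just the first one
                self.assertEqual(observed[metric], value)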
+ with self.subTest(step=3, metric="active"): + self.assertEqual( + step3_active, + step1_active, + "Step 3 active count should equal step 1 (baseline import)", + ) + with self.subTest(step=3, metric="duplicate"): + self.assertEqual( + step3_duplicate, + step1_duplicate, + "Step 3 duplicate count should equal step 1 (baseline import)", + ) # Step 4: Import scan again in same new engagement (batch_size=50) self.import_scan_with_params( @@ -2337,16 +2362,21 @@ def test_batch_deduplication_large_bandit_file(self): duplicate=True, ).count() - self.assertEqual( - step4_active, - step2_active, - "Step 4 active count should equal step 2 (baseline second import)", - ) - self.assertEqual( - step4_duplicate, - step2_duplicate, - "Step 4 duplicate count should equal step 2 (baseline second import)", - ) + # Each assertion is wrapped in its own subTest so that if one fails, the others still run. + # This allows us to see all count mismatches in a single test run, making it easier to fix + # all incorrect expected values at once rather than fixing them one at a time. + with self.subTest(step=4, metric="active"): + self.assertEqual( + step4_active, + step2_active, + "Step 4 active count should equal step 2 (baseline second import)", + ) + with self.subTest(step=4, metric="duplicate"): + self.assertEqual( + step4_duplicate, + step2_duplicate, + "Step 4 duplicate count should equal step 2 (baseline second import)", + ) class ImportReimportTestUI(DojoAPITestCase, ImportReimportMixin): diff --git a/unittests/test_importers_performance.py b/unittests/test_importers_performance.py index 1bf1ab87795..217ed318ea2 100644 --- a/unittests/test_importers_performance.py +++ b/unittests/test_importers_performance.py @@ -121,72 +121,90 @@ def _import_reimport_performance( ) # First import - with ( + # Each assertion context manager is wrapped in its own subTest so that if one fails, the others still run. + # This allows us to see all count mismatches in a single test run, making it easier to fix + # all incorrect expected values at once rather than fixing them one at a time. + # Nested with statements are intentional - each assertion needs its own subTest wrapper. 
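Editor's illustration of why the nesting called out above matters: counting context managers such as assertNumQueries only raise when they exit, so each one gets its own enclosing subTest and a failure in the outer counter no longer hides the inner one. The assert_calls helper below is a hypothetical stand-in for assertNumQueries / _assertNumAsyncTask.

import unittest
from contextlib import contextmanager

class NestedCountersExample(unittest.TestCase):
    @contextmanager
    def assert_calls(self, calls, expected):
        yield
        self.assertEqual(len(calls), expected)  # raises on exit, like assertNumQueries

    def test_both_counters_are_reported(self):
        queries, tasks = [], []
        with self.subTest(metric="queries"):
            with self.assert_calls(queries, 2):
                with self.subTest(metric="tasks"):
                    with self.assert_calls(tasks, 1):
                        queries.append("SELECT 1")  # only one query; the outer check will fail
                        tasks.append("task")
        # the queries mismatch is recorded by its subTest while the tasks check still ran and passed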
+ with ( # noqa: SIM117 self.subTest("import1"), impersonate(Dojo_User.objects.get(username="admin")), - self.assertNumQueries(expected_num_queries1), - self._assertNumAsyncTask(expected_num_async_tasks1), scan_file1.open(encoding="utf-8") as scan, ): - import_options = { - "user": lead, - "lead": lead, - "scan_date": None, - "environment": environment, - "minimum_severity": "Info", - "active": True, - "verified": True, - "sync": True, - "scan_type": scan_type, - "engagement": engagement, - "tags": ["performance-test", "tag-in-param", "go-faster"], - "apply_tags_to_findings": True, - } - importer = DefaultImporter(**import_options) - test, _, _len_new_findings, _len_closed_findings, _, _, _ = importer.process_scan(scan) + with self.subTest(step="import1", metric="queries"): + with self.assertNumQueries(expected_num_queries1): + with self.subTest(step="import1", metric="async_tasks"): + with self._assertNumAsyncTask(expected_num_async_tasks1): + import_options = { + "user": lead, + "lead": lead, + "scan_date": None, + "environment": environment, + "minimum_severity": "Info", + "active": True, + "verified": True, + "sync": True, + "scan_type": scan_type, + "engagement": engagement, + "tags": ["performance-test", "tag-in-param", "go-faster"], + "apply_tags_to_findings": True, + } + importer = DefaultImporter(**import_options) + test, _, _len_new_findings, _len_closed_findings, _, _, _ = importer.process_scan(scan) # Second import (reimport) - with ( + # Each assertion context manager is wrapped in its own subTest so that if one fails, the others still run. + # This allows us to see all count mismatches in a single test run, making it easier to fix + # all incorrect expected values at once rather than fixing them one at a time. + # Nested with statements are intentional - each assertion needs its own subTest wrapper. + with ( # noqa: SIM117 self.subTest("reimport1"), impersonate(Dojo_User.objects.get(username="admin")), - self.assertNumQueries(expected_num_queries2), - self._assertNumAsyncTask(expected_num_async_tasks2), scan_file2.open(encoding="utf-8") as scan, ): - reimport_options = { - "test": test, - "user": lead, - "lead": lead, - "scan_date": None, - "minimum_severity": "Info", - "active": True, - "verified": True, - "sync": True, - "scan_type": scan_type, - "tags": ["performance-test-reimport", "reimport-tag-in-param", "reimport-go-faster"], - "apply_tags_to_findings": True, - } - reimporter = DefaultReImporter(**reimport_options) - test, _, _len_new_findings, _len_closed_findings, _, _, _ = reimporter.process_scan(scan) + with self.subTest(step="reimport1", metric="queries"): + with self.assertNumQueries(expected_num_queries2): + with self.subTest(step="reimport1", metric="async_tasks"): + with self._assertNumAsyncTask(expected_num_async_tasks2): + reimport_options = { + "test": test, + "user": lead, + "lead": lead, + "scan_date": None, + "minimum_severity": "Info", + "active": True, + "verified": True, + "sync": True, + "scan_type": scan_type, + "tags": ["performance-test-reimport", "reimport-tag-in-param", "reimport-go-faster"], + "apply_tags_to_findings": True, + } + reimporter = DefaultReImporter(**reimport_options) + test, _, _len_new_findings, _len_closed_findings, _, _, _ = reimporter.process_scan(scan) # Third import (reimport again) - with ( + # Each assertion context manager is wrapped in its own subTest so that if one fails, the others still run. 
+ # This allows us to see all count mismatches in a single test run, making it easier to fix + # all incorrect expected values at once rather than fixing them one at a time. + # Nested with statements are intentional - each assertion needs its own subTest wrapper. + with ( # noqa: SIM117 self.subTest("reimport2"), impersonate(Dojo_User.objects.get(username="admin")), - self.assertNumQueries(expected_num_queries3), - self._assertNumAsyncTask(expected_num_async_tasks3), scan_file3.open(encoding="utf-8") as scan, ): - reimport_options = { - "test": test, - "user": lead, - "lead": lead, - "scan_date": None, - "minimum_severity": "Info", - "active": True, - "verified": True, - "sync": True, - "scan_type": scan_type, - } - reimporter = DefaultReImporter(**reimport_options) - test, _, _len_new_findings, _len_closed_findings, _, _, _ = reimporter.process_scan(scan) + with self.subTest(step="reimport2", metric="queries"): + with self.assertNumQueries(expected_num_queries3): + with self.subTest(step="reimport2", metric="async_tasks"): + with self._assertNumAsyncTask(expected_num_async_tasks3): + reimport_options = { + "test": test, + "user": lead, + "lead": lead, + "scan_date": None, + "minimum_severity": "Info", + "active": True, + "verified": True, + "sync": True, + "scan_type": scan_type, + } + reimporter = DefaultReImporter(**reimport_options) + test, _, _len_new_findings, _len_closed_findings, _, _, _ = reimporter.process_scan(scan) class TestDojoImporterPerformanceSmall(TestDojoImporterPerformanceBase): @@ -358,46 +376,58 @@ def _deduplication_performance(self, expected_num_queries1, expected_num_async_t ) # First import - all findings should be new - with ( + # Each assertion context manager is wrapped in its own subTest so that if one fails, the others still run. + # This allows us to see all count mismatches in a single test run, making it easier to fix + # all incorrect expected values at once rather than fixing them one at a time. + # Nested with statements are intentional - each assertion needs its own subTest wrapper. + with ( # noqa: SIM117 self.subTest("first_import"), impersonate(Dojo_User.objects.get(username="admin")), - self.assertNumQueries(expected_num_queries1), - self._assertNumAsyncTask(expected_num_async_tasks1), STACK_HAWK_FILENAME.open(encoding="utf-8") as scan, ): - import_options = { - "user": lead, - "lead": lead, - "scan_date": None, - "environment": environment, - "minimum_severity": "Info", - "active": True, - "verified": True, - "scan_type": STACK_HAWK_SCAN_TYPE, - "engagement": engagement, - } - importer = DefaultImporter(**import_options) - _, _, len_new_findings1, len_closed_findings1, _, _, _ = importer.process_scan(scan) + with self.subTest(step="first_import", metric="queries"): + with self.assertNumQueries(expected_num_queries1): + with self.subTest(step="first_import", metric="async_tasks"): + with self._assertNumAsyncTask(expected_num_async_tasks1): + import_options = { + "user": lead, + "lead": lead, + "scan_date": None, + "environment": environment, + "minimum_severity": "Info", + "active": True, + "verified": True, + "scan_type": STACK_HAWK_SCAN_TYPE, + "engagement": engagement, + } + importer = DefaultImporter(**import_options) + _, _, len_new_findings1, len_closed_findings1, _, _, _ = importer.process_scan(scan) # Second import - all findings should be duplicates - with ( + # Each assertion context manager is wrapped in its own subTest so that if one fails, the others still run. 
+ # This allows us to see all count mismatches in a single test run, making it easier to fix + # all incorrect expected values at once rather than fixing them one at a time. + # Nested with statements are intentional - each assertion needs its own subTest wrapper. + with ( # noqa: SIM117 self.subTest("second_import"), impersonate(Dojo_User.objects.get(username="admin")), - self.assertNumQueries(expected_num_queries2), - self._assertNumAsyncTask(expected_num_async_tasks2), STACK_HAWK_FILENAME.open(encoding="utf-8") as scan, ): - import_options = { - "user": lead, - "lead": lead, - "scan_date": None, - "environment": environment, - "minimum_severity": "Info", - "active": True, - "verified": True, - "scan_type": STACK_HAWK_SCAN_TYPE, - "engagement": engagement, - } - importer = DefaultImporter(**import_options) - _, _, len_new_findings2, len_closed_findings2, _, _, _ = importer.process_scan(scan) + with self.subTest(step="second_import", metric="queries"): + with self.assertNumQueries(expected_num_queries2): + with self.subTest(step="second_import", metric="async_tasks"): + with self._assertNumAsyncTask(expected_num_async_tasks2): + import_options = { + "user": lead, + "lead": lead, + "scan_date": None, + "environment": environment, + "minimum_severity": "Info", + "active": True, + "verified": True, + "scan_type": STACK_HAWK_SCAN_TYPE, + "engagement": engagement, + } + importer = DefaultImporter(**import_options) + _, _, len_new_findings2, len_closed_findings2, _, _, _ = importer.process_scan(scan) # Log the results for analysis logger.debug(f"First import: {len_new_findings1} new findings, {len_closed_findings1} closed findings") From 6e7da13bf7729362715e39491655b5acdcaabe0b Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sat, 13 Dec 2025 13:05:23 +0100 Subject: [PATCH 04/26] reimport: add test for internal duplicates during matching --- unittests/test_import_reimport.py | 112 ++++++++++++++++++++++++++++++ 1 file changed, 112 insertions(+) diff --git a/unittests/test_import_reimport.py b/unittests/test_import_reimport.py index 2ac71c155bb..539f879adae 100644 --- a/unittests/test_import_reimport.py +++ b/unittests/test_import_reimport.py @@ -2378,6 +2378,118 @@ def test_batch_deduplication_large_bandit_file(self): "Step 4 duplicate count should equal step 2 (baseline second import)", ) + @override_settings( + IMPORT_REIMPORT_DEDUPE_BATCH_SIZE=50, + IMPORT_REIMPORT_MATCH_BATCH_SIZE=50, + ) + def test_reimport_with_internal_duplicates_same_batch(self): + """ + Test that duplicates within the same scan report are detected during reimport + even when they're processed in the same batch. + + This test mimics the integration test test_import_path_tests in tests/dedupe_test.py, + which tests the same scenario but uses Selenium UI testing. + + BACKGROUND: + When reimporting a scan report, the system uses batch processing to optimize + database queries. Findings are processed in batches (default: 1000), and for + each batch, candidate matches are fetched from the database once. However, this + creates a bug: if multiple duplicate findings exist within the same scan report + and they're processed in the same batch, they cannot match each other because + the candidate dictionary is built before any findings in the batch are saved. 
+ + THE BUG: + - Finding #1 in batch: No matches found → Creates new finding + - Finding #2 in batch: No matches found (Finding #1 not in DB yet) → Creates duplicate + - Finding #3 in batch: No matches found (Findings #1 and #2 not in DB yet) → Creates duplicate + Result: 3 findings created instead of 1 unique finding + + THIS TEST: + This test reproduces the bug by: + 1. Creating an empty test (no existing findings to match against) + 2. Reimporting dedupe_path_1.json which contains 3 duplicate findings: + - All have same file_path: "/opt/appsecpipeline/source/dojo/cred/views.py" + - All have same line_number: 524 + - All have same test_id: "B110" + - All should have the same hash_code and match each other + 3. Using small batch size (50) to ensure all 3 duplicates are in the same batch + 4. Verifying that only 1 finding is created (not 3) + + EXPECTED BEHAVIOR: + - Total findings: 1 (the 3 duplicates should match to 1 unique finding) + - Active findings: 1 + - Duplicate findings: 0 (duplicates within report should match, not be marked as duplicates) + + CURRENT BUGGY BEHAVIOR: + - Total findings: 3 (all duplicates are created because they can't match each other) + - Active findings: 1 (first one) + - Duplicate findings: 2 (second and third are marked as duplicates of first) + + This test should fail with the current buggy implementation and pass after + implementing intra-batch duplicate tracking. + """ + # Use the dedupe_path_1.json file from integration tests (has 3 duplicate findings) + # Note: This file is in tests/dedupe_scans, not unittests/scans + dedupe_path_file = Path(__file__).parent.parent / "tests" / "dedupe_scans" / "dedupe_path_1.json" + + # Create engagement and test + product_type, _ = Product_Type.objects.get_or_create(name="PT Bandit Internal Dupes") + product, _ = Product.objects.get_or_create(name="P Bandit Internal Dupes", prod_type=product_type) + engagement = Engagement.objects.create( + name="E Bandit Internal Dupes", + product=product, + target_start=timezone.now(), + target_end=timezone.now(), + ) + engagement.deduplication_on_engagement = True + engagement.save() + + # Create an empty test first + # Get or create the test type + test_type, _ = Test_Type.objects.get_or_create(name="Bandit Scan") + test = Test.objects.create( + engagement=engagement, + test_type=test_type, + title="Path Test 1", + target_start=timezone.now(), + target_end=timezone.now(), + ) + test_id = test.id + + # Now reimport the file with duplicates to the empty test + # This tests reimport batch matching where duplicates within the same report need to match each other + reimport_result = self.reimport_scan_with_params( + test_id, + dedupe_path_file, + scan_type="Bandit Scan", + ) + + test_id = reimport_result["test"] + + # Verify findings count + # dedupe_path_1.json has 3 findings that are duplicates (same file_path, line_number, test_id) + # They should all match each other, resulting in 1 unique finding + # However, if intra-batch matching fails, we'll get 3 findings created instead of 1 + total_findings = Finding.objects.filter(test_id=test_id).count() + active_findings = Finding.objects.filter(test_id=test_id, active=True, duplicate=False).count() + duplicate_findings = Finding.objects.filter(test_id=test_id, duplicate=True).count() + + # Expected: 1 active finding (the 3 duplicates should match each other) + # If intra-batch matching fails, we'll get 3 findings instead + with self.subTest(metric="total_findings"): + self.assertEqual(total_findings, 1, + f"Expected 1 total finding (3 
duplicates should match to 1), got {total_findings}. " + f"If this is 3, intra-batch duplicates weren't detected.") + + with self.subTest(metric="active_findings"): + self.assertEqual(active_findings, 1, + f"Expected 1 active finding, got {active_findings}. " + f"If this fails, intra-batch duplicates weren't detected.") + + with self.subTest(metric="duplicate_findings"): + self.assertEqual(duplicate_findings, 0, + f"Expected 0 duplicate findings (duplicates within report should match), got {duplicate_findings}") + class ImportReimportTestUI(DojoAPITestCase, ImportReimportMixin): fixtures = ["dojo_testdata.json"] From ae0bce2ae6acd31a231e818bc7faac57217773b5 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sat, 13 Dec 2025 16:51:10 +0100 Subject: [PATCH 05/26] reimport: fix handling of duplicates inside the same report/batch --- dojo/importers/default_reimporter.py | 29 +++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/dojo/importers/default_reimporter.py b/dojo/importers/default_reimporter.py index 2e426ba2fda..340f7c27cd0 100644 --- a/dojo/importers/default_reimporter.py +++ b/dojo/importers/default_reimporter.py @@ -189,8 +189,6 @@ def process_findings( self.reactivated_items = [] self.unchanged_items = [] self.group_names_to_findings_dict = {} - # Progressive batching for chord execution - # No chord: we dispatch per 1000 findings or on the final finding logger.debug(f"starting reimport of {len(parsed_findings) if parsed_findings else 0} items.") logger.debug("STEP 1: looping over findings from the reimported report and trying to match them to existing findings") @@ -244,9 +242,9 @@ def process_findings( deduplicationLogger.debug(f"unsaved finding's hash_code: {unsaved_finding.hash_code}") # Fetch all candidates for this batch at once (batch candidate finding) - candidates_by_hash = None - candidates_by_uid = None - candidates_by_key = None + candidates_by_hash = {} + candidates_by_uid = {} + candidates_by_key = {} if self.deduplication_algorithm == "hash_code": candidates_by_hash = find_candidates_for_deduplication_hash( @@ -266,6 +264,7 @@ def process_findings( # Process each finding in the batch using pre-fetched candidates for idx, unsaved_finding in enumerate(batch_findings): is_final = is_final_batch and idx == len(batch_findings) - 1 + # Match any findings to this new one coming in using pre-fetched candidates matched_findings = self.match_finding_for_reimport( unsaved_finding, @@ -297,6 +296,26 @@ def process_findings( ) else: finding = self.process_finding_that_was_not_matched(unsaved_finding) + + # Add newly created finding to candidates for subsequent findings in this batch + if finding: + if finding.hash_code: + candidates_by_hash.setdefault(finding.hash_code, []).append(finding) + deduplicationLogger.debug( + f"Added finding {finding.id} (hash_code: {finding.hash_code}) to candidates for next findings in this report", + ) + if finding.unique_id_from_tool: + candidates_by_uid.setdefault(finding.unique_id_from_tool, []).append(finding) + deduplicationLogger.debug( + f"Added finding {finding.id} (unique_id_from_tool: {finding.unique_id_from_tool}) to candidates for next findings in this report", + ) + if finding.title: + legacy_key = (finding.title.lower(), finding.severity) + candidates_by_key.setdefault(legacy_key, []).append(finding) + deduplicationLogger.debug( + f"Added finding {finding.id} (title: {finding.title}, severity: {finding.severity}) to candidates for next findings in this report", + ) + # This condition 
__appears__ to always be true, but am afraid to remove it if finding: # Process the rest of the items on the finding From a427db7c5b5d8a42c25b6e72dc4bf6e8f89b6a21 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sat, 13 Dec 2025 18:24:01 +0100 Subject: [PATCH 06/26] add id --- dojo/management/commands/list_top_tests.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/dojo/management/commands/list_top_tests.py b/dojo/management/commands/list_top_tests.py index 839a1dc03fd..7666d707101 100644 --- a/dojo/management/commands/list_top_tests.py +++ b/dojo/management/commands/list_top_tests.py @@ -35,6 +35,10 @@ def handle(self, *args, **options): return # Calculate column widths + max_test_id_len = max( + (len(str(test.id)) for test in tests), + default=8, + ) max_product_len = max( (len(str(test.engagement.product.name)) for test in tests), default=20, @@ -57,6 +61,7 @@ def handle(self, *args, **options): ) # Ensure minimum widths for readability + max_test_id_len = max(max_test_id_len, 8) max_product_len = max(max_product_len, 20) max_engagement_len = max(max_engagement_len, 20) max_test_len = max(max_test_len, 20) @@ -65,6 +70,7 @@ def handle(self, *args, **options): # Header header = ( + f"{'Test ID':<{max_test_id_len}} | " f"{'Product':<{max_product_len}} | " f"{'Engagement':<{max_engagement_len}} | " f"{'Test':<{max_test_len}} | " @@ -81,6 +87,7 @@ def handle(self, *args, **options): # Data rows for test in tests: + test_id = str(test.id) product_name = str(test.engagement.product.name) engagement_name = str(test.engagement.name) test_name = str(test.title or f"Test #{test.id}") @@ -91,6 +98,7 @@ def handle(self, *args, **options): duplicate = test.duplicate_findings row = ( + f"{test_id:<{max_test_id_len}} | " f"{product_name:<{max_product_len}} | " f"{engagement_name:<{max_engagement_len}} | " f"{test_name:<{max_test_len}} | " From c5c7a0d8f9a6cc78548884b6da1a73d99941a2d4 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sat, 13 Dec 2025 18:49:00 +0100 Subject: [PATCH 07/26] reimport: optimize vulnerability_id processing --- dojo/finding/deduplication.py | 2 +- dojo/importers/base_importer.py | 27 ++++++++++++++++- unittests/test_importers_performance.py | 40 ++++++++++++------------- 3 files changed, 47 insertions(+), 22 deletions(-) diff --git a/dojo/finding/deduplication.py b/dojo/finding/deduplication.py index b51c791c199..b397add4efd 100644 --- a/dojo/finding/deduplication.py +++ b/dojo/finding/deduplication.py @@ -255,7 +255,7 @@ def build_candidate_scope_queryset(test, mode="deduplication", service=None): return ( queryset .select_related("test", "test__engagement", "test__test_type") - .prefetch_related("endpoints") + .prefetch_related("endpoints", "vulnerability_id_set") ) diff --git a/dojo/importers/base_importer.py b/dojo/importers/base_importer.py index 096161db3fa..ab10f1128ca 100644 --- a/dojo/importers/base_importer.py +++ b/dojo/importers/base_importer.py @@ -793,9 +793,34 @@ def process_vulnerability_ids( ) -> Finding: """ Parse the `unsaved_vulnerability_ids` field from findings after they are parsed - to create `Vulnerability_Id` objects with the finding associated correctly + to create `Vulnerability_Id` objects with the finding associated correctly. + Only updates if vulnerability_ids have changed to avoid unnecessary database operations. 
+ + Args: + finding: The finding to process vulnerability IDs for + + Returns: + The finding object + """ if finding.unsaved_vulnerability_ids: + # Only check for changes if the finding has been saved and might have existing vulnerability_ids + # For new findings, we always need to save vulnerability_ids + if finding.pk and finding.vulnerability_id_set.exists(): + # Check if vulnerability_ids have changed before updating + # Get existing vulnerability IDs from the database + existing_vuln_ids = set(finding.vulnerability_ids) if finding.vulnerability_ids else set() + # Normalize the new vulnerability IDs (remove duplicates for comparison) + new_vuln_ids = set(finding.unsaved_vulnerability_ids) + + # Only update if vulnerability IDs have changed + if existing_vuln_ids == new_vuln_ids: + logger.debug( + f"Skipping vulnerability_ids update for finding {finding.id} - " + f"vulnerability_ids unchanged: {sorted(existing_vuln_ids)}", + ) + return finding + # Remove old vulnerability ids - keeping this call only because of flake8 Vulnerability_Id.objects.filter(finding=finding).delete() diff --git a/unittests/test_importers_performance.py b/unittests/test_importers_performance.py index 217ed318ea2..c422140a433 100644 --- a/unittests/test_importers_performance.py +++ b/unittests/test_importers_performance.py @@ -244,9 +244,9 @@ def test_import_reimport_reimport_performance_async(self): self._import_reimport_performance( expected_num_queries1=340, expected_num_async_tasks1=7, - expected_num_queries2=274, + expected_num_queries2=275, expected_num_async_tasks2=18, - expected_num_queries3=162, + expected_num_queries3=163, expected_num_async_tasks3=17, ) @@ -262,9 +262,9 @@ def test_import_reimport_reimport_performance_pghistory_async(self): self._import_reimport_performance( expected_num_queries1=306, expected_num_async_tasks1=7, - expected_num_queries2=267, + expected_num_queries2=268, expected_num_async_tasks2=18, - expected_num_queries3=157, + expected_num_queries3=158, expected_num_async_tasks3=17, ) @@ -284,11 +284,11 @@ def test_import_reimport_reimport_performance_no_async(self): testuser.usercontactinfo.block_execution = True testuser.usercontactinfo.save() self._import_reimport_performance( - expected_num_queries1=345, + expected_num_queries1=346, expected_num_async_tasks1=6, - expected_num_queries2=279, + expected_num_queries2=281, expected_num_async_tasks2=17, - expected_num_queries3=167, + expected_num_queries3=169, expected_num_async_tasks3=16, ) @@ -306,11 +306,11 @@ def test_import_reimport_reimport_performance_pghistory_no_async(self): testuser.usercontactinfo.save() self._import_reimport_performance( - expected_num_queries1=311, + expected_num_queries1=312, expected_num_async_tasks1=6, - expected_num_queries2=272, + expected_num_queries2=274, expected_num_async_tasks2=17, - expected_num_queries3=162, + expected_num_queries3=164, expected_num_async_tasks3=16, ) @@ -332,11 +332,11 @@ def test_import_reimport_reimport_performance_no_async_with_product_grading(self self.system_settings(enable_product_grade=True) self._import_reimport_performance( - expected_num_queries1=347, + expected_num_queries1=348, expected_num_async_tasks1=8, - expected_num_queries2=281, + expected_num_queries2=283, expected_num_async_tasks2=19, - expected_num_queries3=169, + expected_num_queries3=171, expected_num_async_tasks3=18, ) @@ -355,11 +355,11 @@ def test_import_reimport_reimport_performance_pghistory_no_async_with_product_gr self.system_settings(enable_product_grade=True) self._import_reimport_performance( - 
expected_num_queries1=313, + expected_num_queries1=314, expected_num_async_tasks1=8, - expected_num_queries2=274, + expected_num_queries2=276, expected_num_async_tasks2=19, - expected_num_queries3=164, + expected_num_queries3=166, expected_num_async_tasks3=18, ) @@ -519,9 +519,9 @@ def test_deduplication_performance_no_async(self): testuser.usercontactinfo.save() self._deduplication_performance( - expected_num_queries1=316, + expected_num_queries1=317, expected_num_async_tasks1=7, - expected_num_queries2=287, + expected_num_queries2=288, expected_num_async_tasks2=7, ) @@ -539,8 +539,8 @@ def test_deduplication_performance_pghistory_no_async(self): testuser.usercontactinfo.save() self._deduplication_performance( - expected_num_queries1=280, + expected_num_queries1=281, expected_num_async_tasks1=7, - expected_num_queries2=250, + expected_num_queries2=251, expected_num_async_tasks2=7, ) From 1e78c9a73430530c715aec331d040aa6385385a3 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 14 Dec 2025 00:12:15 +0100 Subject: [PATCH 08/26] vulnerability_id processing: remove duplicate delete, fix bug, add tests --- dojo/finding/helper.py | 9 +- dojo/importers/base_importer.py | 54 +++--- dojo/importers/default_reimporter.py | 17 +- ...k_all_fields_different_ids_fabricated.json | 167 ++++++++++++++++++ .../check_all_fields_no_ids_fabricated.json | 122 +++++++++++++ .../check_all_fields_with_ids_fabricated.json | 158 +++++++++++++++++ unittests/test_import_reimport.py | 75 ++++++++ unittests/test_importers_importer.py | 85 ++++++++- 8 files changed, 654 insertions(+), 33 deletions(-) create mode 100644 unittests/scans/anchore_grype/check_all_fields_different_ids_fabricated.json create mode 100644 unittests/scans/anchore_grype/check_all_fields_no_ids_fabricated.json create mode 100644 unittests/scans/anchore_grype/check_all_fields_with_ids_fabricated.json diff --git a/dojo/finding/helper.py b/dojo/finding/helper.py index 277609d3153..b48692a4ebd 100644 --- a/dojo/finding/helper.py +++ b/dojo/finding/helper.py @@ -762,12 +762,15 @@ def add_endpoints(new_finding, form): endpoint=endpoint, defaults={"date": form.cleaned_data["date"] or timezone.now()}) -def save_vulnerability_ids(finding, vulnerability_ids): +def save_vulnerability_ids(finding, vulnerability_ids, *, delete_existing: bool = True): # Remove duplicates vulnerability_ids = list(dict.fromkeys(vulnerability_ids)) - # Remove old vulnerability ids - Vulnerability_Id.objects.filter(finding=finding).delete() + # Remove old vulnerability ids if requested + # Callers can set delete_existing=False when they know there are no existing IDs + # to avoid an unnecessary delete query (e.g., for new findings) + if delete_existing: + Vulnerability_Id.objects.filter(finding=finding).delete() # Save new vulnerability ids # Using bulk create throws Django 50 warnings about unsaved models... diff --git a/dojo/importers/base_importer.py b/dojo/importers/base_importer.py index ab10f1128ca..6933a2a28e4 100644 --- a/dojo/importers/base_importer.py +++ b/dojo/importers/base_importer.py @@ -31,7 +31,6 @@ Test_Import, Test_Import_Finding_Action, Test_Type, - Vulnerability_Id, ) from dojo.notifications.helper import create_notification from dojo.tag_utils import bulk_add_tags_to_instances @@ -796,36 +795,41 @@ def process_vulnerability_ids( to create `Vulnerability_Id` objects with the finding associated correctly. Only updates if vulnerability_ids have changed to avoid unnecessary database operations. 
+ Note: The finding parameter can be: + - A new finding (from import) with unsaved_vulnerability_ids set from the parsed report + - An existing finding (from reimport) with unsaved_vulnerability_ids copied from the + finding_from_report before calling this method + Args: - finding: The finding to process vulnerability IDs for + finding: The finding to process vulnerability IDs for. + For reimports, this is the existing finding with unsaved_vulnerability_ids + set from the import/reimport report. Returns: The finding object """ - if finding.unsaved_vulnerability_ids: - # Only check for changes if the finding has been saved and might have existing vulnerability_ids - # For new findings, we always need to save vulnerability_ids - if finding.pk and finding.vulnerability_id_set.exists(): - # Check if vulnerability_ids have changed before updating - # Get existing vulnerability IDs from the database - existing_vuln_ids = set(finding.vulnerability_ids) if finding.vulnerability_ids else set() - # Normalize the new vulnerability IDs (remove duplicates for comparison) - new_vuln_ids = set(finding.unsaved_vulnerability_ids) - - # Only update if vulnerability IDs have changed - if existing_vuln_ids == new_vuln_ids: - logger.debug( - f"Skipping vulnerability_ids update for finding {finding.id} - " - f"vulnerability_ids unchanged: {sorted(existing_vuln_ids)}", - ) - return finding - - # Remove old vulnerability ids - keeping this call only because of flake8 - Vulnerability_Id.objects.filter(finding=finding).delete() - - # user the helper function - finding_helper.save_vulnerability_ids(finding, finding.unsaved_vulnerability_ids) + # Normalize to empty list if None - always process vulnerability IDs + vulnerability_ids_to_process = finding.unsaved_vulnerability_ids or [] + + # For existing findings, check if there are changes before updating + if finding.pk and len(finding.vulnerability_id_set.all()) > 0: + # Get existing vulnerability IDs from the database + existing_vuln_ids = set(finding.vulnerability_ids) if finding.vulnerability_ids else set() + # Normalize the new vulnerability IDs (remove duplicates for comparison) + new_vuln_ids = set(vulnerability_ids_to_process) + + # Only update if vulnerability IDs have changed + if existing_vuln_ids == new_vuln_ids: + logger.debug( + f"Skipping vulnerability_ids update for finding {finding.id} - " + f"vulnerability_ids unchanged: {sorted(existing_vuln_ids)}", + ) + return finding + + # Use the helper function which handles deletion and creation + # Always use delete_existing=True - it's a no-op if there are no existing IDs + finding_helper.save_vulnerability_ids(finding, vulnerability_ids_to_process, delete_existing=True) return finding diff --git a/dojo/importers/default_reimporter.py b/dojo/importers/default_reimporter.py index 340f7c27cd0..1b8747f7083 100644 --- a/dojo/importers/default_reimporter.py +++ b/dojo/importers/default_reimporter.py @@ -851,7 +851,7 @@ def finding_post_processing( self, finding: Finding, finding_from_report: Finding, - ) -> None: + ) -> Finding: """ Save all associated objects to the finding after it has been saved for the purpose of foreign key restrictions @@ -871,10 +871,19 @@ def finding_post_processing( finding.unsaved_files = finding_from_report.unsaved_files self.process_files(finding) # Process vulnerability IDs - if finding_from_report.unsaved_vulnerability_ids: - finding.unsaved_vulnerability_ids = finding_from_report.unsaved_vulnerability_ids + # Copy unsaved_vulnerability_ids from the report finding to the existing 
finding + # so process_vulnerability_ids can process them (see its docstring for details) + # Always set it (even if empty list) so we can clear existing IDs when report has none + finding.unsaved_vulnerability_ids = finding_from_report.unsaved_vulnerability_ids or [] + # Store the current cve value to check if it changes + old_cve = finding.cve # legacy cve field has already been processed/set earlier - return self.process_vulnerability_ids(finding) + finding = self.process_vulnerability_ids(finding) + # Save the finding only if the cve field was changed by save_vulnerability_ids + # This is temporary as the cve field will be phased out + if finding.cve != old_cve: + finding.save() + return finding def process_groups_for_all_findings( self, diff --git a/unittests/scans/anchore_grype/check_all_fields_different_ids_fabricated.json b/unittests/scans/anchore_grype/check_all_fields_different_ids_fabricated.json new file mode 100644 index 00000000000..8520e8828dc --- /dev/null +++ b/unittests/scans/anchore_grype/check_all_fields_different_ids_fabricated.json @@ -0,0 +1,167 @@ +{ + "matches": [ + { + "vulnerability": { + "id": "GHSA-v6rh-hp5x-86rv", + "dataSource": "https://github.com/advisories/GHSA-v6rh-hp5x-86rv", + "namespace": "github:python", + "severity": "High", + "urls": [ + "https://github.com/advisories/GHSA-v6rh-hp5x-86rv" + ], + "description": "Potential bypass of an upstream access control based on URL paths in Django", + "cvss": [], + "fix": { + "versions": [ + "3.2.10" + ], + "state": "fixed" + }, + "advisories": [] + }, + "relatedVulnerabilities": [ + { + "id": "CVE-2021-1234", + "dataSource": "https://nvd.nist.gov/vuln/detail/CVE-2021-1234", + "namespace": "nvd", + "severity": "High", + "urls": [ + "https://example.com/cve-2021-1234" + ], + "description": "A different CVE for testing vulnerability ID changes", + "cvss": [ + { + "version": "3.1", + "vector": "CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H", + "metrics": { + "baseScore": 9.8, + "exploitabilityScore": 3.9, + "impactScore": 5.9 + }, + "vendorMetadata": {} + } + ] + }, + { + "id": "CVE-2021-5678", + "dataSource": "https://nvd.nist.gov/vuln/detail/CVE-2021-5678", + "namespace": "nvd", + "severity": "Medium", + "urls": [ + "https://example.com/cve-2021-5678" + ], + "description": "Another different CVE for testing vulnerability ID changes", + "cvss": [ + { + "version": "3.1", + "vector": "CVSS:3.1/AV:N/AC:L/PR:L/UI:N/S:U/C:L/I:L/A:L", + "metrics": { + "baseScore": 6.3, + "exploitabilityScore": 3.9, + "impactScore": 2.4 + }, + "vendorMetadata": {} + } + ] + } + ], + "matchDetails": [ + { + "matcher": "python-matcher", + "searchedBy": { + "language": "python", + "namespace": "github:python" + }, + "found": { + "versionConstraint": ">=3.2,<3.2.10 (python)" + } + } + ], + "artifact": { + "name": "Django", + "version": "3.2.9", + "type": "python", + "locations": [ + { + "path": "/usr/local/lib/python3.8/site-packages/Django-3.2.9.dist-info/METADATA", + "layerID": "sha256:b1d4455cf82b15a50b006fe87bd29f694c8f9155456253eb67fdd155b5edcf4a" + } + ], + "language": "python", + "licenses": [ + "BSD-3-Clause" + ], + "cpes": [ + "cpe:2.3:a:django_software_foundation:Django:3.2.9:*:*:*:*:*:*:*" + ], + "purl": "pkg:pypi/Django@3.2.9", + "metadata": null + } + } + ], + "source": { + "type": "image", + "target": { + "userInput": "vulnerable-image:latest", + "imageID": "sha256:ce9898fd214aef9c994a42624b09056bdce3ff4a8e3f68dc242d967b80fcbeee", + "manifestDigest": "sha256:9d8825ab20ac86b40eb71495bece1608a302fb180384740697a28c2b0a5a0fc6", + 
"mediaType": "application/vnd.docker.distribution.manifest.v2+json", + "tags": [ + "vulnerable-image:latest" + ], + "imageSize": 707381791, + "layers": [] + } + }, + "distro": { + "name": "debian", + "version": "10", + "idLike": "" + }, + "descriptor": { + "name": "grype", + "version": "0.28.0", + "configuration": { + "configPath": "", + "output": "json", + "file": "", + "output-template-file": "", + "quiet": false, + "check-for-app-update": true, + "only-fixed": false, + "scope": "Squashed", + "log": { + "structured": false, + "level": "", + "file": "" + }, + "db": { + "cache-dir": "/home/user/.cache/grype/db", + "update-url": "https://toolbox-data.anchore.io/grype/databases/listing.json", + "ca-cert": "", + "auto-update": true, + "validate-by-hash-on-start": false + }, + "dev": { + "profile-cpu": false, + "profile-mem": false + }, + "fail-on-severity": "", + "registry": { + "insecure-skip-tls-verify": false, + "insecure-use-http": false, + "auth": [] + }, + "ignore": null, + "exclude": [] + }, + "db": { + "built": "2021-12-24T08:14:02Z", + "schemaVersion": 3, + "location": "/home/user/.cache/grype/db/3", + "checksum": "sha256:6c4777e1acea787e5335ccee6b5e4562cd1767b9cca138c07e0802efb2a74162", + "error": null + } + } + } + diff --git a/unittests/scans/anchore_grype/check_all_fields_no_ids_fabricated.json b/unittests/scans/anchore_grype/check_all_fields_no_ids_fabricated.json new file mode 100644 index 00000000000..d4371d3a478 --- /dev/null +++ b/unittests/scans/anchore_grype/check_all_fields_no_ids_fabricated.json @@ -0,0 +1,122 @@ +{ + "matches": [ + { + "vulnerability": { + "id": "GHSA-v6rh-hp5x-86rv", + "dataSource": "https://github.com/advisories/GHSA-v6rh-hp5x-86rv", + "namespace": "github:python", + "severity": "High", + "urls": [ + "https://github.com/advisories/GHSA-v6rh-hp5x-86rv" + ], + "description": "Potential bypass of an upstream access control based on URL paths in Django", + "cvss": [], + "fix": { + "versions": [ + "3.2.10" + ], + "state": "fixed" + }, + "advisories": [] + }, + "relatedVulnerabilities": [], + "matchDetails": [ + { + "matcher": "python-matcher", + "searchedBy": { + "language": "python", + "namespace": "github:python" + }, + "found": { + "versionConstraint": ">=3.2,<3.2.10 (python)" + } + } + ], + "artifact": { + "name": "Django", + "version": "3.2.9", + "type": "python", + "locations": [ + { + "path": "/usr/local/lib/python3.8/site-packages/Django-3.2.9.dist-info/METADATA", + "layerID": "sha256:b1d4455cf82b15a50b006fe87bd29f694c8f9155456253eb67fdd155b5edcf4a" + } + ], + "language": "python", + "licenses": [ + "BSD-3-Clause" + ], + "cpes": [ + "cpe:2.3:a:django_software_foundation:Django:3.2.9:*:*:*:*:*:*:*" + ], + "purl": "pkg:pypi/Django@3.2.9", + "metadata": null + } + } + ], + "source": { + "type": "image", + "target": { + "userInput": "vulnerable-image:latest", + "imageID": "sha256:ce9898fd214aef9c994a42624b09056bdce3ff4a8e3f68dc242d967b80fcbeee", + "manifestDigest": "sha256:9d8825ab20ac86b40eb71495bece1608a302fb180384740697a28c2b0a5a0fc6", + "mediaType": "application/vnd.docker.distribution.manifest.v2+json", + "tags": [ + "vulnerable-image:latest" + ], + "imageSize": 707381791, + "layers": [] + } + }, + "distro": { + "name": "debian", + "version": "10", + "idLike": "" + }, + "descriptor": { + "name": "grype", + "version": "0.28.0", + "configuration": { + "configPath": "", + "output": "json", + "file": "", + "output-template-file": "", + "quiet": false, + "check-for-app-update": true, + "only-fixed": false, + "scope": "Squashed", + "log": { + 
"structured": false, + "level": "", + "file": "" + }, + "db": { + "cache-dir": "/home/user/.cache/grype/db", + "update-url": "https://toolbox-data.anchore.io/grype/databases/listing.json", + "ca-cert": "", + "auto-update": true, + "validate-by-hash-on-start": false + }, + "dev": { + "profile-cpu": false, + "profile-mem": false + }, + "fail-on-severity": "", + "registry": { + "insecure-skip-tls-verify": false, + "insecure-use-http": false, + "auth": [] + }, + "ignore": null, + "exclude": [] + }, + "db": { + "built": "2021-12-24T08:14:02Z", + "schemaVersion": 3, + "location": "/home/user/.cache/grype/db/3", + "checksum": "sha256:6c4777e1acea787e5335ccee6b5e4562cd1767b9cca138c07e0802efb2a74162", + "error": null + } + } + } + diff --git a/unittests/scans/anchore_grype/check_all_fields_with_ids_fabricated.json b/unittests/scans/anchore_grype/check_all_fields_with_ids_fabricated.json new file mode 100644 index 00000000000..028bd2c7b7b --- /dev/null +++ b/unittests/scans/anchore_grype/check_all_fields_with_ids_fabricated.json @@ -0,0 +1,158 @@ +{ + "matches": [ + { + "vulnerability": { + "id": "GHSA-v6rh-hp5x-86rv", + "dataSource": "https://github.com/advisories/GHSA-v6rh-hp5x-86rv", + "namespace": "github:python", + "severity": "High", + "urls": [ + "https://github.com/advisories/GHSA-v6rh-hp5x-86rv" + ], + "description": "Potential bypass of an upstream access control based on URL paths in Django", + "cvss": [], + "fix": { + "versions": [ + "3.2.10" + ], + "state": "fixed" + }, + "advisories": [] + }, + "relatedVulnerabilities": [ + { + "id": "CVE-2021-44420", + "dataSource": "https://nvd.nist.gov/vuln/detail/CVE-2021-44420", + "namespace": "nvd", + "severity": "High", + "urls": [ + "https://docs.djangoproject.com/en/3.2/releases/security/", + "https://www.openwall.com/lists/oss-security/2021/12/07/1", + "https://www.djangoproject.com/weblog/2021/dec/07/security-releases/", + "https://groups.google.com/forum/#!forum/django-announce" + ], + "description": "In Django 2.2 before 2.2.25, 3.1 before 3.1.14, and 3.2 before 3.2.10, HTTP requests for URLs with trailing newlines could bypass upstream access control based on URL paths.", + "cvss": [ + { + "version": "2.0", + "vector": "AV:N/AC:L/Au:N/C:P/I:P/A:P", + "metrics": { + "baseScore": 7.5, + "exploitabilityScore": 10, + "impactScore": 6.4 + }, + "vendorMetadata": {} + }, + { + "version": "3.1", + "vector": "CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:L/I:L/A:L", + "metrics": { + "baseScore": 7.3, + "exploitabilityScore": 3.9, + "impactScore": 3.4 + }, + "vendorMetadata": {} + } + ] + } + ], + "matchDetails": [ + { + "matcher": "python-matcher", + "searchedBy": { + "language": "python", + "namespace": "github:python" + }, + "found": { + "versionConstraint": ">=3.2,<3.2.10 (python)" + } + } + ], + "artifact": { + "name": "Django", + "version": "3.2.9", + "type": "python", + "locations": [ + { + "path": "/usr/local/lib/python3.8/site-packages/Django-3.2.9.dist-info/METADATA", + "layerID": "sha256:b1d4455cf82b15a50b006fe87bd29f694c8f9155456253eb67fdd155b5edcf4a" + } + ], + "language": "python", + "licenses": [ + "BSD-3-Clause" + ], + "cpes": [ + "cpe:2.3:a:django_software_foundation:Django:3.2.9:*:*:*:*:*:*:*" + ], + "purl": "pkg:pypi/Django@3.2.9", + "metadata": null + } + } + ], + "source": { + "type": "image", + "target": { + "userInput": "vulnerable-image:latest", + "imageID": "sha256:ce9898fd214aef9c994a42624b09056bdce3ff4a8e3f68dc242d967b80fcbeee", + "manifestDigest": "sha256:9d8825ab20ac86b40eb71495bece1608a302fb180384740697a28c2b0a5a0fc6", + 
"mediaType": "application/vnd.docker.distribution.manifest.v2+json", + "tags": [ + "vulnerable-image:latest" + ], + "imageSize": 707381791, + "layers": [] + } + }, + "distro": { + "name": "debian", + "version": "10", + "idLike": "" + }, + "descriptor": { + "name": "grype", + "version": "0.28.0", + "configuration": { + "configPath": "", + "output": "json", + "file": "", + "output-template-file": "", + "quiet": false, + "check-for-app-update": true, + "only-fixed": false, + "scope": "Squashed", + "log": { + "structured": false, + "level": "", + "file": "" + }, + "db": { + "cache-dir": "/home/user/.cache/grype/db", + "update-url": "https://toolbox-data.anchore.io/grype/databases/listing.json", + "ca-cert": "", + "auto-update": true, + "validate-by-hash-on-start": false + }, + "dev": { + "profile-cpu": false, + "profile-mem": false + }, + "fail-on-severity": "", + "registry": { + "insecure-skip-tls-verify": false, + "insecure-use-http": false, + "auth": [] + }, + "ignore": null, + "exclude": [] + }, + "db": { + "built": "2021-12-24T08:14:02Z", + "schemaVersion": 3, + "location": "/home/user/.cache/grype/db/3", + "checksum": "sha256:6c4777e1acea787e5335ccee6b5e4562cd1767b9cca138c07e0802efb2a74162", + "error": null + } + } + } + diff --git a/unittests/test_import_reimport.py b/unittests/test_import_reimport.py index 539f879adae..ed00e015189 100644 --- a/unittests/test_import_reimport.py +++ b/unittests/test_import_reimport.py @@ -102,6 +102,9 @@ def __init__(self, *args, **kwargs): self.anchore_grype_file_name = get_unit_tests_scans_path("anchore_grype") / "check_all_fields.json" self.anchore_grype_file_name_fix_not_available = get_unit_tests_scans_path("anchore_grype") / "fix_not_available.json" self.anchore_grype_file_name_fix_available = get_unit_tests_scans_path("anchore_grype") / "fix_available.json" + self.anchore_grype_file_name_with_ids_fabricated = get_unit_tests_scans_path("anchore_grype") / "check_all_fields_with_ids_fabricated.json" + self.anchore_grype_file_name_no_ids_fabricated = get_unit_tests_scans_path("anchore_grype") / "check_all_fields_no_ids_fabricated.json" + self.anchore_grype_file_name_different_ids_fabricated = get_unit_tests_scans_path("anchore_grype") / "check_all_fields_different_ids_fabricated.json" self.anchore_grype_scan_type = "Anchore Grype" self.checkmarx_one_open_and_false_positive = get_unit_tests_scans_path("checkmarx_one") / "one-open-one-false-positive.json" @@ -1696,6 +1699,78 @@ def test_import_reimport_vulnerability_ids(self): self.assertEqual("GHSA-v6rh-hp5x-86rv", findings[3].vulnerability_ids[0]) self.assertEqual("CVE-2021-44420", findings[3].vulnerability_ids[1]) + def test_import_reimport_clear_vulnerability_ids(self): + """Test that vulnerability IDs are cleared when reimporting with no IDs""" + # Import scan with vulnerability IDs + import0 = self.import_scan_with_params(self.anchore_grype_file_name_with_ids_fabricated, scan_type=self.anchore_grype_scan_type) + + test_id = import0["test"] + test = Test.objects.get(id=test_id) + findings = Finding.objects.filter(test=test) + self.assertEqual(1, len(findings)) + finding = findings[0] + self.assertEqual("GHSA-v6rh-hp5x-86rv", finding.cve) + self.assertEqual(2, len(finding.vulnerability_ids)) + self.assertEqual("GHSA-v6rh-hp5x-86rv", finding.vulnerability_ids[0]) + self.assertEqual("CVE-2021-44420", finding.vulnerability_ids[1]) + + # Reimport with no vulnerability IDs - should clear existing IDs + test_type = Test_Type.objects.get(name=self.anchore_grype_scan_type) + reimport_test = Test( + 
engagement=test.engagement, + test_type=test_type, + scan_type=self.anchore_grype_scan_type, + target_start=datetime.now(timezone.get_current_timezone()), + target_end=datetime.now(timezone.get_current_timezone()), + ) + reimport_test.save() + + self.reimport_scan_with_params(reimport_test.id, self.anchore_grype_file_name_no_ids_fabricated, scan_type=self.anchore_grype_scan_type) + findings = Finding.objects.filter(test=reimport_test) + self.assertEqual(1, len(findings)) + finding = findings[0] + # After clearing, only the primary vuln_id should remain (GHSA-v6rh-hp5x-86rv) + # because the parser always includes the primary vulnerability.id + self.assertEqual("GHSA-v6rh-hp5x-86rv", finding.cve) + self.assertEqual(1, len(finding.vulnerability_ids)) + self.assertEqual("GHSA-v6rh-hp5x-86rv", finding.vulnerability_ids[0]) + + def test_import_reimport_change_vulnerability_ids(self): + """Test that vulnerability IDs are updated when reimporting with different IDs""" + # Import scan with initial vulnerability IDs + import0 = self.import_scan_with_params(self.anchore_grype_file_name_with_ids_fabricated, scan_type=self.anchore_grype_scan_type) + + test_id = import0["test"] + test = Test.objects.get(id=test_id) + findings = Finding.objects.filter(test=test) + self.assertEqual(1, len(findings)) + finding = findings[0] + self.assertEqual("GHSA-v6rh-hp5x-86rv", finding.cve) + self.assertEqual(2, len(finding.vulnerability_ids)) + self.assertEqual("GHSA-v6rh-hp5x-86rv", finding.vulnerability_ids[0]) + self.assertEqual("CVE-2021-44420", finding.vulnerability_ids[1]) + + # Reimport with different vulnerability IDs - should update existing IDs + test_type = Test_Type.objects.get(name=self.anchore_grype_scan_type) + reimport_test = Test( + engagement=test.engagement, + test_type=test_type, + scan_type=self.anchore_grype_scan_type, + target_start=datetime.now(timezone.get_current_timezone()), + target_end=datetime.now(timezone.get_current_timezone()), + ) + reimport_test.save() + + self.reimport_scan_with_params(reimport_test.id, self.anchore_grype_file_name_different_ids_fabricated, scan_type=self.anchore_grype_scan_type) + findings = Finding.objects.filter(test=reimport_test) + self.assertEqual(1, len(findings)) + finding = findings[0] + self.assertEqual("GHSA-v6rh-hp5x-86rv", finding.cve) + self.assertEqual(3, len(finding.vulnerability_ids)) + self.assertEqual("GHSA-v6rh-hp5x-86rv", finding.vulnerability_ids[0]) + self.assertEqual("CVE-2021-1234", finding.vulnerability_ids[1]) + self.assertEqual("CVE-2021-5678", finding.vulnerability_ids[2]) + def test_import_reimport_fix_available(self): import0 = self.import_scan_with_params(self.anchore_grype_file_name_fix_not_available, scan_type=self.anchore_grype_scan_type) test_id = import0["test"] diff --git a/unittests/test_importers_importer.py b/unittests/test_importers_importer.py index cc5fb342df7..e2c8c8fc757 100644 --- a/unittests/test_importers_importer.py +++ b/unittests/test_importers_importer.py @@ -7,7 +7,16 @@ from rest_framework.test import APIClient from dojo.importers.default_importer import DefaultImporter -from dojo.models import Development_Environment, Engagement, Finding, Product, Product_Type, Test, User +from dojo.models import ( + Development_Environment, + Engagement, + Finding, + Product, + Product_Type, + Test, + User, + Vulnerability_Id, +) from dojo.tools.gitlab_sast.parser import GitlabSastParser from dojo.tools.sarif.parser import SarifParser from dojo.utils import get_object_or_none @@ -614,3 +623,77 @@ def 
test_no_handle_vulnerability_ids_references_and_no_cve(self, mock): self.assertEqual(finding.unsaved_vulnerability_ids, None) self.assertEqual(finding.vulnerability_ids, []) finding.delete() + + def test_clear_vulnerability_ids_on_empty_list(self): + """Test that vulnerability IDs are cleared when an empty list is provided""" + # Create a finding with existing vulnerability IDs + finding = Finding() + finding.test = self.test + finding.reporter = self.testuser + finding.save() + + # Add some vulnerability IDs + Vulnerability_Id.objects.create(finding=finding, vulnerability_id="CVE-2020-1234") + Vulnerability_Id.objects.create(finding=finding, vulnerability_id="CVE-2020-5678") + finding.cve = "CVE-2020-1234" + finding.save() + + # Verify initial state + self.assertEqual(2, len(finding.vulnerability_ids)) + self.assertEqual("CVE-2020-1234", finding.cve) + + # Process with empty list - should clear all IDs + finding.unsaved_vulnerability_ids = [] + DefaultImporter(**self.importer_data).process_vulnerability_ids(finding) + # Save the finding to persist the cve=None change + finding.save() + + # Get fresh finding from database to avoid cached property issues + finding = Finding.objects.get(pk=finding.pk) + + # Verify IDs are cleared + self.assertEqual(0, len(finding.vulnerability_ids)) + self.assertEqual(None, finding.cve) + # Verify no Vulnerability_Id objects exist for this finding + self.assertEqual(0, Vulnerability_Id.objects.filter(finding=finding).count()) + finding.delete() + + def test_change_vulnerability_ids_on_reimport(self): + """Test that vulnerability IDs are updated when different IDs are provided""" + # Create a finding with existing vulnerability IDs + finding = Finding() + finding.test = self.test + finding.reporter = self.testuser + finding.save() + + # Add initial vulnerability IDs + Vulnerability_Id.objects.create(finding=finding, vulnerability_id="CVE-2020-1234") + Vulnerability_Id.objects.create(finding=finding, vulnerability_id="CVE-2020-5678") + finding.cve = "CVE-2020-1234" + finding.save() + + # Verify initial state + self.assertEqual(2, len(finding.vulnerability_ids)) + self.assertEqual("CVE-2020-1234", finding.vulnerability_ids[0]) + self.assertEqual("CVE-2020-5678", finding.vulnerability_ids[1]) + self.assertEqual("CVE-2020-1234", finding.cve) + + # Process with different IDs - should replace old IDs + new_vulnerability_ids = ["CVE-2021-9999", "GHSA-xxxx-yyyy"] + finding.unsaved_vulnerability_ids = new_vulnerability_ids + DefaultImporter(**self.importer_data).process_vulnerability_ids(finding) + # Save the finding to persist the cve change + finding.save() + + # Get fresh finding from database to avoid cached property issues + finding = Finding.objects.get(pk=finding.pk) + + # Verify old IDs are removed and new IDs are present + self.assertEqual(2, len(finding.vulnerability_ids)) + self.assertEqual("CVE-2021-9999", finding.vulnerability_ids[0]) + self.assertEqual("GHSA-xxxx-yyyy", finding.vulnerability_ids[1]) + self.assertEqual("CVE-2021-9999", finding.cve) + # Verify only new Vulnerability_Id objects exist + vuln_ids = list(Vulnerability_Id.objects.filter(finding=finding).values_list("vulnerability_id", flat=True)) + self.assertEqual(set(new_vulnerability_ids), set(vuln_ids)) + finding.delete() From 382fd216a3008f439ab4348f610311214a5d76d0 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 14 Dec 2025 01:27:41 +0100 Subject: [PATCH 09/26] vulnerability_id processing: separate import/reimport --- dojo/importers/base_importer.py | 38 
++++-------------------- dojo/importers/default_importer.py | 2 +- dojo/importers/default_reimporter.py | 39 +++++++++++++++++++++++-- unittests/test_importers_importer.py | 25 +++++++--------- unittests/test_importers_performance.py | 12 ++++---- 5 files changed, 60 insertions(+), 56 deletions(-) diff --git a/dojo/importers/base_importer.py b/dojo/importers/base_importer.py index 6933a2a28e4..b295028ef27 100644 --- a/dojo/importers/base_importer.py +++ b/dojo/importers/base_importer.py @@ -786,51 +786,23 @@ def process_cve( return finding - def process_vulnerability_ids( + def store_vulnerability_ids( self, finding: Finding, ) -> Finding: """ - Parse the `unsaved_vulnerability_ids` field from findings after they are parsed - to create `Vulnerability_Id` objects with the finding associated correctly. - Only updates if vulnerability_ids have changed to avoid unnecessary database operations. - - Note: The finding parameter can be: - - A new finding (from import) with unsaved_vulnerability_ids set from the parsed report - - An existing finding (from reimport) with unsaved_vulnerability_ids copied from the - finding_from_report before calling this method + Store vulnerability IDs for a finding. + Reads from finding.unsaved_vulnerability_ids and saves them overwriting existing ones. Args: - finding: The finding to process vulnerability IDs for. - For reimports, this is the existing finding with unsaved_vulnerability_ids - set from the import/reimport report. + finding: The finding to store vulnerability IDs for Returns: The finding object """ - # Normalize to empty list if None - always process vulnerability IDs vulnerability_ids_to_process = finding.unsaved_vulnerability_ids or [] - - # For existing findings, check if there are changes before updating - if finding.pk and len(finding.vulnerability_id_set.all()) > 0: - # Get existing vulnerability IDs from the database - existing_vuln_ids = set(finding.vulnerability_ids) if finding.vulnerability_ids else set() - # Normalize the new vulnerability IDs (remove duplicates for comparison) - new_vuln_ids = set(vulnerability_ids_to_process) - - # Only update if vulnerability IDs have changed - if existing_vuln_ids == new_vuln_ids: - logger.debug( - f"Skipping vulnerability_ids update for finding {finding.id} - " - f"vulnerability_ids unchanged: {sorted(existing_vuln_ids)}", - ) - return finding - - # Use the helper function which handles deletion and creation - # Always use delete_existing=True - it's a no-op if there are no existing IDs - finding_helper.save_vulnerability_ids(finding, vulnerability_ids_to_process, delete_existing=True) - + finding_helper.save_vulnerability_ids(finding, vulnerability_ids_to_process, delete_existing=False) return finding def process_files( diff --git a/dojo/importers/default_importer.py b/dojo/importers/default_importer.py index 63f41b8f744..e1e3f6f4556 100644 --- a/dojo/importers/default_importer.py +++ b/dojo/importers/default_importer.py @@ -234,7 +234,7 @@ def process_findings( # Process any files self.process_files(finding) # Process vulnerability IDs - finding = self.process_vulnerability_ids(finding) + finding = self.store_vulnerability_ids(finding) # Categorize this finding as a new one new_findings.append(finding) # all data is already saved on the finding, we only need to trigger post processing in batches diff --git a/dojo/importers/default_reimporter.py b/dojo/importers/default_reimporter.py index 1b8747f7083..80aad31bab0 100644 --- a/dojo/importers/default_reimporter.py +++ 
b/dojo/importers/default_reimporter.py @@ -847,6 +847,41 @@ def process_finding_that_was_not_matched( self.process_request_response_pairs(unsaved_finding) return unsaved_finding + def reconcile_vulnerability_ids( + self, + finding: Finding, + ) -> Finding: + """ + Reconcile vulnerability IDs for an existing finding. + Checks if IDs have changed before updating to avoid unnecessary database operations. + Uses prefetched data if available, otherwise fetches efficiently. + + Args: + finding: The existing finding to reconcile vulnerability IDs for. + Must have unsaved_vulnerability_ids set. + + Returns: + The finding object + + """ + vulnerability_ids_to_process = finding.unsaved_vulnerability_ids or [] + + # Use prefetched data directly without triggering queries + existing_vuln_ids = {v.vulnerability_id for v in finding.vulnerability_id_set.all()} + new_vuln_ids = set(vulnerability_ids_to_process) + + # Early exit if unchanged + if existing_vuln_ids == new_vuln_ids: + logger.debug( + f"Skipping vulnerability_ids update for finding {finding.id} - " + f"vulnerability_ids unchanged: {sorted(existing_vuln_ids)}", + ) + return finding + + # Update if changed + finding_helper.save_vulnerability_ids(finding, vulnerability_ids_to_process, delete_existing=True) + return finding + def finding_post_processing( self, finding: Finding, @@ -872,13 +907,13 @@ def finding_post_processing( self.process_files(finding) # Process vulnerability IDs # Copy unsaved_vulnerability_ids from the report finding to the existing finding - # so process_vulnerability_ids can process them (see its docstring for details) + # so reconcile_vulnerability_ids can process them # Always set it (even if empty list) so we can clear existing IDs when report has none finding.unsaved_vulnerability_ids = finding_from_report.unsaved_vulnerability_ids or [] # Store the current cve value to check if it changes old_cve = finding.cve # legacy cve field has already been processed/set earlier - finding = self.process_vulnerability_ids(finding) + finding = self.reconcile_vulnerability_ids(finding) # Save the finding only if the cve field was changed by save_vulnerability_ids # This is temporary as the cve field will be phased out if finding.cve != old_cve: diff --git a/unittests/test_importers_importer.py b/unittests/test_importers_importer.py index e2c8c8fc757..6e526ec0d92 100644 --- a/unittests/test_importers_importer.py +++ b/unittests/test_importers_importer.py @@ -7,6 +7,7 @@ from rest_framework.test import APIClient from dojo.importers.default_importer import DefaultImporter +from dojo.importers.default_reimporter import DefaultReImporter from dojo.models import ( Development_Environment, Engagement, @@ -562,8 +563,7 @@ def create_default_data(self): "scan_type": NPM_AUDIT_SCAN_TYPE, } - @patch("dojo.importers.base_importer.Vulnerability_Id", autospec=True) - def test_handle_vulnerability_ids_references_and_cve(self, mock): + def test_handle_vulnerability_ids_references_and_cve(self): # Why doesn't this test use the test db and query for one? 
vulnerability_ids = ["CVE", "REF-1", "REF-2"] finding = Finding() @@ -571,7 +571,7 @@ def test_handle_vulnerability_ids_references_and_cve(self, mock): finding.test = self.test finding.reporter = self.testuser finding.save() - DefaultImporter(**self.importer_data).process_vulnerability_ids(finding) + DefaultImporter(**self.importer_data).store_vulnerability_ids(finding) self.assertEqual("CVE", finding.vulnerability_ids[0]) self.assertEqual("CVE", finding.cve) @@ -580,8 +580,7 @@ def test_handle_vulnerability_ids_references_and_cve(self, mock): self.assertEqual("REF-2", finding.vulnerability_ids[2]) finding.delete() - @patch("dojo.importers.base_importer.Vulnerability_Id", autospec=True) - def test_handle_no_vulnerability_ids_references_and_cve(self, mock): + def test_handle_no_vulnerability_ids_references_and_cve(self): vulnerability_ids = ["CVE"] finding = Finding() finding.test = self.test @@ -589,22 +588,21 @@ def test_handle_no_vulnerability_ids_references_and_cve(self, mock): finding.save() finding.unsaved_vulnerability_ids = vulnerability_ids - DefaultImporter(**self.importer_data).process_vulnerability_ids(finding) + DefaultImporter(**self.importer_data).store_vulnerability_ids(finding) self.assertEqual("CVE", finding.vulnerability_ids[0]) self.assertEqual("CVE", finding.cve) self.assertEqual(vulnerability_ids, finding.unsaved_vulnerability_ids) finding.delete() - @patch("dojo.importers.base_importer.Vulnerability_Id", autospec=True) - def test_handle_vulnerability_ids_references_and_no_cve(self, mock): + def test_handle_vulnerability_ids_references_and_no_cve(self): vulnerability_ids = ["REF-1", "REF-2"] finding = Finding() finding.test = self.test finding.reporter = self.testuser finding.save() finding.unsaved_vulnerability_ids = vulnerability_ids - DefaultImporter(**self.importer_data).process_vulnerability_ids(finding) + DefaultImporter(**self.importer_data).store_vulnerability_ids(finding) self.assertEqual("REF-1", finding.vulnerability_ids[0]) self.assertEqual("REF-1", finding.cve) @@ -612,13 +610,12 @@ def test_handle_vulnerability_ids_references_and_no_cve(self, mock): self.assertEqual("REF-2", finding.vulnerability_ids[1]) finding.delete() - @patch("dojo.importers.base_importer.Vulnerability_Id", autospec=True) - def test_no_handle_vulnerability_ids_references_and_no_cve(self, mock): + def test_no_handle_vulnerability_ids_references_and_no_cve(self): finding = Finding() finding.test = self.test finding.reporter = self.testuser finding.save() - DefaultImporter(**self.importer_data).process_vulnerability_ids(finding) + DefaultImporter(**self.importer_data).store_vulnerability_ids(finding) self.assertEqual(finding.cve, None) self.assertEqual(finding.unsaved_vulnerability_ids, None) self.assertEqual(finding.vulnerability_ids, []) @@ -644,7 +641,7 @@ def test_clear_vulnerability_ids_on_empty_list(self): # Process with empty list - should clear all IDs finding.unsaved_vulnerability_ids = [] - DefaultImporter(**self.importer_data).process_vulnerability_ids(finding) + DefaultReImporter(test=self.test, environment=self.importer_data["environment"], scan_type=self.importer_data["scan_type"]).reconcile_vulnerability_ids(finding) # Save the finding to persist the cve=None change finding.save() @@ -681,7 +678,7 @@ def test_change_vulnerability_ids_on_reimport(self): # Process with different IDs - should replace old IDs new_vulnerability_ids = ["CVE-2021-9999", "GHSA-xxxx-yyyy"] finding.unsaved_vulnerability_ids = new_vulnerability_ids - 
DefaultImporter(**self.importer_data).process_vulnerability_ids(finding) + DefaultReImporter(test=self.test, environment=self.importer_data["environment"], scan_type=self.importer_data["scan_type"]).reconcile_vulnerability_ids(finding) # Save the finding to persist the cve change finding.save() diff --git a/unittests/test_importers_performance.py b/unittests/test_importers_performance.py index c422140a433..1667ac7c04d 100644 --- a/unittests/test_importers_performance.py +++ b/unittests/test_importers_performance.py @@ -244,7 +244,7 @@ def test_import_reimport_reimport_performance_async(self): self._import_reimport_performance( expected_num_queries1=340, expected_num_async_tasks1=7, - expected_num_queries2=275, + expected_num_queries2=276, expected_num_async_tasks2=18, expected_num_queries3=163, expected_num_async_tasks3=17, @@ -262,7 +262,7 @@ def test_import_reimport_reimport_performance_pghistory_async(self): self._import_reimport_performance( expected_num_queries1=306, expected_num_async_tasks1=7, - expected_num_queries2=268, + expected_num_queries2=269, expected_num_async_tasks2=18, expected_num_queries3=158, expected_num_async_tasks3=17, @@ -286,7 +286,7 @@ def test_import_reimport_reimport_performance_no_async(self): self._import_reimport_performance( expected_num_queries1=346, expected_num_async_tasks1=6, - expected_num_queries2=281, + expected_num_queries2=282, expected_num_async_tasks2=17, expected_num_queries3=169, expected_num_async_tasks3=16, @@ -308,7 +308,7 @@ def test_import_reimport_reimport_performance_pghistory_no_async(self): self._import_reimport_performance( expected_num_queries1=312, expected_num_async_tasks1=6, - expected_num_queries2=274, + expected_num_queries2=275, expected_num_async_tasks2=17, expected_num_queries3=164, expected_num_async_tasks3=16, @@ -334,7 +334,7 @@ def test_import_reimport_reimport_performance_no_async_with_product_grading(self self._import_reimport_performance( expected_num_queries1=348, expected_num_async_tasks1=8, - expected_num_queries2=283, + expected_num_queries2=284, expected_num_async_tasks2=19, expected_num_queries3=171, expected_num_async_tasks3=18, @@ -357,7 +357,7 @@ def test_import_reimport_reimport_performance_pghistory_no_async_with_product_gr self._import_reimport_performance( expected_num_queries1=314, expected_num_async_tasks1=8, - expected_num_queries2=276, + expected_num_queries2=277, expected_num_async_tasks2=19, expected_num_queries3=166, expected_num_async_tasks3=18, From 406b69869239954162452e9de58c918f8a0f5f8e Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sat, 13 Dec 2025 19:38:54 +0100 Subject: [PATCH 10/26] reimport: add management command to reimport sample scans --- .../commands/reimport_unittest_scan.py | 228 ++++++++++++++++++ 1 file changed, 228 insertions(+) create mode 100644 dojo/management/commands/reimport_unittest_scan.py diff --git a/dojo/management/commands/reimport_unittest_scan.py b/dojo/management/commands/reimport_unittest_scan.py new file mode 100644 index 00000000000..8480962b3bb --- /dev/null +++ b/dojo/management/commands/reimport_unittest_scan.py @@ -0,0 +1,228 @@ +import json +import logging +import time +from importlib import import_module +from importlib.util import find_spec +from inspect import isclass +from pathlib import Path + +from django.core.management.base import BaseCommand, CommandError +from django.urls import reverse +from rest_framework.authtoken.models import Token +from rest_framework.test import APIClient + +from unittests.test_dashboard import User + +logger = 
logging.getLogger(__name__) + + +class Command(BaseCommand): + + help = ( + "Reimport a specific unittest scan by filename into an existing test. " + "Automatically deduces scan type from path." + ) + + def add_arguments(self, parser): + parser.add_argument( + "test_id", + type=int, + help="ID of the test to reimport into", + ) + parser.add_argument( + "scan_file", + type=str, + help="Path to scan file relative to unittests/scans/ (e.g., 'jfrog_xray_unified/very_many_vulns.json')", + ) + parser.add_argument( + "--minimum-severity", + type=str, + default="Low", + choices=["Critical", "High", "Medium", "Low", "Info"], + help="Minimum severity to import (default: Low)", + ) + parser.add_argument( + "--active", + action="store_true", + default=True, + help="Mark findings as active (default: True)", + ) + parser.add_argument( + "--verified", + action="store_true", + default=False, + help="Mark findings as verified (default: False)", + ) + parser.add_argument( + "--close-old-findings", + action="store_true", + default=False, + help="Close findings not present in the new scan (default: False)", + ) + parser.add_argument( + "--tags", + action="append", + default=[], + help=( + "Tag(s) to apply to the test (repeat --tags to add multiple). " + "Example: --tags perf --tags jfrog" + ), + ) + + def get_test_admin(self): + return User.objects.get(username="admin") + + def reimport_scan(self, payload, expected_http_status_code=201): + testuser = self.get_test_admin() + token = Token.objects.get(user=testuser) + client = APIClient() + client.credentials(HTTP_AUTHORIZATION="Token " + token.key) + + response = client.post(reverse("reimportscan-list"), payload) + if expected_http_status_code != response.status_code: + msg = f"Expected HTTP status code {expected_http_status_code}, got {response.status_code}: {response.content[:1000]}" + raise CommandError(msg) + return json.loads(response.content) + + def deduce_scan_type_from_path(self, scan_file_path): + """ + Deduce the scan type from the file path by finding the corresponding parser. 
+ + Args: + scan_file_path: Path like 'zap/zap_sample.json' or 'jfrog_xray_unified/very_many_vulns.json' + + Returns: + tuple: (scan_type, parser_class) or raises CommandError if not found + + """ + # Extract the directory name (parser module name) + path_parts = Path(scan_file_path).parts + if len(path_parts) < 2: + msg = f"Scan file path must include directory: {scan_file_path}" + raise CommandError(msg) + + module_name = path_parts[0] + + # Try to find and load the parser module + try: + if not find_spec(f"dojo.tools.{module_name}.parser"): + msg = f"No parser module found for '{module_name}'" + raise CommandError(msg) + + module = import_module(f"dojo.tools.{module_name}.parser") + + # Find the parser class + parser_class = None + expected_class_name = module_name.replace("_", "") + "parser" + + for attribute_name in dir(module): + attribute = getattr(module, attribute_name) + if isclass(attribute) and attribute_name.lower() == expected_class_name: + parser_class = attribute + break + + if not parser_class: + msg = f"No parser class found in module '{module_name}'" + raise CommandError(msg) + + # Get the scan type from the parser + parser_instance = parser_class() + scan_types = parser_instance.get_scan_types() + + if not scan_types: + msg = f"Parser '{module_name}' has no scan types" + raise CommandError(msg) + + return scan_types[0], parser_class + + except ImportError as e: + msg = f"Failed to import parser module '{module_name}': {e}" + raise CommandError(msg) + + def reimport_unittest_scan(self, test_id, scan_file, minimum_severity, active, verified, close_old_findings, tags): + """ + Reimport a specific unittest scan file into an existing test. + + Args: + test_id: ID of the test to reimport into + scan_file: Path to scan file relative to unittests/scans/ + minimum_severity: Minimum severity level + active: Whether findings should be active + verified: Whether findings should be verified + close_old_findings: Whether to close findings not in the new scan + tags: List of tags to apply + + """ + # Validate scan file exists + scan_path = Path("unittests/scans") / scan_file + if not scan_path.exists(): + msg = f"Scan file not found: {scan_path}" + raise CommandError(msg) + + # Deduce scan type from path + scan_type, _parser_class = self.deduce_scan_type_from_path(scan_file) + + logger.info(f"Reimporting scan '{scan_file}' using scan type '{scan_type}'") + logger.info(f"Target: Test ID {test_id}") + + # Reimport the scan + with scan_path.open(encoding="utf-8") as testfile: + payload = { + "test": test_id, + "minimum_severity": minimum_severity, + "scan_type": scan_type, + "file": testfile, + "version": "1.0.1", + "active": active, + "verified": verified, + "close_old_findings": close_old_findings, + } + + if tags: + payload["tags"] = tags + + result = self.reimport_scan(payload) + + logger.info(f"Successfully reimported scan. 
Test ID: {result.get('test')}") + logger.info(f"Reimport summary: {result.get('scan_save_message', 'No summary available')}") + + return result + + def handle(self, *args, **options): + test_id = options["test_id"] + scan_file = options["scan_file"] + minimum_severity = options["minimum_severity"] + active = options["active"] + verified = options["verified"] + close_old_findings = options["close_old_findings"] + tags = options["tags"] + + start_time = time.time() + + try: + self.reimport_unittest_scan( + test_id=test_id, + scan_file=scan_file, + minimum_severity=minimum_severity, + active=active, + verified=verified, + close_old_findings=close_old_findings, + tags=tags, + ) + + end_time = time.time() + duration = end_time - start_time + + self.stdout.write( + self.style.SUCCESS( + f"Successfully reimported '{scan_file}' into test ID {test_id} " + f"(took {duration:.2f} seconds)", + ), + ) + + except Exception as e: + end_time = time.time() + duration = end_time - start_time + logger.exception(f"Failed to reimport scan '{scan_file}' after {duration:.2f} seconds") + msg = f"Reimport failed after {duration:.2f} seconds: {e}" + raise CommandError(msg) From 9cef33add90bbff155d495167917b00817d5e853 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 14 Dec 2025 02:10:54 +0100 Subject: [PATCH 11/26] reimport: prefetch endpoint status --- dojo/finding/deduplication.py | 12 +++++++++++- unittests/test_importers_performance.py | 24 ++++++++++++------------ 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/dojo/finding/deduplication.py b/dojo/finding/deduplication.py index b397add4efd..10c5bf9e10f 100644 --- a/dojo/finding/deduplication.py +++ b/dojo/finding/deduplication.py @@ -252,10 +252,20 @@ def build_candidate_scope_queryset(test, mode="deduplication", service=None): ) queryset = Finding.objects.filter(scope_q) + # Base prefetches for both modes + prefetch_list = ["endpoints", "vulnerability_id_set"] + + # Additional prefetches for reimport mode + if mode == "reimport": + prefetch_list.extend([ + "status_finding", + "status_finding__endpoint", + ]) + return ( queryset .select_related("test", "test__engagement", "test__test_type") - .prefetch_related("endpoints", "vulnerability_id_set") + .prefetch_related(*prefetch_list) ) diff --git a/unittests/test_importers_performance.py b/unittests/test_importers_performance.py index 1667ac7c04d..b94cd0d2277 100644 --- a/unittests/test_importers_performance.py +++ b/unittests/test_importers_performance.py @@ -244,9 +244,9 @@ def test_import_reimport_reimport_performance_async(self): self._import_reimport_performance( expected_num_queries1=340, expected_num_async_tasks1=7, - expected_num_queries2=276, + expected_num_queries2=238, expected_num_async_tasks2=18, - expected_num_queries3=163, + expected_num_queries3=120, expected_num_async_tasks3=17, ) @@ -262,9 +262,9 @@ def test_import_reimport_reimport_performance_pghistory_async(self): self._import_reimport_performance( expected_num_queries1=306, expected_num_async_tasks1=7, - expected_num_queries2=269, + expected_num_queries2=231, expected_num_async_tasks2=18, - expected_num_queries3=158, + expected_num_queries3=115, expected_num_async_tasks3=17, ) @@ -286,9 +286,9 @@ def test_import_reimport_reimport_performance_no_async(self): self._import_reimport_performance( expected_num_queries1=346, expected_num_async_tasks1=6, - expected_num_queries2=282, + expected_num_queries2=244, expected_num_async_tasks2=17, - expected_num_queries3=169, + expected_num_queries3=126, 
expected_num_async_tasks3=16, ) @@ -308,9 +308,9 @@ def test_import_reimport_reimport_performance_pghistory_no_async(self): self._import_reimport_performance( expected_num_queries1=312, expected_num_async_tasks1=6, - expected_num_queries2=275, + expected_num_queries2=237, expected_num_async_tasks2=17, - expected_num_queries3=164, + expected_num_queries3=121, expected_num_async_tasks3=16, ) @@ -334,9 +334,9 @@ def test_import_reimport_reimport_performance_no_async_with_product_grading(self self._import_reimport_performance( expected_num_queries1=348, expected_num_async_tasks1=8, - expected_num_queries2=284, + expected_num_queries2=246, expected_num_async_tasks2=19, - expected_num_queries3=171, + expected_num_queries3=128, expected_num_async_tasks3=18, ) @@ -357,9 +357,9 @@ def test_import_reimport_reimport_performance_pghistory_no_async_with_product_gr self._import_reimport_performance( expected_num_queries1=314, expected_num_async_tasks1=8, - expected_num_queries2=277, + expected_num_queries2=239, expected_num_async_tasks2=19, - expected_num_queries3=166, + expected_num_queries3=123, expected_num_async_tasks3=18, ) From 9464ce46b385d01e6a76a7fb635d151da21b7c73 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 14 Dec 2025 10:40:15 +0100 Subject: [PATCH 12/26] reimport: bullk update endpoint statuses --- dojo/importers/endpoint_manager.py | 43 +++++++++++++++++-------- unittests/test_importers_performance.py | 12 +++---- 2 files changed, 36 insertions(+), 19 deletions(-) diff --git a/dojo/importers/endpoint_manager.py b/dojo/importers/endpoint_manager.py index ccfff345c40..f4b277d49fa 100644 --- a/dojo/importers/endpoint_manager.py +++ b/dojo/importers/endpoint_manager.py @@ -68,6 +68,7 @@ def mitigate_endpoint_status( ) -> None: """Mitigates all endpoint status objects that are supplied""" now = timezone.now() + to_update = [] for endpoint_status in endpoint_status_list: # Only mitigate endpoints that are actually active if endpoint_status.mitigated is False: @@ -75,7 +76,14 @@ def mitigate_endpoint_status( endpoint_status.last_modified = now endpoint_status.mitigated_by = user endpoint_status.mitigated = True - endpoint_status.save() + to_update.append(endpoint_status) + + if to_update: + Endpoint_Status.objects.bulk_update( + to_update, + ["mitigated", "mitigated_time", "last_modified", "mitigated_by"], + batch_size=1000, + ) @dojo_async_task @app.task() @@ -85,6 +93,8 @@ def reactivate_endpoint_status( **kwargs: dict, ) -> None: """Reactivate all endpoint status objects that are supplied""" + now = timezone.now() + to_update = [] for endpoint_status in endpoint_status_list: # Only reactivate endpoints that are actually mitigated if endpoint_status.mitigated: @@ -92,8 +102,15 @@ def reactivate_endpoint_status( endpoint_status.mitigated_by = None endpoint_status.mitigated_time = None endpoint_status.mitigated = False - endpoint_status.last_modified = timezone.now() - endpoint_status.save() + endpoint_status.last_modified = now + to_update.append(endpoint_status) + + if to_update: + Endpoint_Status.objects.bulk_update( + to_update, + ["mitigated", "mitigated_time", "mitigated_by", "last_modified"], + batch_size=1000, + ) def chunk_endpoints_and_disperse( self, @@ -149,17 +166,17 @@ def update_endpoint_status( # New finding is mitigated, so mitigate all old endpoints endpoint_status_to_mitigate = existing_finding_endpoint_status_list else: + # Convert to set for O(1) lookups instead of O(n) linear search + new_finding_endpoints_set = set(new_finding_endpoints_list) # Mitigate any 
endpoints in the old finding not found in the new finding - endpoint_status_to_mitigate = list( - filter( - lambda existing_finding_endpoint_status: existing_finding_endpoint_status.endpoint not in new_finding_endpoints_list, - existing_finding_endpoint_status_list), - ) + endpoint_status_to_mitigate = [ + eps for eps in existing_finding_endpoint_status_list + if eps.endpoint not in new_finding_endpoints_set + ] # Re-activate any endpoints in the old finding that are in the new finding - endpoint_status_to_reactivate = list( - filter( - lambda existing_finding_endpoint_status: existing_finding_endpoint_status.endpoint in new_finding_endpoints_list, - existing_finding_endpoint_status_list), - ) + endpoint_status_to_reactivate = [ + eps for eps in existing_finding_endpoint_status_list + if eps.endpoint in new_finding_endpoints_set + ] self.chunk_endpoints_and_reactivate(endpoint_status_to_reactivate) self.chunk_endpoints_and_mitigate(endpoint_status_to_mitigate, user) diff --git a/unittests/test_importers_performance.py b/unittests/test_importers_performance.py index b94cd0d2277..5f1af63f1ab 100644 --- a/unittests/test_importers_performance.py +++ b/unittests/test_importers_performance.py @@ -246,7 +246,7 @@ def test_import_reimport_reimport_performance_async(self): expected_num_async_tasks1=7, expected_num_queries2=238, expected_num_async_tasks2=18, - expected_num_queries3=120, + expected_num_queries3=118, expected_num_async_tasks3=17, ) @@ -264,7 +264,7 @@ def test_import_reimport_reimport_performance_pghistory_async(self): expected_num_async_tasks1=7, expected_num_queries2=231, expected_num_async_tasks2=18, - expected_num_queries3=115, + expected_num_queries3=113, expected_num_async_tasks3=17, ) @@ -288,7 +288,7 @@ def test_import_reimport_reimport_performance_no_async(self): expected_num_async_tasks1=6, expected_num_queries2=244, expected_num_async_tasks2=17, - expected_num_queries3=126, + expected_num_queries3=124, expected_num_async_tasks3=16, ) @@ -310,7 +310,7 @@ def test_import_reimport_reimport_performance_pghistory_no_async(self): expected_num_async_tasks1=6, expected_num_queries2=237, expected_num_async_tasks2=17, - expected_num_queries3=121, + expected_num_queries3=119, expected_num_async_tasks3=16, ) @@ -336,7 +336,7 @@ def test_import_reimport_reimport_performance_no_async_with_product_grading(self expected_num_async_tasks1=8, expected_num_queries2=246, expected_num_async_tasks2=19, - expected_num_queries3=128, + expected_num_queries3=126, expected_num_async_tasks3=18, ) @@ -359,7 +359,7 @@ def test_import_reimport_reimport_performance_pghistory_no_async_with_product_gr expected_num_async_tasks1=8, expected_num_queries2=239, expected_num_async_tasks2=19, - expected_num_queries3=123, + expected_num_queries3=121, expected_num_async_tasks3=18, ) From e81d1decb56cfc3424a3840f3a9deb146042d65c Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 14 Dec 2025 11:21:47 +0100 Subject: [PATCH 13/26] add script to update performance test counts easily --- scripts/update_performance_test_counts.py | 324 ++++++++++++++++++++++ unittests/test_importers_performance.py | 20 ++ 2 files changed, 344 insertions(+) create mode 100644 scripts/update_performance_test_counts.py diff --git a/scripts/update_performance_test_counts.py b/scripts/update_performance_test_counts.py new file mode 100644 index 00000000000..f63e040fb3a --- /dev/null +++ b/scripts/update_performance_test_counts.py @@ -0,0 +1,324 @@ +#!/usr/bin/env python3 +""" +Script to update performance test query counts. 
+I tried to implement this as management command, but it would become very slow on parsing the test output. + +This script: +1. Runs all performance tests and captures actual query counts +2. Compares them with expected counts +3. Generates a report and optionally updates the test file +4. Provides a verification run + +How to run: + + # Step 1: Run tests and generate report (uses TestDojoImporterPerformanceSmall by default) + python3 scripts/update_performance_test_counts.py --report-only + + # Or specify a different test class: + python3 scripts/update_performance_test_counts.py --test-class TestDojoImporterPerformanceSmall --report-only + + # Step 2: Review the report, then update the file + python3 scripts/update_performance_test_counts.py --update + + # Step 3: Verify all tests pass + python3 scripts/update_performance_test_counts.py --verify + +The script defaults to TestDojoImporterPerformanceSmall if --test-class is not provided. +""" + +import argparse +import re +import subprocess +import sys +from pathlib import Path + +# Path to the test file +TEST_FILE = Path(__file__).parent.parent / "unittests" / "test_importers_performance.py" + + +class TestCount: + + """Represents a test's expected and actual counts.""" + + def __init__(self, test_name: str, step: str, metric: str): + self.test_name = test_name + self.step = step + self.metric = metric + self.expected = None + self.actual = None + self.difference = None + + def __repr__(self): + return ( + f"TestCount({self.test_name}, {self.step}, {self.metric}, " + f"expected={self.expected}, actual={self.actual})" + ) + + +def run_tests(test_class: str) -> str: + """Run the performance tests and return the output.""" + print(f"Running tests for {test_class}...") + cmd = [ + "./run-unittest.sh", + "--test-case", + f"unittests.test_importers_performance.{test_class}", + ] + result = subprocess.run( + cmd, check=False, capture_output=True, text=True, cwd=Path(__file__).parent.parent, + ) + return result.stdout + result.stderr + + +def parse_test_output(output: str) -> list[TestCount]: + """Parse test output to extract actual vs expected counts.""" + counts = [] + + # Pattern to match AssertionError with query counts + # Format: AssertionError: 118 != 120 : 118 queries executed, 120 expected + assertion_pattern = re.compile( + r"AssertionError: (\d+) != (\d+) : (\d+) queries executed, (\d+) expected", + ) + + # Pattern to match test name and step + # Format: (step='reimport2', metric='queries') + step_pattern = re.compile(r"step='(\w+)', metric='(\w+)'") + + # Pattern to match test name + # Format: FAIL: test_import_reimport_reimport_performance_async + test_pattern = re.compile(r"FAIL: (test_\w+)") + + lines = output.split("\n") + current_test = None + current_step = None + current_metric = None + + for line in lines: + # Match test name + test_match = test_pattern.search(line) + if test_match: + current_test = test_match.group(1) + + # Match step and metric + step_match = step_pattern.search(line) + if step_match: + current_step = step_match.group(1) + current_metric = step_match.group(2) + + # Match assertion error + assertion_match = assertion_pattern.search(line) + if assertion_match and current_test and current_step and current_metric: + actual = int(assertion_match.group(1)) + expected = int(assertion_match.group(2)) + count = TestCount(current_test, current_step, current_metric) + count.actual = actual + count.expected = expected + count.difference = expected - actual + counts.append(count) + + return counts + + +def 
extract_expected_counts_from_file(test_class: str) -> dict[str, dict[str, int]]: + """Extract expected counts from the test file.""" + if not TEST_FILE.exists(): + msg = f"Test file not found: {TEST_FILE}" + raise FileNotFoundError(msg) + + content = TEST_FILE.read_text() + + # Pattern to match test method calls with expected counts + # Format: self._import_reimport_performance( + # expected_num_queries1=340, + # expected_num_async_tasks1=7, + # expected_num_queries2=238, + # expected_num_async_tasks2=18, + # expected_num_queries3=120, + # expected_num_async_tasks3=17, + # ) + # More flexible pattern that handles whitespace variations + pattern = re.compile( + r"def (test_\w+)\([^)]*\):.*?" + r"self\._import_reimport_performance\(\s*" + r"expected_num_queries1\s*=\s*(\d+)\s*,\s*" + r"expected_num_async_tasks1\s*=\s*(\d+)\s*,\s*" + r"expected_num_queries2\s*=\s*(\d+)\s*,\s*" + r"expected_num_async_tasks2\s*=\s*(\d+)\s*,\s*" + r"expected_num_queries3\s*=\s*(\d+)\s*,\s*" + r"expected_num_async_tasks3\s*=\s*(\d+)\s*,", + re.DOTALL, + ) + + expected_counts = {} + for match in pattern.finditer(content): + test_name = match.group(1) + expected_counts[test_name] = { + "import1_queries": int(match.group(2)), + "import1_async_tasks": int(match.group(3)), + "reimport1_queries": int(match.group(4)), + "reimport1_async_tasks": int(match.group(5)), + "reimport2_queries": int(match.group(6)), + "reimport2_async_tasks": int(match.group(7)), + } + + return expected_counts + + +def generate_report(counts: list[TestCount], expected_counts: dict[str, dict[str, int]]): + """Generate a report of differences.""" + if not counts: + print("✅ All tests passed! No count differences found.") + return + + print("\n" + "=" * 80) + print("PERFORMANCE TEST COUNT DIFFERENCES REPORT") + print("=" * 80 + "\n") + + # Group by test name + by_test = {} + for count in counts: + if count.test_name not in by_test: + by_test[count.test_name] = [] + by_test[count.test_name].append(count) + + for test_name, test_counts in sorted(by_test.items()): + print(f"Test: {test_name}") + print("-" * 80) + for count in sorted(test_counts, key=lambda x: (x.step, x.metric)): + print( + f" {count.step:12} {count.metric:15} " + f"Expected: {count.expected:4} → Actual: {count.actual:4} " + f"(Difference: {count.difference:+3})", + ) + print() + + print("=" * 80) + print("\nTo update the test file, run:") + print(f" python scripts/update_performance_test_counts.py --test-class {test_name.split('_')[0]} --update") + print() + + +def update_test_file(counts: list[TestCount]): + """Update the test file with new expected counts.""" + if not counts: + print("No counts to update.") + return + + content = TEST_FILE.read_text() + + # Create a mapping of test_name -> step_metric -> new_value + updates = {} + for count in counts: + if count.test_name not in updates: + updates[count.test_name] = {} + step_metric = f"{count.step}_{count.metric}" + updates[count.test_name][step_metric] = count.actual + + # Update each test method + for test_name, test_updates in updates.items(): + # Find the test method and update expected_num_queries/async_tasks values + # Map step_metric to parameter name + param_map = { + "import1_queries": "expected_num_queries1", + "import1_async_tasks": "expected_num_async_tasks1", + "reimport1_queries": "expected_num_queries2", + "reimport1_async_tasks": "expected_num_async_tasks2", + "reimport2_queries": "expected_num_queries3", + "reimport2_async_tasks": "expected_num_async_tasks3", + } + + for step_metric, new_value in 
test_updates.items(): + param_name = param_map.get(step_metric) + if param_name: + # Pattern to match: expected_num_queries3=120, (with flexible whitespace) + pattern = re.compile( + rf"(def {re.escape(test_name)}\([^)]*\):.*?{re.escape(param_name)}\s*=\s*)(\d+)(\s*,)", + re.DOTALL, + ) + + def replace(match): + return f"{match.group(1)}{new_value}{match.group(3)}" # noqa: B023 + + content = pattern.sub(replace, content) + + # Write back to file + TEST_FILE.write_text(content) + print(f"✅ Updated {TEST_FILE}") + print(f" Updated {len(counts)} count(s) across {len(updates)} test(s)") + + +def verify_tests(test_class: str) -> bool: + """Run tests to verify they all pass.""" + print(f"Verifying tests for {test_class}...") + output = run_tests(test_class) + counts = parse_test_output(output) + + if counts: + print("❌ Some tests still have count mismatches:") + for count in counts: + print(f" {count.test_name} - {count.step} {count.metric}: " + f"expected {count.expected}, got {count.actual}") + return False + else: # noqa: RET505 + print("✅ All tests pass!") + return True + + +def main(): + parser = argparse.ArgumentParser( + description="Update performance test query counts", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + parser.add_argument( + "--test-class", + required=False, + default="TestDojoImporterPerformanceSmall", + help="Test class name (e.g., TestDojoImporterPerformanceSmall). Defaults to TestDojoImporterPerformanceSmall if not provided.", + ) + parser.add_argument( + "--report-only", + action="store_true", + help="Only generate a report, don't update the file", + ) + parser.add_argument( + "--update", + action="store_true", + help="Update the test file with new counts", + ) + parser.add_argument( + "--verify", + action="store_true", + help="Run tests to verify they pass", + ) + + args = parser.parse_args() + + if args.report_only: + # Step 1: Run tests and generate report + output = run_tests(args.test_class) + counts = parse_test_output(output) + expected_counts = extract_expected_counts_from_file(args.test_class) + generate_report(counts, expected_counts) + + elif args.update: + # Step 2: Update the file + output = run_tests(args.test_class) + counts = parse_test_output(output) + if counts: + update_test_file(counts) + print("\nNext step: Run --verify to ensure all tests pass") + else: + print("No differences found. All tests are already up to date.") + + elif args.verify: + # Step 3: Verify + success = verify_tests(args.test_class) + sys.exit(0 if success else 1) + + else: + parser.print_help() + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/unittests/test_importers_performance.py b/unittests/test_importers_performance.py index 5f1af63f1ab..4b94d132bf6 100644 --- a/unittests/test_importers_performance.py +++ b/unittests/test_importers_performance.py @@ -1,3 +1,23 @@ +""" +Performance tests for importers. + +These tests verify that import and reimport operations maintain acceptable query counts +and async task counts to prevent performance regressions. + +Counts can be updated via the Python script at scripts/update_performance_test_counts.py. +However, counts must be verified to ensure no implicit performance regressions are introduced. 
+When counts change, review the differences carefully to determine if they represent: +- Legitimate optimizations (counts decreasing) +- Acceptable changes due to feature additions (counts increasing with justification) +- Unintended performance regressions (counts increasing without clear reason) + +Always verify updated counts by: +1. Running the update script to see the differences +2. Reviewing the changes to understand why counts changed +3. Running the verification step to ensure all tests pass +4. Investigating any unexpected increases in query or task counts +""" + import logging from contextlib import contextmanager From 8a57c38c87615300c616f4115a66bee22a1836b2 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 14 Dec 2025 11:22:45 +0100 Subject: [PATCH 14/26] add script to update performance test counts easily --- ruff.toml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/ruff.toml b/ruff.toml index 670a95b7b99..cfd67b5a501 100644 --- a/ruff.toml +++ b/ruff.toml @@ -84,7 +84,7 @@ select = [ "PGH", "PLC", "PLE", - "PLR01", "PLR02", "PLR04", "PLR0915", "PLR1711", "PLR1704", "PLR1714", "PLR1716", "PLR172", "PLR173", "PLR2044", "PLR5", "PLR6104", "PLR6201", + "PLR01", "PLR02", "PLR04", "PLR0915", "PLR1711", "PLR1704", "PLR1714", "PLR1716", "PLR172", "PLR173", "PLR2044", "PLR5", "PLR6104", "PLR6201", "PLW", "UP", "FURB", @@ -118,6 +118,14 @@ preview = true "dojo/filters.py" = [ "A003", # ruff upgrade to 0.13.3 ] +"scripts/update_performance_test_counts.py" = [ + "S603", # subprocess.run without shell=True is safe for this script + "S604", # subprocess.run without shell=True is safe for this script + "S605", # subprocess.run without shell=True is safe for this script + "S606", # subprocess.run without shell=True is safe for this script + "S607", # subprocess.run without shell=True is safe for this script + "T201", # print statements are fine for console output scripts +] [lint.flake8-boolean-trap] extend-allowed-calls = ["dojo.utils.get_system_setting"] From 3c42df18b1e4f0cdfa6bce5431edcdf3abfb5056 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 14 Dec 2025 11:37:07 +0100 Subject: [PATCH 15/26] 8a57c38c87615300c616f4115a66bee22a1836b2 --- ruff.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/ruff.toml b/ruff.toml index cfd67b5a501..942bd47a00a 100644 --- a/ruff.toml +++ b/ruff.toml @@ -125,6 +125,7 @@ preview = true "S606", # subprocess.run without shell=True is safe for this script "S607", # subprocess.run without shell=True is safe for this script "T201", # print statements are fine for console output scripts + "EXE001", # git won't commit the executable flag I don't know why ] [lint.flake8-boolean-trap] From 943faf8d95cebd89be6f455075f8a026d24f5b5d Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 14 Dec 2025 11:41:17 +0100 Subject: [PATCH 16/26] add upgrade notes --- docs/content/en/open_source/upgrading/2.54.md | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/docs/content/en/open_source/upgrading/2.54.md b/docs/content/en/open_source/upgrading/2.54.md index 3286e7ee149..06741d7d9f2 100644 --- a/docs/content/en/open_source/upgrading/2.54.md +++ b/docs/content/en/open_source/upgrading/2.54.md @@ -5,7 +5,20 @@ weight: -20250804 description: Dropped support for DD_PARSER_EXCLUDE --- +## Breaking change: `DD_PARSER_EXCLUDE` removed + To simplify the management of the DefectDojo application, parser exclusions are no longer controlled via the environment variable DD_PARSER_EXCLUDE or 
application settings. This variable is now unsupported. From now on, you should use the active flag in the Test_Type model to enable or disable parsers. Only parsers associated with active Test_Type entries will be available for use. -There are other instructions for upgrading to 2.54.x. Check the Release Notes for the contents of the release. +## Import/reimport performance improvements + +DefectDojo 2.54.x includes performance improvements for importing and reimporting scan results, especially for large scans: + +- **Faster imports and reimports** due to fewer database queries and more bulk operations. +- **Reduced database load** during reimport matching and post-processing (helps avoid slowdowns/timeouts under heavy scan volume). +- **More efficient endpoint status updates** during reimport of dynamic findings. +- **Less churn when updating vulnerability IDs**, avoiding unnecessary deletes/writes when nothing changed. + +No action is required after upgrading. (Optional tuning knobs exist via `DD_IMPORT_REIMPORT_MATCH_BATCH_SIZE` and `DD_IMPORT_REIMPORT_DEDUPE_BATCH_SIZE`.) + +There are other instructions for upgrading to 2.54.x. Check the Release Notes for the contents of the release: `https://github.com/DefectDojo/django-DefectDojo/releases/tag/2.54.0` From bb40290764b80bd4166532c4302484471ad65523 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 14 Dec 2025 11:41:50 +0100 Subject: [PATCH 17/26] add upgrade notes --- docs/content/en/open_source/upgrading/2.54.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/content/en/open_source/upgrading/2.54.md b/docs/content/en/open_source/upgrading/2.54.md index 06741d7d9f2..82f52ede847 100644 --- a/docs/content/en/open_source/upgrading/2.54.md +++ b/docs/content/en/open_source/upgrading/2.54.md @@ -2,7 +2,7 @@ title: 'Upgrading to DefectDojo Version 2.54.x' toc_hide: true weight: -20250804 -description: Dropped support for DD_PARSER_EXCLUDE +description: Dropped support for DD_PARSER_EXCLUDE & Reimport performance improvements --- ## Breaking change: `DD_PARSER_EXCLUDE` removed From 5ea1cb14ab7ba6fe0bcd4a144f12886e796441de Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 14 Dec 2025 16:50:26 +0100 Subject: [PATCH 18/26] remove obsolete method --- dojo/finding/deduplication.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/dojo/finding/deduplication.py b/dojo/finding/deduplication.py index 10c5bf9e10f..7c4b766e8f2 100644 --- a/dojo/finding/deduplication.py +++ b/dojo/finding/deduplication.py @@ -269,15 +269,6 @@ def build_candidate_scope_queryset(test, mode="deduplication", service=None): ) -# TODO: remove this if that doesn't affect Pro -def build_dedupe_scope_queryset(test): - """ - Legacy function name for backward compatibility. - Use build_candidate_scope_queryset() instead. - """ - return build_candidate_scope_queryset(test, mode="deduplication") - - def find_candidates_for_deduplication_hash(test, findings, mode="deduplication", service=None): """ Find candidates by hash_code. Works for both deduplication and reimport. 
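The hunk above drops the `build_dedupe_scope_queryset()` wrapper in favor of `build_candidate_scope_queryset()`. For downstream code that still imports the old name (the TODO above mentions Pro), a minimal compatibility sketch, assuming only the `build_candidate_scope_queryset(test, mode=...)` signature introduced earlier in this series; the shim itself is hypothetical and not part of these patches:

    # Hypothetical shim for out-of-tree callers of the removed helper; not part of these patches.
    # It simply forwards to the replacement helper introduced earlier in this series.
    from dojo.finding.deduplication import build_candidate_scope_queryset


    def build_dedupe_scope_queryset(test):
        # The removed wrapper returned the deduplication-mode candidate scope queryset.
        return build_candidate_scope_queryset(test, mode="deduplication")

Callers that need reimport-scoped candidates instead pass mode="reimport", which restricts matching to the current test as described in the docstrings above.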
From 58d2abd48007645057d156701131a9dec3669751 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 14 Dec 2025 17:23:48 +0100 Subject: [PATCH 19/26] remove obsolete method --- dojo/finding/deduplication.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dojo/finding/deduplication.py b/dojo/finding/deduplication.py index 7c4b766e8f2..dd075be1c08 100644 --- a/dojo/finding/deduplication.py +++ b/dojo/finding/deduplication.py @@ -376,7 +376,7 @@ def find_candidates_for_deduplication_uid_or_hash(test, findings, mode="deduplic def find_candidates_for_deduplication_legacy(test, findings): - base_queryset = build_dedupe_scope_queryset(test) + base_queryset = build_candidate_scope_queryset(test, mode="deduplication") titles = {f.title for f in findings if getattr(f, "title", None)} cwes = {f.cwe for f in findings if getattr(f, "cwe", 0)} cwes.discard(0) From daac9c86c8b08a70cd3581f67d9846ab5eefae38 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 14 Dec 2025 18:32:08 +0100 Subject: [PATCH 20/26] reimport: extract method to get candidates --- dojo/importers/default_reimporter.py | 63 ++++++++++++++++++++-------- 1 file changed, 45 insertions(+), 18 deletions(-) diff --git a/dojo/importers/default_reimporter.py b/dojo/importers/default_reimporter.py index 80aad31bab0..97b7959d56d 100644 --- a/dojo/importers/default_reimporter.py +++ b/dojo/importers/default_reimporter.py @@ -159,6 +159,48 @@ def process_scan( test_import_history, ) + def get_reimport_match_candidates_for_batch( + self, + batch_findings: list[Finding], + ) -> tuple[dict, dict, dict]: + """ + Fetch candidate matches for a batch of *unsaved* findings during reimport. + + This is intentionally a separate method so downstream editions (e.g. Dojo Pro) + can override candidate retrieval without copying the full `process_findings()` + implementation. 
+ + Returns: + (candidates_by_hash, candidates_by_uid, candidates_by_key) + + """ + candidates_by_hash: dict = {} + candidates_by_uid: dict = {} + candidates_by_key: dict = {} + + if self.deduplication_algorithm == "hash_code": + candidates_by_hash = find_candidates_for_deduplication_hash( + self.test, + batch_findings, + mode="reimport", + ) + elif self.deduplication_algorithm == "unique_id_from_tool": + candidates_by_uid = find_candidates_for_deduplication_unique_id( + self.test, + batch_findings, + mode="reimport", + ) + elif self.deduplication_algorithm == "unique_id_from_tool_or_hash_code": + candidates_by_uid, candidates_by_hash = find_candidates_for_deduplication_uid_or_hash( + self.test, + batch_findings, + mode="reimport", + ) + elif self.deduplication_algorithm == "legacy": + candidates_by_key = find_candidates_for_reimport_legacy(self.test, batch_findings) + + return candidates_by_hash, candidates_by_uid, candidates_by_key + def process_findings( self, parsed_findings: list[Finding], @@ -242,24 +284,9 @@ def process_findings( deduplicationLogger.debug(f"unsaved finding's hash_code: {unsaved_finding.hash_code}") # Fetch all candidates for this batch at once (batch candidate finding) - candidates_by_hash = {} - candidates_by_uid = {} - candidates_by_key = {} - - if self.deduplication_algorithm == "hash_code": - candidates_by_hash = find_candidates_for_deduplication_hash( - self.test, batch_findings, mode="reimport", - ) - elif self.deduplication_algorithm == "unique_id_from_tool": - candidates_by_uid = find_candidates_for_deduplication_unique_id( - self.test, batch_findings, mode="reimport", - ) - elif self.deduplication_algorithm == "unique_id_from_tool_or_hash_code": - candidates_by_uid, candidates_by_hash = find_candidates_for_deduplication_uid_or_hash( - self.test, batch_findings, mode="reimport", - ) - elif self.deduplication_algorithm == "legacy": - candidates_by_key = find_candidates_for_reimport_legacy(self.test, batch_findings) + candidates_by_hash, candidates_by_uid, candidates_by_key = self.get_reimport_match_candidates_for_batch( + batch_findings, + ) # Process each finding in the batch using pre-fetched candidates for idx, unsaved_finding in enumerate(batch_findings): From edc34bb46cd96296e293dbd2f9239b9e83db63dc Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 14 Dec 2025 23:18:24 +0100 Subject: [PATCH 21/26] reimport: remove fallback to non-batch --- dojo/importers/default_reimporter.py | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/dojo/importers/default_reimporter.py b/dojo/importers/default_reimporter.py index 97b7959d56d..1abc9dadf77 100644 --- a/dojo/importers/default_reimporter.py +++ b/dojo/importers/default_reimporter.py @@ -550,27 +550,23 @@ def match_finding_for_reimport( deduplicationLogger.debug("matching finding for reimport using algorithm: %s", self.deduplication_algorithm) if self.deduplication_algorithm == "hash_code": - if candidates_by_hash is None: - # Fallback to individual query if candidates not provided - return self.match_new_finding_to_existing_finding(unsaved_finding) - if unsaved_finding.hash_code is None: + if candidates_by_hash is None or unsaved_finding.hash_code is None: return [] matches = candidates_by_hash.get(unsaved_finding.hash_code, []) return sorted(matches, key=lambda f: f.id) if self.deduplication_algorithm == "unique_id_from_tool": - if candidates_by_uid is None: - # Fallback to individual query if candidates not provided - return 
self.match_new_finding_to_existing_finding(unsaved_finding) - if unsaved_finding.unique_id_from_tool is None: + if candidates_by_uid is None or unsaved_finding.unique_id_from_tool is None: return [] matches = candidates_by_uid.get(unsaved_finding.unique_id_from_tool, []) return sorted(matches, key=lambda f: f.id) if self.deduplication_algorithm == "unique_id_from_tool_or_hash_code": - if candidates_by_hash is None or candidates_by_uid is None: - # Fallback to individual query if candidates not provided - return self.match_new_finding_to_existing_finding(unsaved_finding) + if candidates_by_hash is None and candidates_by_uid is None: + return [] + + if unsaved_finding.hash_code is None and unsaved_finding.unique_id_from_tool is None: + return [] # Collect matches from both hash_code and unique_id_from_tool matches_by_id = {} @@ -589,10 +585,7 @@ def match_finding_for_reimport( return sorted(matches, key=lambda f: f.id) if self.deduplication_algorithm == "legacy": - if candidates_by_key is None: - # Fallback to individual query if candidates not provided - return self.match_new_finding_to_existing_finding(unsaved_finding) - if not unsaved_finding.title: + if candidates_by_key is None or not unsaved_finding.title: return [] key = (unsaved_finding.title.lower(), unsaved_finding.severity) matches = candidates_by_key.get(key, []) From 3c15ebc29dd4d671c5287fd025550cb5001c2243 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 14 Dec 2025 23:18:42 +0100 Subject: [PATCH 22/26] reimport: prep for Pro overrides --- dojo/importers/base_importer.py | 1 + dojo/importers/default_reimporter.py | 117 +++++++++++++-------------- 2 files changed, 56 insertions(+), 62 deletions(-) diff --git a/dojo/importers/base_importer.py b/dojo/importers/base_importer.py index b295028ef27..5007004c635 100644 --- a/dojo/importers/base_importer.py +++ b/dojo/importers/base_importer.py @@ -271,6 +271,7 @@ def determine_process_method( def determine_deduplication_algorithm(self) -> str: """ Determines what dedupe algorithm to use for the Test being processed. + Overridden in Pro. :return: A string representing the dedupe algorithm to use. """ return self.test.deduplication_algorithm diff --git a/dojo/importers/default_reimporter.py b/dojo/importers/default_reimporter.py index 1abc9dadf77..40d1120d12c 100644 --- a/dojo/importers/default_reimporter.py +++ b/dojo/importers/default_reimporter.py @@ -170,6 +170,8 @@ def get_reimport_match_candidates_for_batch( can override candidate retrieval without copying the full `process_findings()` implementation. + Is overridden in Pro. + Returns: (candidates_by_hash, candidates_by_uid, candidates_by_key) @@ -201,6 +203,51 @@ def get_reimport_match_candidates_for_batch( return candidates_by_hash, candidates_by_uid, candidates_by_key + def add_new_finding_to_candidates( + self, + finding: Finding, + candidates_by_hash: dict, + candidates_by_uid: dict, + candidates_by_key: dict, + ) -> None: + """ + Add a newly created finding to candidate dictionaries for subsequent findings in the same batch. + + This allows duplicates within the same scan report to be detected even when they're processed + in the same batch. When a new finding is created (no match found), it is added to the candidate + dictionaries so that subsequent findings in the same batch can match against it. + + This is intentionally a separate method so downstream editions (e.g. Dojo Pro) + can override candidate addition logic without copying the full `process_findings()` + implementation. 
+ + Args: + finding: The newly created finding to add to candidates + candidates_by_hash: Dictionary mapping hash_code to list of findings (modified in-place) + candidates_by_uid: Dictionary mapping unique_id_from_tool to list of findings (modified in-place) + candidates_by_key: Dictionary mapping (title_lower, severity) to list of findings (modified in-place) + + """ + if not finding: + return + + if finding.hash_code: + candidates_by_hash.setdefault(finding.hash_code, []).append(finding) + deduplicationLogger.debug( + f"Added finding {finding.id} (hash_code: {finding.hash_code}) to candidates for next findings in this report", + ) + if finding.unique_id_from_tool: + candidates_by_uid.setdefault(finding.unique_id_from_tool, []).append(finding) + deduplicationLogger.debug( + f"Added finding {finding.id} (unique_id_from_tool: {finding.unique_id_from_tool}) to candidates for next findings in this report", + ) + if finding.title: + legacy_key = (finding.title.lower(), finding.severity) + candidates_by_key.setdefault(legacy_key, []).append(finding) + deduplicationLogger.debug( + f"Added finding {finding.id} (title: {finding.title}, severity: {finding.severity}) to candidates for next findings in this report", + ) + def process_findings( self, parsed_findings: list[Finding], @@ -293,7 +340,7 @@ def process_findings( is_final = is_final_batch and idx == len(batch_findings) - 1 # Match any findings to this new one coming in using pre-fetched candidates - matched_findings = self.match_finding_for_reimport( + matched_findings = self.match_finding_to_candidate_reimport( unsaved_finding, candidates_by_hash=candidates_by_hash, candidates_by_uid=candidates_by_uid, @@ -325,23 +372,12 @@ def process_findings( finding = self.process_finding_that_was_not_matched(unsaved_finding) # Add newly created finding to candidates for subsequent findings in this batch - if finding: - if finding.hash_code: - candidates_by_hash.setdefault(finding.hash_code, []).append(finding) - deduplicationLogger.debug( - f"Added finding {finding.id} (hash_code: {finding.hash_code}) to candidates for next findings in this report", - ) - if finding.unique_id_from_tool: - candidates_by_uid.setdefault(finding.unique_id_from_tool, []).append(finding) - deduplicationLogger.debug( - f"Added finding {finding.id} (unique_id_from_tool: {finding.unique_id_from_tool}) to candidates for next findings in this report", - ) - if finding.title: - legacy_key = (finding.title.lower(), finding.severity) - candidates_by_key.setdefault(legacy_key, []).append(finding) - deduplicationLogger.debug( - f"Added finding {finding.id} (title: {finding.title}, severity: {finding.severity}) to candidates for next findings in this report", - ) + self.add_new_finding_to_candidates( + finding, + candidates_by_hash, + candidates_by_uid, + candidates_by_key, + ) # This condition __appears__ to always be true, but am afraid to remove it if finding: @@ -483,50 +519,7 @@ def parse_findings_dynamic_test_type( logger.debug("REIMPORT_SCAN parser v2: Create parse findings") return super().parse_findings_dynamic_test_type(scan, parser) - def match_new_finding_to_existing_finding( - self, - unsaved_finding: Finding, - ) -> list[Finding]: - """Matches a single new finding to N existing findings and then returns those matches""" - # This code should match the logic used for deduplication out of the re-import feature. 
- # See utils.py deduplicate_* functions - deduplicationLogger.debug("return findings bases on algorithm: %s", self.deduplication_algorithm) - if self.deduplication_algorithm == "hash_code": - return Finding.objects.filter( - test=self.test, - hash_code=unsaved_finding.hash_code, - ).exclude(hash_code=None).order_by("id") - if self.deduplication_algorithm == "unique_id_from_tool": - deduplicationLogger.debug(f"unique_id_from_tool: {unsaved_finding.unique_id_from_tool}") - return Finding.objects.filter( - test=self.test, - unique_id_from_tool=unsaved_finding.unique_id_from_tool, - ).exclude(unique_id_from_tool=None).order_by("id") - if self.deduplication_algorithm == "unique_id_from_tool_or_hash_code": - deduplicationLogger.debug(f"unique_id_from_tool: {unsaved_finding.unique_id_from_tool}") - deduplicationLogger.debug(f"hash_code: {unsaved_finding.hash_code}") - query = Finding.objects.filter( - Q(test=self.test), - (Q(hash_code__isnull=False) & Q(hash_code=unsaved_finding.hash_code)) - | (Q(unique_id_from_tool__isnull=False) & Q(unique_id_from_tool=unsaved_finding.unique_id_from_tool)), - ).order_by("id") - deduplicationLogger.debug(query.query) - return query - if self.deduplication_algorithm == "legacy": - # This is the legacy reimport behavior. Although it's pretty flawed and doesn't match the legacy algorithm for deduplication, - # this is left as is for simplicity. - # Re-writing the legacy deduplication here would be complicated and counter-productive. - # If you have use cases going through this section, you're advised to create a deduplication configuration for your parser - logger.warning("Legacy reimport. In case of issue, you're advised to create a deduplication configuration in order not to go through this section") - return Finding.objects.filter( - title__iexact=unsaved_finding.title, - test=self.test, - severity=unsaved_finding.severity, - numerical_severity=Finding.get_numerical_severity(unsaved_finding.severity)).order_by("id") - logger.error(f'Internal error: unexpected deduplication_algorithm: "{self.deduplication_algorithm}"') - return None - - def match_finding_for_reimport( + def match_finding_to_candidate_reimport( self, unsaved_finding: Finding, candidates_by_hash: dict | None = None, From 0d99393b836d941edc491fbd75a94d22d53abbf9 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 14 Dec 2025 23:22:53 +0100 Subject: [PATCH 23/26] add comment --- dojo/importers/default_reimporter.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dojo/importers/default_reimporter.py b/dojo/importers/default_reimporter.py index 40d1120d12c..9546156f578 100644 --- a/dojo/importers/default_reimporter.py +++ b/dojo/importers/default_reimporter.py @@ -221,6 +221,8 @@ def add_new_finding_to_candidates( can override candidate addition logic without copying the full `process_findings()` implementation. 
+ Is overriden in Pro + Args: finding: The newly created finding to add to candidates candidates_by_hash: Dictionary mapping hash_code to list of findings (modified in-place) From 197fe43bc5d7ab7e4f9e4da05da2a715aa58f08c Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Sun, 14 Dec 2025 23:23:22 +0100 Subject: [PATCH 24/26] add comment --- dojo/importers/default_reimporter.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/dojo/importers/default_reimporter.py b/dojo/importers/default_reimporter.py index 9546156f578..14a7da7e21d 100644 --- a/dojo/importers/default_reimporter.py +++ b/dojo/importers/default_reimporter.py @@ -217,10 +217,6 @@ def add_new_finding_to_candidates( in the same batch. When a new finding is created (no match found), it is added to the candidate dictionaries so that subsequent findings in the same batch can match against it. - This is intentionally a separate method so downstream editions (e.g. Dojo Pro) - can override candidate addition logic without copying the full `process_findings()` - implementation. - Is overriden in Pro Args: From e8f60991cc0c57414ce304286262e569d81c0703 Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Wed, 17 Dec 2025 14:46:14 +0100 Subject: [PATCH 25/26] update counts and script --- scripts/update_performance_test_counts.py | 130 +++++++++++++++++----- unittests/test_importers_performance.py | 16 +-- 2 files changed, 110 insertions(+), 36 deletions(-) diff --git a/scripts/update_performance_test_counts.py b/scripts/update_performance_test_counts.py index f63e040fb3a..867c01bb280 100644 --- a/scripts/update_performance_test_counts.py +++ b/scripts/update_performance_test_counts.py @@ -11,19 +11,20 @@ How to run: - # Step 1: Run tests and generate report (uses TestDojoImporterPerformanceSmall by default) - python3 scripts/update_performance_test_counts.py --report-only + # Default: Update the test file (uses TestDojoImporterPerformanceSmall by default) + python3 scripts/update_performance_test_counts.py # Or specify a different test class: - python3 scripts/update_performance_test_counts.py --test-class TestDojoImporterPerformanceSmall --report-only + python3 scripts/update_performance_test_counts.py --test-class TestDojoImporterPerformanceSmall - # Step 2: Review the report, then update the file - python3 scripts/update_performance_test_counts.py --update + # Step 1: Run tests and generate report only (without updating) + python3 scripts/update_performance_test_counts.py --report-only - # Step 3: Verify all tests pass + # Step 2: Verify all tests pass python3 scripts/update_performance_test_counts.py --verify The script defaults to TestDojoImporterPerformanceSmall if --test-class is not provided. +The script defaults to --update behavior if no action flag is provided. 
""" import argparse @@ -55,18 +56,80 @@ def __repr__(self): ) -def run_tests(test_class: str) -> str: - """Run the performance tests and return the output.""" +def run_tests(test_class: str) -> tuple[str, int]: + """Run the performance tests and return the output and return code.""" print(f"Running tests for {test_class}...") cmd = [ "./run-unittest.sh", "--test-case", f"unittests.test_importers_performance.{test_class}", ] - result = subprocess.run( - cmd, check=False, capture_output=True, text=True, cwd=Path(__file__).parent.parent, + + # Run with real-time output streaming + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + bufsize=1, + cwd=Path(__file__).parent.parent, ) - return result.stdout + result.stderr + + output_lines = [] + for line in process.stdout: + print(line, end="") # Print in real-time + output_lines.append(line) + + process.wait() + output = "".join(output_lines) + + return output, process.returncode + + +def check_test_execution_success(output: str, return_code: int) -> tuple[bool, str]: + """Check if tests executed successfully or failed due to other reasons.""" + # Check for migration errors + migration_error_patterns = [ + r"django\.db\.migrations\.exceptions\.", + r"Migration.*failed", + r"django\.core\.management\.base\.CommandError", + r"OperationalError", + r"ProgrammingError", + r"relation.*does not exist", + r"no such table", + ] + + for pattern in migration_error_patterns: + if re.search(pattern, output, re.IGNORECASE): + return False, f"Migration or database error detected: {pattern}" + + # Check if any tests actually ran + test_run_patterns = [ + r"Ran \d+ test", + r"OK", + r"FAILED", + r"FAIL:", + r"test_\w+", + ] + + tests_ran = any(re.search(pattern, output) for pattern in test_run_patterns) + + if not tests_ran and return_code != 0: + return False, "Tests did not run successfully. Check the output above for errors." 
+ + # Check for other critical errors + critical_error_patterns = [ + r"ImportError", + r"ModuleNotFoundError", + r"SyntaxError", + r"IndentationError", + ] + + for pattern in critical_error_patterns: + if re.search(pattern, output): + return False, f"Critical error detected: {pattern}" + + return True, "" def parse_test_output(output: str) -> list[TestCount]: @@ -249,17 +312,23 @@ def replace(match): def verify_tests(test_class: str) -> bool: """Run tests to verify they all pass.""" print(f"Verifying tests for {test_class}...") - output = run_tests(test_class) + output, return_code = run_tests(test_class) + + success, error_msg = check_test_execution_success(output, return_code) + if not success: + print(f"\n❌ Test execution failed: {error_msg}") + return False + counts = parse_test_output(output) if counts: - print("❌ Some tests still have count mismatches:") + print("\n❌ Some tests still have count mismatches:") for count in counts: print(f" {count.test_name} - {count.step} {count.metric}: " f"expected {count.expected}, got {count.actual}") return False else: # noqa: RET505 - print("✅ All tests pass!") + print("\n✅ All tests pass!") return True @@ -283,7 +352,7 @@ def main(): parser.add_argument( "--update", action="store_true", - help="Update the test file with new counts", + help="Update the test file with new counts (default behavior if no action flag is provided)", ) parser.add_argument( "--verify", @@ -295,29 +364,34 @@ def main(): if args.report_only: # Step 1: Run tests and generate report - output = run_tests(args.test_class) + output, return_code = run_tests(args.test_class) + success, error_msg = check_test_execution_success(output, return_code) + if not success: + print(f"\n❌ Test execution failed: {error_msg}") + sys.exit(1) counts = parse_test_output(output) expected_counts = extract_expected_counts_from_file(args.test_class) generate_report(counts, expected_counts) - elif args.update: - # Step 2: Update the file - output = run_tests(args.test_class) - counts = parse_test_output(output) - if counts: - update_test_file(counts) - print("\nNext step: Run --verify to ensure all tests pass") - else: - print("No differences found. All tests are already up to date.") - elif args.verify: # Step 3: Verify success = verify_tests(args.test_class) sys.exit(0 if success else 1) else: - parser.print_help() - sys.exit(1) + # Default: Update the file (--update is the default behavior) + output, return_code = run_tests(args.test_class) + success, error_msg = check_test_execution_success(output, return_code) + if not success: + print(f"\n❌ Test execution failed: {error_msg}") + print("Cannot update counts because tests did not run successfully.") + sys.exit(1) + counts = parse_test_output(output) + if counts: + update_test_file(counts) + print("\nNext step: Run --verify to ensure all tests pass") + else: + print("\nNo differences found. 
All tests are already up to date.") if __name__ == "__main__": diff --git a/unittests/test_importers_performance.py b/unittests/test_importers_performance.py index 81cc0935594..b2dc1c07192 100644 --- a/unittests/test_importers_performance.py +++ b/unittests/test_importers_performance.py @@ -267,9 +267,9 @@ def test_import_reimport_reimport_performance_pghistory_async(self): self._import_reimport_performance( expected_num_queries1=306, expected_num_async_tasks1=7, - expected_num_queries2=231, + expected_num_queries2=232, expected_num_async_tasks2=18, - expected_num_queries3=113, + expected_num_queries3=114, expected_num_async_tasks3=17, ) @@ -287,11 +287,11 @@ def test_import_reimport_reimport_performance_pghistory_no_async(self): testuser.usercontactinfo.save() self._import_reimport_performance( - expected_num_queries1=312, + expected_num_queries1=313, expected_num_async_tasks1=6, - expected_num_queries2=287, + expected_num_queries2=239, expected_num_async_tasks2=17, - expected_num_queries3=176, + expected_num_queries3=121, expected_num_async_tasks3=16, ) @@ -310,11 +310,11 @@ def test_import_reimport_reimport_performance_pghistory_no_async_with_product_gr self.system_settings(enable_product_grade=True) self._import_reimport_performance( - expected_num_queries1=314, + expected_num_queries1=315, expected_num_async_tasks1=8, - expected_num_queries2=289, + expected_num_queries2=241, expected_num_async_tasks2=19, - expected_num_queries3=178, + expected_num_queries3=123, expected_num_async_tasks3=18, ) From 71d2e9192528704675237ca2ef172126e8386dce Mon Sep 17 00:00:00 2001 From: Valentijn Scholten Date: Wed, 17 Dec 2025 15:13:56 +0100 Subject: [PATCH 26/26] update counts and script --- scripts/update_performance_test_counts.py | 421 ++++++++++++++++++---- unittests/test_importers_performance.py | 5 +- 2 files changed, 348 insertions(+), 78 deletions(-) diff --git a/scripts/update_performance_test_counts.py b/scripts/update_performance_test_counts.py index 867c01bb280..f7cfaae2859 100644 --- a/scripts/update_performance_test_counts.py +++ b/scripts/update_performance_test_counts.py @@ -56,8 +56,62 @@ def __repr__(self): ) +def extract_test_methods(test_class: str) -> list[str]: + """Extract all test method names from the test class.""" + if not TEST_FILE.exists(): + msg = f"Test file not found: {TEST_FILE}" + raise FileNotFoundError(msg) + + content = TEST_FILE.read_text() + + # Find the test class definition + class_pattern = re.compile( + rf"class {re.escape(test_class)}.*?(?=class |\Z)", + re.DOTALL, + ) + class_match = class_pattern.search(content) + if not class_match: + return [] + + class_content = class_match.group(0) + + # Find all test methods in this class + test_method_pattern = re.compile(r"def (test_\w+)\(") + return test_method_pattern.findall(class_content) + + +def run_test_method(test_class: str, test_method: str) -> tuple[str, int]: + """Run a specific test method and return the output and return code.""" + print(f"Running {test_class}.{test_method}...") + cmd = [ + "./run-unittest.sh", + "--test-case", + f"unittests.test_importers_performance.{test_class}.{test_method}", + ] + + # Run with real-time output streaming + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + bufsize=1, + cwd=Path(__file__).parent.parent, + ) + + output_lines = [] + for line in process.stdout: + print(line, end="") # Print in real-time + output_lines.append(line) + + process.wait() + output = "".join(output_lines) + + return output, 
process.returncode + + def run_tests(test_class: str) -> tuple[str, int]: - """Run the performance tests and return the output and return code.""" + """Run all tests in a test class and return the output and return code.""" print(f"Running tests for {test_class}...") cmd = [ "./run-unittest.sh", @@ -136,47 +190,103 @@ def parse_test_output(output: str) -> list[TestCount]: """Parse test output to extract actual vs expected counts.""" counts = [] - # Pattern to match AssertionError with query counts - # Format: AssertionError: 118 != 120 : 118 queries executed, 120 expected - assertion_pattern = re.compile( - r"AssertionError: (\d+) != (\d+) : (\d+) queries executed, (\d+) expected", + # Debug: Save a sample of the output to help diagnose parsing issues + if "FAIL:" in output: + # Extract failure sections for debugging + fail_sections = [] + lines = output.split("\n") + in_fail_section = False + fail_section = [] + for line in lines: + if "FAIL:" in line: + if fail_section: + fail_sections.append("\n".join(fail_section)) + fail_section = [line] + in_fail_section = True + elif in_fail_section: + fail_section.append(line) + # Stop collecting after AssertionError line or after 5 more lines + if "AssertionError:" in line or len(fail_section) > 6: + fail_sections.append("\n".join(fail_section)) + fail_section = [] + in_fail_section = False + if fail_section: + fail_sections.append("\n".join(fail_section)) + + if fail_sections: + print(f"\n🔍 Found {len(fail_sections)} failure section(s) in output") + + # The test output format is: + # FAIL: test_name (step='import1', metric='queries') + # AssertionError: 118 != 120 : 118 queries executed, 120 expected + # OR for async tasks: + # FAIL: test_name (step='import1', metric='async_tasks') + # AssertionError: 7 != 8 : 7 async tasks executed, 8 expected + + # Pattern to match the full failure block: + # FAIL: test_name (full.path.to.test) (step='...', metric='...') + # AssertionError: actual != expected : actual ... 
executed, expected expected + # The test name may include the full path in parentheses, so we extract just the method name + failure_pattern = re.compile( + r"FAIL:\s+(test_\w+)\s+\([^)]+\)\s+\(step=['\"](\w+)['\"],\s*metric=['\"](\w+)['\"]\)\s*\n" + r".*?AssertionError:\s+(\d+)\s+!=\s+(\d+)\s+:\s+\d+\s+(?:queries|async tasks?)\s+executed,\s+\d+\s+expected", + re.MULTILINE | re.DOTALL, ) - # Pattern to match test name and step - # Format: (step='reimport2', metric='queries') - step_pattern = re.compile(r"step='(\w+)', metric='(\w+)'") - - # Pattern to match test name - # Format: FAIL: test_import_reimport_reimport_performance_async - test_pattern = re.compile(r"FAIL: (test_\w+)") - - lines = output.split("\n") - current_test = None - current_step = None - current_metric = None - - for line in lines: - # Match test name - test_match = test_pattern.search(line) - if test_match: - current_test = test_match.group(1) - - # Match step and metric - step_match = step_pattern.search(line) - if step_match: - current_step = step_match.group(1) - current_metric = step_match.group(2) - - # Match assertion error - assertion_match = assertion_pattern.search(line) - if assertion_match and current_test and current_step and current_metric: - actual = int(assertion_match.group(1)) - expected = int(assertion_match.group(2)) - count = TestCount(current_test, current_step, current_metric) - count.actual = actual - count.expected = expected - count.difference = expected - actual - counts.append(count) + for match in failure_pattern.finditer(output): + test_name = match.group(1) + step = match.group(2) + metric = match.group(3) + actual = int(match.group(4)) + expected = int(match.group(5)) + + count = TestCount(test_name, step, metric) + count.actual = actual + count.expected = expected + count.difference = expected - actual + counts.append(count) + + # Also try a simpler pattern in case the format is slightly different + if not counts: + # Look for lines with step/metric followed by AssertionError on nearby lines + lines = output.split("\n") + i = 0 + while i < len(lines): + line = lines[i] + + # Look for FAIL: test_name (may include full path in parentheses) + # Format: FAIL: test_name (full.path) (step='...', metric='...') + fail_match = re.search(r"FAIL:\s+(test_\w+)\s+\([^)]+\)\s+\(step=['\"](\w+)['\"],\s*metric=['\"](\w+)['\"]\)", line) + if fail_match: + test_name = fail_match.group(1) + step = fail_match.group(2) + metric = fail_match.group(3) + # Look ahead for AssertionError + for j in range(i, min(i + 15, len(lines))): + assertion_match = re.search( + r"AssertionError:\s+(\d+)\s+!=\s+(\d+)\s+:\s+\d+\s+(?:queries|async tasks?)\s+executed,\s+\d+\s+expected", + lines[j], + ) + + if assertion_match: + actual = int(assertion_match.group(1)) + expected = int(assertion_match.group(2)) + + count = TestCount(test_name, step, metric) + count.actual = actual + count.expected = expected + count.difference = expected - actual + counts.append(count) + break + i += 1 + + if counts: + print(f"\n📊 Parsed {len(counts)} count mismatch(es) from test output:") + for count in counts: + print(f" {count.test_name} - {count.step} {count.metric}: {count.actual} != {count.expected}") + elif "FAIL:" in output: + print("\n⚠️ WARNING: Found FAIL in output but couldn't parse any count mismatches!") + print("This might indicate a parsing issue. 
Check the output above.") return counts @@ -276,32 +386,137 @@ def update_test_file(counts: list[TestCount]): step_metric = f"{count.step}_{count.metric}" updates[count.test_name][step_metric] = count.actual + # Map step_metric to parameter name for different methods + param_map_import_reimport = { + "import1_queries": "expected_num_queries1", + "import1_async_tasks": "expected_num_async_tasks1", + "reimport1_queries": "expected_num_queries2", + "reimport1_async_tasks": "expected_num_async_tasks2", + "reimport2_queries": "expected_num_queries3", + "reimport2_async_tasks": "expected_num_async_tasks3", + } + param_map_deduplication = { + "first_import_queries": "expected_num_queries1", + "first_import_async_tasks": "expected_num_async_tasks1", + "second_import_queries": "expected_num_queries2", + "second_import_async_tasks": "expected_num_async_tasks2", + } + # Update each test method for test_name, test_updates in updates.items(): - # Find the test method and update expected_num_queries/async_tasks values - # Map step_metric to parameter name - param_map = { - "import1_queries": "expected_num_queries1", - "import1_async_tasks": "expected_num_async_tasks1", - "reimport1_queries": "expected_num_queries2", - "reimport1_async_tasks": "expected_num_async_tasks2", - "reimport2_queries": "expected_num_queries3", - "reimport2_async_tasks": "expected_num_async_tasks3", - } + print(f" Updating {test_name}...") + # Find the test method boundaries + test_method_pattern = re.compile( + rf"(def {re.escape(test_name)}\([^)]*\):.*?)(?=def test_|\Z)", + re.DOTALL, + ) + test_match = test_method_pattern.search(content) + if not test_match: + print(f"⚠️ Warning: Could not find test method {test_name}") + continue + + test_method_content = test_match.group(1) + test_method_start = test_match.start() + test_method_end = test_match.end() + + # Try to find _import_reimport_performance call first + perf_call_pattern_import_reimport = re.compile( + r"(self\._import_reimport_performance\s*\(\s*)" + r"expected_num_queries1\s*=\s*(\d+)\s*,\s*" + r"expected_num_async_tasks1\s*=\s*(\d+)\s*,\s*" + r"expected_num_queries2\s*=\s*(\d+)\s*,\s*" + r"expected_num_async_tasks2\s*=\s*(\d+)\s*,\s*" + r"expected_num_queries3\s*=\s*(\d+)\s*,\s*" + r"expected_num_async_tasks3\s*=\s*(\d+)\s*," + r"(\s*\))", + re.DOTALL, + ) - for step_metric, new_value in test_updates.items(): - param_name = param_map.get(step_metric) - if param_name: - # Pattern to match: expected_num_queries3=120, (with flexible whitespace) - pattern = re.compile( - rf"(def {re.escape(test_name)}\([^)]*\):.*?{re.escape(param_name)}\s*=\s*)(\d+)(\s*,)", - re.DOTALL, - ) + # Try to find _deduplication_performance call + perf_call_pattern_deduplication = re.compile( + r"(self\._deduplication_performance\s*\(\s*)" + r"expected_num_queries1\s*=\s*(\d+)\s*,\s*" + r"expected_num_async_tasks1\s*=\s*(\d+)\s*,\s*" + r"expected_num_queries2\s*=\s*(\d+)\s*,\s*" + r"expected_num_async_tasks2\s*=\s*(\d+)\s*," + r"(\s*\))", + re.DOTALL, + ) - def replace(match): - return f"{match.group(1)}{new_value}{match.group(3)}" # noqa: B023 + perf_match = perf_call_pattern_import_reimport.search(test_method_content) + method_type = "import_reimport" + param_map = param_map_import_reimport + param_order = [ + "import1_queries", + "import1_async_tasks", + "reimport1_queries", + "reimport1_async_tasks", + "reimport2_queries", + "reimport2_async_tasks", + ] + + if not perf_match: + perf_match = perf_call_pattern_deduplication.search(test_method_content) + if perf_match: + method_type = 
"deduplication" + param_map = param_map_deduplication + param_order = [ + "first_import_queries", + "first_import_async_tasks", + "second_import_queries", + "second_import_async_tasks", + ] + else: + print(f"⚠️ Warning: Could not find _import_reimport_performance or _deduplication_performance call in {test_name}") + continue + + # Get the indentation from the original call (first line after opening paren) + call_lines = test_method_content[perf_match.start():perf_match.end()].split("\n") + indent = "" + for line in call_lines: + if "expected_num_queries1" in line: + # Extract indentation (spaces before the parameter) + indent_match = re.match(r"(\s*)expected_num_queries1", line) + if indent_match: + indent = indent_match.group(1) + break + + # If we couldn't find indentation, use a default + if not indent: + indent = " " # 12 spaces default + + replacement_parts = [perf_match.group(1)] # Opening: "self._import_reimport_performance(" + updated_params = [] + for i, step_metric in enumerate(param_order): + param_name = param_map[step_metric] + old_value = int(perf_match.group(i + 2)) # +2 because group 1 is the opening + if step_metric in test_updates: + new_value = test_updates[step_metric] + if old_value != new_value: + updated_params.append(f"{param_name}: {old_value} → {new_value}") + else: + # Keep the existing value + new_value = old_value + + replacement_parts.append(f"{indent}{param_name}={new_value},") + + # Closing parenthesis - group number depends on method type + closing_group = 8 if method_type == "import_reimport" else 6 + replacement_parts.append(perf_match.group(closing_group)) # Closing parenthesis + replacement = "\n".join(replacement_parts) + + if updated_params: + print(f" Updated: {', '.join(updated_params)}") + + # Replace the method call within the test method content + updated_method_content = ( + test_method_content[: perf_match.start()] + + replacement + + test_method_content[perf_match.end() :] + ) - content = pattern.sub(replace, content) + # Replace the entire test method in the original content + content = content[:test_method_start] + updated_method_content + content[test_method_end:] # Write back to file TEST_FILE.write_text(content) @@ -364,14 +579,31 @@ def main(): if args.report_only: # Step 1: Run tests and generate report - output, return_code = run_tests(args.test_class) - success, error_msg = check_test_execution_success(output, return_code) - if not success: - print(f"\n❌ Test execution failed: {error_msg}") + # Run each test method individually + test_methods = extract_test_methods(args.test_class) + if not test_methods: + print(f"⚠️ No test methods found in {args.test_class}") sys.exit(1) - counts = parse_test_output(output) + + print(f"\nFound {len(test_methods)} test method(s) in {args.test_class}") + print("=" * 80) + + all_counts = [] + for test_method in test_methods: + print(f"\n{'=' * 80}") + output, return_code = run_test_method(args.test_class, test_method) + success, error_msg = check_test_execution_success(output, return_code) + if not success: + print(f"\n⚠️ Test execution failed for {test_method}: {error_msg}") + print("Skipping this test method...") + continue + + counts = parse_test_output(output) + if counts: + all_counts.extend(counts) + expected_counts = extract_expected_counts_from_file(args.test_class) - generate_report(counts, expected_counts) + generate_report(all_counts, expected_counts) elif args.verify: # Step 3: Verify @@ -380,18 +612,55 @@ def main(): else: # Default: Update the file (--update is the default behavior) - 
output, return_code = run_tests(args.test_class) - success, error_msg = check_test_execution_success(output, return_code) - if not success: - print(f"\n❌ Test execution failed: {error_msg}") - print("Cannot update counts because tests did not run successfully.") + # Run each test method individually + test_methods = extract_test_methods(args.test_class) + if not test_methods: + print(f"⚠️ No test methods found in {args.test_class}") sys.exit(1) - counts = parse_test_output(output) - if counts: - update_test_file(counts) + + print(f"\nFound {len(test_methods)} test method(s) in {args.test_class}") + print("=" * 80) + + all_counts = [] + for test_method in test_methods: + print(f"\n{'=' * 80}") + output, return_code = run_test_method(args.test_class, test_method) + success, error_msg = check_test_execution_success(output, return_code) + if not success: + print(f"\n⚠️ Test execution failed for {test_method}: {error_msg}") + print("Skipping this test method...") + continue + + counts = parse_test_output(output) + + # Check if test actually passed + test_passed = "OK" in output or ("Ran" in output and "FAILED" not in output and return_code == 0) + + if counts: + all_counts.extend(counts) + # Update immediately after each test + update_test_file(counts) + print(f"⚠️ {test_method}: Found {len(counts)} count mismatch(es) - updated file") + elif test_passed: + print(f"✅ {test_method}: Test passed, all counts match") + elif return_code != 0: + # Test might have failed for other reasons + print(f"⚠️ {test_method}: Test failed (exit code {return_code}) but no count mismatches parsed") + print(" This might indicate a parsing issue or a different type of failure") + # Show a snippet of the output to help debug + fail_lines = [line for line in output.split("\n") if "FAIL" in line or "Error" in line or "Exception" in line] + if fail_lines: + print(" Relevant error lines:") + for line in fail_lines[:5]: + print(f" {line}") + + if all_counts: + print(f"\n{'=' * 80}") + print(f"✅ Updated {len(all_counts)} count(s) across {len({c.test_name for c in all_counts})} test(s)") print("\nNext step: Run --verify to ensure all tests pass") else: - print("\nNo differences found. All tests are already up to date.") + print(f"\n{'=' * 80}") + print("\n✅ No differences found. All tests are already up to date.") if __name__ == "__main__": diff --git a/unittests/test_importers_performance.py b/unittests/test_importers_performance.py index b2dc1c07192..1e7b05d8fe5 100644 --- a/unittests/test_importers_performance.py +++ b/unittests/test_importers_performance.py @@ -454,8 +454,9 @@ def test_deduplication_performance_pghistory_no_async(self): testuser.usercontactinfo.save() self._deduplication_performance( - expected_num_queries1=281, + expected_num_queries1=282, expected_num_async_tasks1=7, - expected_num_queries2=245, + expected_num_queries2=246, expected_num_async_tasks2=7, + )