Skip to content

Commit 57690b9

Browse files
Import history optimize (#13182)
* import history optimization * debugtoolbar: fix for requirements-dev.txt * import history optimization * fix fallback * add bulk existence check * add unit test file * update query counts * try without keepdb * clean database between test runs * clean database between test runs * remove keepdb * fix dojo_testdata fixture * restore entrypoint * revert testdata changes, run transactionaltest at the end * fix entrypoint * fix excludes * add comments * update counts
1 parent 2fc71eb commit 57690b9

6 files changed

Lines changed: 240 additions & 42 deletions

File tree

docker/entrypoint-unit-tests.sh

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,9 +80,16 @@ echo "Unit Tests"
8080
echo "------------------------------------------------------------"
8181

8282
# Removing parallel and shuffle for now to maintain stability
83-
python3 manage.py test unittests -v 3 --keepdb --no-input --exclude-tag="non-parallel" || {
83+
python3 manage.py test unittests -v 3 --keepdb --no-input --exclude-tag="non-parallel" --exclude-tag="transactional" || {
8484
exit 1;
8585
}
8686
python3 manage.py test unittests -v 3 --keepdb --no-input --tag="non-parallel" || {
8787
exit 1;
88-
}
88+
}
89+
# Running one unit test that inherits from TransactionTestCase somehow changes the behaviour of how Django loads fixtures into the database.
90+
# Meaning any test after this one would fail to load our dojo_testdata.json fixture. In a way this makes sense as it contains some data integrity problems.
91+
# I tried to fix these in https://github.com/DefectDojo/django-DefectDojo/pull/13217.
92+
# For now we run the only TransactionTestCase at the end to avoid the problem.
93+
python3 manage.py test unittests -v 3 --keepdb --no-input --tag="transactional" || {
94+
exit 1;
95+
}

dojo/finding/helper.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,24 @@ def update_finding_status(new_state_finding, user, changed_fields=None):
166166
new_state_finding.last_status_update = now
167167

168168

169+
def filter_findings_by_existence(findings):
    """
    Return only findings that still exist in the database (by id).

    Centralized helper used by importers to avoid FK violations during
    bulk_create when a background task (e.g. async duplicate deletion)
    removes a finding mid-import.
    """
    if not findings:
        return []
    # Only saved findings can be checked against the database; unsaved
    # instances have id None. Compare with `is not None` so a hypothetical
    # pk of 0 is not silently discarded by truthiness.
    candidate_ids = [
        finding.id
        for finding in findings
        if getattr(finding, "id", None) is not None
    ]
    if not candidate_ids:
        return []
    # Single bulk existence query instead of one query per finding.
    existing_ids = set(
        Finding.objects.filter(id__in=candidate_ids).values_list("id", flat=True),
    )
    # Use getattr here as well, consistent with the candidate pass above, so
    # objects lacking an id attribute cannot raise AttributeError.
    return [
        finding
        for finding in findings
        if getattr(finding, "id", None) in existing_ids
    ]
185+
186+
169187
def can_edit_mitigated_data(user):
170188
return settings.EDITABLE_MITIGATED_DATA and user.is_superuser
171189

dojo/importers/base_importer.py

Lines changed: 48 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -333,6 +333,9 @@ def update_import_history(
333333
) -> Test_Import:
334334
"""Creates a record of the import or reimport operation that has occurred."""
335335
# Quick fail check to determine if we even wanted this
336+
if settings.TRACK_IMPORT_HISTORY is False:
337+
return None
338+
336339
if untouched_findings is None:
337340
untouched_findings = []
338341
if reactivated_findings is None:
@@ -341,8 +344,6 @@ def update_import_history(
341344
closed_findings = []
342345
if new_findings is None:
343346
new_findings = []
344-
if settings.TRACK_IMPORT_HISTORY is False:
345-
return None
346347
# Log the current state of what has occurred in case there could be
347348
# deviation from what is displayed in the view
348349
logger.debug(
@@ -374,30 +375,49 @@ def update_import_history(
374375
)
375376

376377
# Create a history record for each finding
377-
for finding in closed_findings:
378-
self.create_import_history_record_safe(Test_Import_Finding_Action(
379-
test_import=test_import,
380-
finding=finding,
381-
action=IMPORT_CLOSED_FINDING,
382-
))
383-
for finding in new_findings:
384-
self.create_import_history_record_safe(Test_Import_Finding_Action(
385-
test_import=test_import,
386-
finding=finding,
387-
action=IMPORT_CREATED_FINDING,
388-
))
389-
for finding in reactivated_findings:
390-
self.create_import_history_record_safe(Test_Import_Finding_Action(
391-
test_import=test_import,
392-
finding=finding,
393-
action=IMPORT_REACTIVATED_FINDING,
394-
))
395-
for finding in untouched_findings:
396-
self.create_import_history_record_safe(Test_Import_Finding_Action(
397-
test_import=test_import,
398-
finding=finding,
399-
action=IMPORT_UNTOUCHED_FINDING,
400-
))
378+
finding_action_mappings = [
379+
(closed_findings, IMPORT_CLOSED_FINDING),
380+
(new_findings, IMPORT_CREATED_FINDING),
381+
(reactivated_findings, IMPORT_REACTIVATED_FINDING),
382+
(untouched_findings, IMPORT_UNTOUCHED_FINDING),
383+
]
384+
385+
# In longer running imports it can happen that the async_dupe_delete task removes a finding before the history record is created
386+
# We filter out these findings here to avoid FK violations (IntegrityError)
387+
all_findings = []
388+
for _list, _ in finding_action_mappings:
389+
all_findings.extend(_list)
390+
existing_findings = finding_helper.filter_findings_by_existence(all_findings) if all_findings else []
391+
existing_ids = {f.id for f in existing_findings}
392+
393+
# Collect all import history records using the validated IDs
394+
import_history_records = []
395+
for findings, action in finding_action_mappings:
396+
import_history_records.extend(
397+
Test_Import_Finding_Action(
398+
test_import=test_import,
399+
finding_id=finding.id,
400+
action=action,
401+
)
402+
for finding in findings
403+
if finding.id in existing_ids
404+
)
405+
406+
# Bulk create all at once and let Django handle batching internally.
407+
# Still, in even rarer cases, a finding can be deleted by the time we arrive here.
408+
# If any integrity error occurs, fall back to inserting all records individually.
409+
# The bulk_create call is atomic, so either all batches succeed or all fail and roll back.
410+
try:
411+
# keep bulk failure contained so fallback can proceed in TestCase transaction
412+
Test_Import_Finding_Action.objects.bulk_create(
413+
import_history_records,
414+
ignore_conflicts=True,
415+
batch_size=100,
416+
)
417+
except IntegrityError:
418+
logger.warning("IntegrityError occurred while bulk creating Test_Import_Finding_Actions, falling back to individual inserts")
419+
for record in import_history_records:
420+
self.create_import_history_record_safe(record)
401421

402422
# Add any tags to the findings imported if necessary
403423
if self.apply_tags_to_findings and self.tags:
@@ -418,10 +438,10 @@ def create_import_history_record_safe(
418438
test_import_finding_action,
419439
):
420440
"""Creates an import history record, while catching any IntegrityErrors that might happen because of the background job having deleted a finding"""
421-
logger.debug(f"creating Test_Import_Finding_Action for finding: {test_import_finding_action.finding.id} action: {test_import_finding_action.action}")
441+
logger.debug(f"creating Test_Import_Finding_Action for finding_id: {test_import_finding_action.finding_id} action: {test_import_finding_action.action}")
422442
try:
423443
test_import_finding_action.save()
424-
except IntegrityError as e:
444+
except (IntegrityError, ValueError) as e:
425445
# This try/except makes it look like we don't know what we're doing, but in https://github.com/DefectDojo/django-DefectDojo/issues/6217 we decided that for now this is the best solution
426446
logger.warning("Error creating Test_Import_Finding_Action: %s", e)
427447
logger.debug("Error creating Test_Import_Finding_Action, finding marked as duplicate and deleted ?")

dojo/settings/settings.dist.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,10 @@
3030
DD_SITE_URL=(str, "http://localhost:8080"),
3131
DD_DEBUG=(bool, False),
3232
DD_DJANGO_DEBUG_TOOLBAR_ENABLED=(bool, False),
33+
# django-auditlog imports django-jsonfield-backport, which raises a warning that can be ignored,
34+
# see https://github.com/laymonage/django-jsonfield-backport
35+
# debug_toolbar.E001 is raised when running tests in dev mode via run-unittests.sh
36+
DD_SILENCED_SYSTEM_CHECKS=(list, ["debug_toolbar.E001", "django_jsonfield_backport.W001"]),
3337
DD_TEMPLATE_DEBUG=(bool, False),
3438
DD_LOG_LEVEL=(str, ""),
3539
DD_DJANGO_METRICS_ENABLED=(bool, False),
@@ -740,6 +744,7 @@ def generate_url(scheme, double_slashes, user, password, host, port, path, param
740744
# Override default Django behavior for incorrect URLs
741745
APPEND_SLASH = env("DD_APPEND_SLASH")
742746

747+
743748
# Whether to use a secure cookie for the CSRF cookie.
744749
CSRF_COOKIE_SECURE = env("DD_CSRF_COOKIE_SECURE")
745750
CSRF_COOKIE_SAMESITE = env("DD_CSRF_COOKIE_SAMESITE")
@@ -1814,9 +1819,7 @@ def saml2_attrib_map_format(din):
18141819
# for very large objects
18151820
DELETE_PREVIEW = env("DD_DELETE_PREVIEW")
18161821

1817-
# django-auditlog imports django-jsonfield-backport raises a warning that can be ignored,
1818-
# see https://github.com/laymonage/django-jsonfield-backport
1819-
SILENCED_SYSTEM_CHECKS = ["django_jsonfield_backport.W001"]
1822+
SILENCED_SYSTEM_CHECKS = env("DD_SILENCED_SYSTEM_CHECKS")
18201823

18211824
VULNERABILITY_URLS = {
18221825
"ALAS": "https://alas.aws.amazon.com/AL2/&&.html", # e.g. https://alas.aws.amazon.com/alas2.html

unittests/test_importers_performance.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -176,11 +176,11 @@ def import_reimport_performance(self, expected_num_queries1, expected_num_async_
176176
# def test_import_reimport_reimport_performance_async(self, mock):
177177
def test_import_reimport_reimport_performance_async(self):
178178
self.import_reimport_performance(
179-
expected_num_queries1=682,
179+
expected_num_queries1=679,
180180
expected_num_async_tasks1=10,
181-
expected_num_queries2=610,
181+
expected_num_queries2=606,
182182
expected_num_async_tasks2=22,
183-
expected_num_queries3=292,
183+
expected_num_queries3=289,
184184
expected_num_async_tasks3=20,
185185
)
186186

@@ -198,11 +198,11 @@ def test_import_reimport_reimport_performance_no_async(self):
198198
testuser.usercontactinfo.block_execution = True
199199
testuser.usercontactinfo.save()
200200
self.import_reimport_performance(
201-
expected_num_queries1=682,
201+
expected_num_queries1=679,
202202
expected_num_async_tasks1=10,
203-
expected_num_queries2=615,
203+
expected_num_queries2=611,
204204
expected_num_async_tasks2=22,
205-
expected_num_queries3=297,
205+
expected_num_queries3=294,
206206
expected_num_async_tasks3=20,
207207
)
208208

@@ -222,10 +222,10 @@ def test_import_reimport_reimport_performance_no_async_with_product_grading(self
222222
self.system_settings(enable_product_grade=True)
223223

224224
self.import_reimport_performance(
225-
expected_num_queries1=687,
225+
expected_num_queries1=684,
226226
expected_num_async_tasks1=15,
227-
expected_num_queries2=621,
227+
expected_num_queries2=617,
228228
expected_num_async_tasks2=28,
229-
expected_num_queries3=302,
229+
expected_num_queries3=299,
230230
expected_num_async_tasks3=25,
231231
)
Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
import logging
2+
from unittest.mock import patch
3+
4+
from django.contrib.auth.models import User as DjangoUser
5+
from django.test import TransactionTestCase, tag
6+
from django.utils import timezone
7+
8+
from dojo.importers.default_importer import DefaultImporter
9+
from dojo.models import (
10+
Development_Environment,
11+
Engagement,
12+
Finding,
13+
Product,
14+
Product_Type,
15+
SLA_Configuration,
16+
Test,
17+
Test_Import_Finding_Action,
18+
)
19+
20+
logger = logging.getLogger(__name__)
21+
22+
23+
# We need to run this as a TransactionTestCase to be able to mimic, at runtime, the behavior of the bulk_create fallback when a FK violation occurs
24+
25+
26+
@tag("transactional")
class UpdateImportHistoryTests(TransactionTestCase):
    """Tests for update_import_history: the bulk_create fast path and the
    per-record fallback taken when an IntegrityError occurs mid-bulk."""

    # Loading JSON fixtures fails in TransactionTestCase — possibly because
    # they are not up-to-date and are missing fields like sla_configuration —
    # so the test data is created in code instead, which is more robust here.
    def setUp(self):
        super().setUp()
        self.env, _ = Development_Environment.objects.get_or_create(name="Development")
        self.prod_type = Product_Type.objects.create(name="UpdateImportHistory PT")
        # Ensure a valid SLA configuration exists and is assigned explicitly to avoid default FK issues
        self.sla = SLA_Configuration.objects.create(name="UpdateImportHistory SLA")
        self.prod = Product.objects.create(
            name="UpdateImportHistory P",
            prod_type=self.prod_type,
            sla_configuration=self.sla,
        )
        self.eng = Engagement.objects.create(
            name="UpdateImportHistory E",
            product=self.prod,
            target_start=timezone.now(),
            target_end=timezone.now(),
        )
        # Ensure a reporter/lead user exists for FK constraints
        self.user = DjangoUser.objects.create(username="admin")

        # Minimal importer configuration; the scan type itself is irrelevant
        # to these tests, only the history bookkeeping is exercised.
        self.importer = DefaultImporter(
            user=self.user,
            lead=self.user,
            environment=self.env,
            engagement=self.eng,
            minimum_severity="Info",
            active=True,
            verified=True,
            sync=True,
            scan_type="StackHawk HawkScan",
        )
        # Explicitly create the Test similar to Engagement creation
        self.test = Test.objects.create(
            title="UpdateImportHistory T",
            engagement=self.eng,
            lead=self.user,
            environment=self.env,
            test_type=self.importer.get_or_create_test_type("StackHawk HawkScan"),
            scan_type="StackHawk HawkScan",
            target_start=timezone.now(),
            target_end=timezone.now(),
            percent_complete=0,
        )
        # Attach to importer so update_import_history records against it
        self.importer.test = self.test

    def _create_findings(self, count):
        # Helper: persist `count` minimal findings attached to the test.
        findings = []
        for i in range(count):
            f = Finding(
                title=f"F{i}",
                test=self.importer.test,
                severity="Low",
                reporter=self.user,
            )
            f.save()
            findings.append(f)
        return findings

    def test_success_path_creates_expected_actions(self):
        # Happy path: every finding exists, so bulk_create succeeds and one
        # Test_Import_Finding_Action is created per finding.
        new_findings = self._create_findings(5)
        closed_findings = self._create_findings(3)

        test_import = self.importer.update_import_history(
            new_findings=new_findings,
            closed_findings=closed_findings,
        )

        total_expected = len(new_findings) + len(closed_findings)
        created = Test_Import_Finding_Action.objects.filter(test_import=test_import).count()
        self.assertEqual(created, total_expected)

    def test_fk_violation_in_batch_results_in_partial_fallback(self):
        # One bad finding (deleted after the pre-check) triggers IntegrityError; the fallback saves the valid ones
        new_findings = self._create_findings(9)
        bad = self._create_findings(1)[0]

        # Patch the existence filter to return all findings as-if they exist, then delete one to simulate a race after the check
        with patch("dojo.finding.helper.filter_findings_by_existence", side_effect=lambda lst: lst):
            bad_id = bad.id
            Finding.objects.filter(id=bad_id).delete()
            test_import = self.importer.update_import_history(new_findings=[*new_findings, bad])

        created = Test_Import_Finding_Action.objects.filter(test_import=test_import).count()
        # Expect only the 9 valid ones to be created; the bad one is skipped/raises during fallback
        self.assertEqual(created, len(new_findings))

    def test_fk_violation_second_batch_results_in_partial_fallback(self):
        # Create 300 findings so Django's bulk_create will batch internally (batch_size=100)
        total = 300
        new_findings = self._create_findings(total)

        # Delete a finding in the second batch (index 150) after the existence check
        bad = new_findings[150]
        with patch("dojo.finding.helper.filter_findings_by_existence", side_effect=lambda lst: lst):
            Finding.objects.filter(id=bad.id).delete()
            test_import = self.importer.update_import_history(new_findings=new_findings)

        # Expect all but the deleted one to be created via the fallback
        created = Test_Import_Finding_Action.objects.filter(test_import=test_import).count()
        self.assertEqual(created, total - 1)

    def test_precheck_filters_out_deleted_findings_allows_successful_bulk(self):
        # If a finding is deleted before the existence check, it should be filtered out
        new_findings = self._create_findings(5)
        closed_findings = self._create_findings(3)

        # Delete one from new and one from closed before calling update_import_history
        Finding.objects.filter(id=new_findings[0].id).delete()
        Finding.objects.filter(id=closed_findings[0].id).delete()

        test_import = self.importer.update_import_history(
            new_findings=new_findings,
            closed_findings=closed_findings,
        )

        expected = (len(new_findings) - 1) + (len(closed_findings) - 1)
        created = Test_Import_Finding_Action.objects.filter(test_import=test_import).count()
        self.assertEqual(created, expected)

0 commit comments

Comments
 (0)