Skip to content

Commit ff4b9a8

Browse files
Import performance: reduce number of finding.save() calls (#12900)
* test cases: fix caching of system settings * fix tests * fix caching for github * fix caching for github * simplify cache loading * post process only when needed * set tags on (re)import * rebase set tags * reduce save with options * update counts, reduce saves with options * importers: do not save again, but postprocess directly * update counts * optimize hash_code setting * fix counts * set hash code for new findings in reimport * make smaller second save work * make smaller second save work - add no_options * update query counts * improve we_want_async decorator * test performance: force async * fix async stuff in perf test * fix async stuff in perf test * fix async stuff in perf test * update counts * remove logging * perf3b: compute hash_code on first save * fix cve for reimport * ruff * fix no async * Merge remote-tracking branch 'upstream/dev' into perf3-reduce-saves
1 parent 7db9e6b commit ff4b9a8

6 files changed

Lines changed: 97 additions & 41 deletions

File tree

dojo/decorators.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -64,13 +64,17 @@ def we_want_async(*args, func=None, **kwargs):
6464
return False
6565

6666
user = kwargs.get("async_user", get_current_user())
67-
logger.debug("user: %s", user)
67+
logger.debug("async user: %s", user)
68+
69+
if not user:
70+
logger.debug("dojo_async_task %s: no current user, running task in the background", func)
71+
return True
6872

6973
if Dojo_User.wants_block_execution(user):
7074
logger.debug("dojo_async_task %s: running task in the foreground as block_execution is set to True for %s", func, user)
7175
return False
7276

73-
logger.debug("dojo_async_task %s: no current user, running task in the background", func)
77+
logger.debug("dojo_async_task %s: running task in the background as user has not set block_execution to True for %s", func, user)
7478
return True
7579

7680

dojo/importers/base_importer.py

Lines changed: 12 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -682,14 +682,11 @@ def process_endpoints(
682682
logger.debug("endpoints_to_add: %s", endpoints_to_add)
683683
self.endpoint_manager.chunk_endpoints_and_disperse(finding, endpoints_to_add)
684684

685-
def process_vulnerability_ids(
685+
def process_cve(
686686
self,
687687
finding: Finding,
688688
) -> Finding:
689-
"""
690-
Parse the `unsaved_vulnerability_ids` field from findings after they are parsed
691-
to create `Vulnerability_Id` objects with the finding associated correctly
692-
"""
689+
"""Ensure cve is set from the unsaved_vulnerability_ids field, or vice versa."""
693690
# Synchronize the cve field with the unsaved_vulnerability_ids
694691
# We do this to be as flexible as possible to handle the fields until
695692
# the cve field is not needed anymore and can be removed.
@@ -703,6 +700,16 @@ def process_vulnerability_ids(
703700
# If there is no list, make one with the value of the cve field
704701
finding.unsaved_vulnerability_ids = [finding.cve]
705702

703+
return finding
704+
705+
def process_vulnerability_ids(
706+
self,
707+
finding: Finding,
708+
) -> Finding:
709+
"""
710+
Parse the `unsaved_vulnerability_ids` field from findings after they are parsed
711+
to create `Vulnerability_Id` objects with the finding associated correctly
712+
"""
706713
if finding.unsaved_vulnerability_ids:
707714
# Remove old vulnerability ids - keeping this call only because of flake8
708715
Vulnerability_Id.objects.filter(finding=finding).delete()

dojo/importers/default_importer.py

Lines changed: 11 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -183,7 +183,7 @@ def process_findings(
183183
unsaved_finding.reporter = self.user
184184
unsaved_finding.last_reviewed_by = self.user
185185
unsaved_finding.last_reviewed = self.now
186-
logger.debug("process_parsed_findings: active from report: %s, verified from report: %s", unsaved_finding.active, unsaved_finding.verified)
186+
logger.debug("process_parsed_findings: unique_id_from_tool: %s, hash_code: %s, active from report: %s, verified from report: %s", unsaved_finding.unique_id_from_tool, unsaved_finding.hash_code, unsaved_finding.active, unsaved_finding.verified)
187187
# indicates an override. Otherwise, do not change the value of unsaved_finding.active
188188
if self.active is not None:
189189
unsaved_finding.active = self.active
@@ -198,8 +198,13 @@ def process_findings(
198198

199199
# Force parsers to use unsaved_tags (stored in below after saving)
200200
unsaved_finding.tags = None
201-
# postprocessing will be done on next save.
201+
finding = self.process_cve(unsaved_finding)
202+
# Calculate hash_code before saving based on unsaved_endpoints and unsaved_vulnerability_ids
203+
finding.set_hash_code(True)
204+
205+
# postprocessing will be done after processing related fields like endpoints, vulnerability ids, etc.
202206
unsaved_finding.save_no_options()
207+
203208
finding = unsaved_finding
204209
# Determine how the finding should be grouped
205210
self.process_finding_groups(
@@ -218,11 +223,11 @@ def process_findings(
218223
finding = self.process_vulnerability_ids(finding)
219224
# Categorize this finding as a new one
220225
new_findings.append(finding)
226+
# all data is already saved on the finding, we only need to trigger post processing
227+
221228
# to avoid pushing a finding group multiple times, we push those outside of the loop
222-
if self.findings_groups_enabled and self.group_by:
223-
finding.save()
224-
else:
225-
finding.save(push_to_jira=self.push_to_jira)
229+
push_to_jira = self.push_to_jira and (not self.findings_groups_enabled or not self.group_by)
230+
finding_helper.post_process_finding_save(finding, dedupe_option=True, rules_option=True, product_grading_option=True, issue_updater_option=True, push_to_jira=push_to_jira)
226231

227232
for (group_name, findings) in group_names_to_findings_dict.items():
228233
finding_helper.add_findings_to_auto_group(

dojo/importers/default_reimporter.py

Lines changed: 21 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -236,12 +236,11 @@ def process_findings(
236236
finding,
237237
unsaved_finding,
238238
)
239-
# finding = new finding or existing finding still in the upload report
239+
# all data is already saved on the finding, we only need to trigger post processing
240+
240241
# to avoid pushing a finding group multiple times, we push those outside of the loop
241-
if self.findings_groups_enabled and self.group_by:
242-
finding.save()
243-
else:
244-
finding.save(push_to_jira=self.push_to_jira)
242+
push_to_jira = self.push_to_jira and (not self.findings_groups_enabled or not self.group_by)
243+
finding_helper.post_process_finding_save(finding, dedupe_option=True, rules_option=True, product_grading_option=True, issue_updater_option=True, push_to_jira=push_to_jira)
245244

246245
self.to_mitigate = (set(self.original_items) - set(self.reactivated_items) - set(self.unchanged_items))
247246
# due to #3958 we can have duplicates inside the same report
@@ -336,11 +335,14 @@ def match_new_finding_to_existing_finding(
336335
hash_code=unsaved_finding.hash_code,
337336
).exclude(hash_code=None).order_by("id")
338337
if self.deduplication_algorithm == "unique_id_from_tool":
338+
deduplicationLogger.debug(f"unique_id_from_tool: {unsaved_finding.unique_id_from_tool}")
339339
return Finding.objects.filter(
340340
test=self.test,
341341
unique_id_from_tool=unsaved_finding.unique_id_from_tool,
342342
).exclude(unique_id_from_tool=None).order_by("id")
343343
if self.deduplication_algorithm == "unique_id_from_tool_or_hash_code":
344+
deduplicationLogger.debug(f"unique_id_from_tool: {unsaved_finding.unique_id_from_tool}")
345+
deduplicationLogger.debug(f"hash_code: {unsaved_finding.hash_code}")
344346
query = Finding.objects.filter(
345347
Q(test=self.test),
346348
(Q(hash_code__isnull=False) & Q(hash_code=unsaved_finding.hash_code))
@@ -500,10 +502,13 @@ def process_matched_mitigated_finding(
500502
if existing_finding.get_sla_configuration().restart_sla_on_reactivation:
501503
# restart the sla start date to the current date, finding.save() will set new sla_expiration_date
502504
existing_finding.sla_start_date = self.now
505+
existing_finding = self.process_cve(existing_finding)
506+
if existing_finding.get_sla_configuration().restart_sla_on_reactivation:
507+
# restart the sla start date to the current date, finding.save() will set new sla_expiration_date
508+
existing_finding.sla_start_date = self.now
509+
# don't dedupe before endpoints are added, postprocessing will be done on next save (in calling method)
510+
existing_finding.save_no_options()
503511

504-
existing_finding.save(dedupe_option=False)
505-
# don't dedupe before endpoints are added
506-
existing_finding.save(dedupe_option=False)
507512
note = Notes(entry=f"Re-activated by {self.scan_type} re-upload.", author=self.user)
508513
note.save()
509514
endpoint_statuses = existing_finding.status_finding.exclude(
@@ -551,6 +556,9 @@ def process_matched_active_finding(
551556
existing_finding.active = False
552557
if self.verified is not None:
553558
existing_finding.verified = self.verified
559+
existing_finding = self.process_cve(existing_finding)
560+
existing_finding.save_no_options()
561+
554562
elif unsaved_finding.risk_accepted or unsaved_finding.false_p or unsaved_finding.out_of_scope:
555563
logger.debug("Reimported mitigated item matches a finding that is currently open, closing.")
556564
logger.debug(
@@ -563,6 +571,8 @@ def process_matched_active_finding(
563571
existing_finding.active = False
564572
if self.verified is not None:
565573
existing_finding.verified = self.verified
574+
existing_finding = self.process_cve(existing_finding)
575+
existing_finding.save_no_options()
566576
else:
567577
# if finding is the same but list of affected was changed, finding is marked as unchanged. This is a known issue
568578
self.unchanged_items.append(existing_finding)
@@ -597,6 +607,8 @@ def process_finding_that_was_not_matched(
597607
# scan_date was provided, override value from parser
598608
if self.scan_date_override:
599609
unsaved_finding.date = self.scan_date.date()
610+
unsaved_finding = self.process_cve(unsaved_finding)
611+
# Hash code is already calculated earlier as it's the primary matching criteria for reimport
600612
# Save it. Don't dedupe before endpoints are added.
601613
unsaved_finding.save_no_options()
602614
finding = unsaved_finding
@@ -640,7 +652,7 @@ def finding_post_processing(
640652
# Process vulnerability IDs
641653
if finding_from_report.unsaved_vulnerability_ids:
642654
finding.unsaved_vulnerability_ids = finding_from_report.unsaved_vulnerability_ids
643-
655+
# legacy cve field has already been processed/set earlier
644656
return self.process_vulnerability_ids(finding)
645657

646658
def process_groups_for_all_findings(

unittests/test_import_reimport.py

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -658,7 +658,8 @@ def test_import_veracode_reimport_veracode_same_hash_code_different_unique_id(se
658658
def test_import_veracode_reimport_veracode_same_unique_id_different_hash_code(self):
659659
logger.debug("reimporting report with one finding having same unique_id_from_tool but different hash_code, verified=False")
660660

661-
import_veracode_many_findings = self.import_scan_with_params(self.veracode_many_findings, scan_type=self.scan_type_veracode)
661+
with assertTestImportModelsCreated(self, imports=1, created=4, affected_findings=4, closed=0, reactivated=0, untouched=0):
662+
import_veracode_many_findings = self.import_scan_with_params(self.veracode_many_findings, scan_type=self.scan_type_veracode)
662663

663664
test_id = import_veracode_many_findings["test"]
664665

@@ -1748,6 +1749,9 @@ def __init__(self, *args, **kwargs):
17481749

17491750
def setUp(self):
17501751
testuser = User.objects.get(username="admin")
1752+
testuser.usercontactinfo.block_execution = True
1753+
testuser.usercontactinfo.save()
1754+
17511755
token = Token.objects.get(user=testuser)
17521756
self.client = APIClient()
17531757
self.client.credentials(HTTP_AUTHORIZATION="Token " + token.key)
@@ -2023,6 +2027,9 @@ def __init__(self, *args, **kwargs):
20232027
def setUp(self):
20242028
# still using the API to verify results
20252029
testuser = User.objects.get(username="admin")
2030+
testuser.usercontactinfo.block_execution = True
2031+
testuser.usercontactinfo.save()
2032+
20262033
token = Token.objects.get(user=testuser)
20272034
self.client = APIClient()
20282035
self.client.credentials(HTTP_AUTHORIZATION="Token " + token.key)

unittests/test_importers_performance.py

Lines changed: 39 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,5 @@
11
import logging
22
from contextlib import contextmanager
3-
from unittest.mock import patch
43

54
from crum import impersonate
65
from django.contrib.contenttypes.models import ContentType
@@ -20,12 +19,15 @@
2019
Product_Type,
2120
Test,
2221
User,
22+
UserContactInfo,
2323
)
2424

2525
from .dojo_test_case import DojoTestCase, get_unit_tests_scans_path
2626

27+
logging.basicConfig(level=logging.DEBUG)
2728
logger = logging.getLogger(__name__)
2829

30+
2931
STACK_HAWK_FILENAME = get_unit_tests_scans_path("stackhawk") / "stackhawk_many_vul_without_duplicated_findings.json"
3032
STACK_HAWK_SUBSET_FILENAME = get_unit_tests_scans_path("stackhawk") / "stackhawk_many_vul_without_duplicated_findings_subset.json"
3133
STACK_HAWK_SCAN_TYPE = "StackHawk HawkScan"
@@ -39,6 +41,9 @@ class TestDojoImporterPerformance(DojoTestCase):
3941
def setUp(self):
4042
super().setUp()
4143

44+
testuser = User.objects.create(username="admin")
45+
UserContactInfo.objects.create(user=testuser, block_execution=False)
46+
4247
self.system_settings(enable_webhooks_notifications=False)
4348
self.system_settings(enable_product_grade=False)
4449
self.system_settings(enable_github=False)
@@ -67,6 +72,14 @@ def assertNumAsyncTask(self, num):
6772
)
6873
raise self.failureException(msg)
6974

75+
tasks = dojo_async_task_counter.get_tasks()
76+
tasks_str = "\n".join(str(task) for task in tasks)
77+
msg = (
78+
f"Correct number of {num} celery tasks were created.\n"
79+
f"Tasks created:\n{tasks_str}"
80+
)
81+
logger.debug(msg)
82+
7083
def import_reimport_performance(self, expected_num_queries1, expected_num_async_tasks1, expected_num_queries2, expected_num_async_tasks2, expected_num_queries3, expected_num_async_tasks3):
7184
"""
7285
Log output can be quite large as when the assertNumQueries fails, all queries are printed.
@@ -158,53 +171,61 @@ def import_reimport_performance(self, expected_num_queries1, expected_num_async_
158171
reimporter = DefaultReImporter(**reimport_options)
159172
test, _, _len_new_findings, _len_closed_findings, _, _, _ = reimporter.process_scan(scan)
160173

161-
def test_import_reimport_reimport_performance(self):
174+
# patch the we_want_async decorator to always return True so we don't depend on block_execution flag shenanigans
175+
# @patch("dojo.decorators.we_want_async", return_value=True)
176+
# def test_import_reimport_reimport_performance_async(self, mock):
177+
def test_import_reimport_reimport_performance_async(self):
162178
self.import_reimport_performance(
163-
expected_num_queries1=712,
179+
expected_num_queries1=682,
164180
expected_num_async_tasks1=10,
165-
expected_num_queries2=656,
181+
expected_num_queries2=610,
166182
expected_num_async_tasks2=22,
167-
expected_num_queries3=332,
183+
expected_num_queries3=292,
168184
expected_num_async_tasks3=20,
169185
)
170186

171-
@patch("dojo.decorators.we_want_async", return_value=False)
172-
def test_import_reimport_reimport_performance_no_async(self, mock):
187+
# @patch("dojo.decorators.we_want_async", return_value=False)
188+
# def test_import_reimport_reimport_performance_no_async(self, mock):
189+
def test_import_reimport_reimport_performance_no_async(self):
173190
"""
174191
This test checks the performance of the importers when they are run in sync mode.
175192
The reason for this is that we also want to be aware of when a PR affects the number of queries
176193
or async tasks created by a background task.
177194
The impersonate context manager above does not work as expected for disabling async,
178195
so we patch the we_want_async decorator to always return False.
179196
"""
197+
testuser = User.objects.get(username="admin")
198+
testuser.usercontactinfo.block_execution = True
199+
testuser.usercontactinfo.save()
180200
self.import_reimport_performance(
181-
expected_num_queries1=712,
201+
expected_num_queries1=682,
182202
expected_num_async_tasks1=10,
183-
expected_num_queries2=656,
203+
expected_num_queries2=615,
184204
expected_num_async_tasks2=22,
185-
expected_num_queries3=332,
205+
expected_num_queries3=297,
186206
expected_num_async_tasks3=20,
187207
)
188208

189-
@patch("dojo.decorators.we_want_async", return_value=False)
190-
def test_import_reimport_reimport_performance_no_async_with_product_grading(self, mock):
209+
# @patch("dojo.decorators.we_want_async", return_value=False)
210+
# def test_import_reimport_reimport_performance_no_async_with_product_grading(self, mock):
211+
def test_import_reimport_reimport_performance_no_async_with_product_grading(self):
191212
"""
192213
This test checks the performance of the importers when they are run in sync mode.
193214
The reason for this is that we also want to be aware of when a PR affects the number of queries
194215
or async tasks created by a background task.
195216
The impersonate context manager above does not work as expected for disabling async,
196217
so we patch the we_want_async decorator to always return False.
197218
"""
219+
testuser = User.objects.get(username="admin")
220+
testuser.usercontactinfo.block_execution = True
221+
testuser.usercontactinfo.save()
198222
self.system_settings(enable_product_grade=True)
199-
# Refresh the cache with the new settings
200-
from dojo.middleware import DojoSytemSettingsMiddleware
201-
DojoSytemSettingsMiddleware.load()
202223

203224
self.import_reimport_performance(
204-
expected_num_queries1=732,
225+
expected_num_queries1=702,
205226
expected_num_async_tasks1=15,
206-
expected_num_queries2=686,
227+
expected_num_queries2=645,
207228
expected_num_async_tasks2=28,
208-
expected_num_queries3=357,
229+
expected_num_queries3=322,
209230
expected_num_async_tasks3=25,
210231
)

0 commit comments

Comments
 (0)