Skip to content

Commit 87e5d45

Browse files
switch back to chords
1 parent 26295cb commit 87e5d45

3 files changed

Lines changed: 68 additions & 50 deletions

File tree

dojo/importers/default_importer.py

Lines changed: 34 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
import logging
22

3+
from celery import chord
34
from django.core.files.uploadedfile import TemporaryUploadedFile
45
from django.core.serializers import serialize
56
from django.db.models.query_utils import Q
67
from django.urls import reverse
78

89
import dojo.jira_link.helper as jira_helper
10+
from dojo import utils
911
from dojo.decorators import we_want_async
1012
from dojo.finding import helper as finding_helper
1113
from dojo.importers.base_importer import BaseImporter, Parser
@@ -17,7 +19,6 @@
1719
Test_Import,
1820
)
1921
from dojo.notifications.helper import create_notification
20-
from dojo.tasks import wait_for_tasks_and_calculate_grade
2122
from dojo.utils import calculate_grade
2223
from dojo.validators import clean_tags
2324

@@ -158,7 +159,11 @@ def process_findings(
158159
parsed_findings: list[Finding],
159160
**kwargs: dict,
160161
) -> list[Finding]:
161-
async_task_ids = []
162+
# Progressive batching for chord execution
163+
post_processing_task_signatures = []
164+
current_batch_number = 1
165+
max_batch_size = 1024
166+
pending_grade_calculations = []
162167

163168
"""
164169
Saves findings in memory that were parsed from the scan report into the database.
@@ -248,10 +253,25 @@ def process_findings(
248253
push_to_jira=push_to_jira,
249254
)
250255

251-
# We need to call apply_async to get the result of the task so we can collect the task ID
252256
if we_want_async(async_user=self.user):
253-
result = post_processing_task_signature.apply_async()
254-
async_task_ids.append(result.id)
257+
# Collect signatures for progressive batch execution
258+
post_processing_task_signatures.append(post_processing_task_signature)
259+
260+
# Calculate current batch size: 2^batch_number, capped at max_batch_size
261+
current_batch_size = min(2 ** current_batch_number, max_batch_size)
262+
263+
# Launch chord when batch is full
264+
if len(post_processing_task_signatures) >= current_batch_size:
265+
product = self.test.engagement.product
266+
calculate_grade_signature = utils.calculate_grade_signature(product)
267+
chord_result = chord(post_processing_task_signatures)(calculate_grade_signature)
268+
pending_grade_calculations.append(chord_result)
269+
270+
logger.debug(f"Launched chord with {len(post_processing_task_signatures)} tasks (batch #{current_batch_number}, size: {current_batch_size})")
271+
272+
# Reset for next batch
273+
post_processing_task_signatures = []
274+
current_batch_number += 1
255275
else:
256276
# Execute task immediately for synchronous processing
257277
post_processing_task_signature()
@@ -270,14 +290,18 @@ def process_findings(
270290
else:
271291
jira_helper.push_to_jira(findings[0])
272292

273-
# Calculate product grade after all findings are processed
293+
# Handle any remaining signatures in the final batch
274294
product = self.test.engagement.product
275295

276-
if we_want_async(async_user=self.user) and async_task_ids:
277-
# Tasks were executed immediately during processing, now coordinate final grade calculation
278-
wait_for_tasks_and_calculate_grade.delay(async_task_ids, product.id)
296+
if we_want_async(async_user=self.user):
297+
if post_processing_task_signatures:
298+
# Launch final chord with remaining signatures
299+
calculate_grade_signature = utils.calculate_grade_signature(product)
300+
chord_result = chord(post_processing_task_signatures)(calculate_grade_signature)
301+
pending_grade_calculations.append(chord_result)
302+
logger.debug(f"Launched final chord with {len(post_processing_task_signatures)} remaining tasks")
279303

280-
# Synchronous tasks were already executed during processing, just calculate grade
304+
# Always perform an initial grading, even though it might get overwritten later.
281305
calculate_grade(product)
282306

283307
sync = kwargs.get("sync", True)

dojo/importers/default_reimporter.py

Lines changed: 34 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
import logging
22

3+
from celery import chord
34
from django.core.files.uploadedfile import TemporaryUploadedFile
45
from django.core.serializers import serialize
56
from django.db.models.query_utils import Q
67

78
import dojo.finding.helper as finding_helper
89
import dojo.jira_link.helper as jira_helper
10+
from dojo import utils
911
from dojo.decorators import we_want_async
1012
from dojo.importers.base_importer import BaseImporter, Parser
1113
from dojo.importers.options import ImporterOptions
@@ -16,7 +18,6 @@
1618
Test,
1719
Test_Import,
1820
)
19-
from dojo.tasks import wait_for_tasks_and_calculate_grade
2021
from dojo.utils import calculate_grade
2122
from dojo.validators import clean_tags
2223

@@ -179,7 +180,11 @@ def process_findings(
179180
self.reactivated_items = []
180181
self.unchanged_items = []
181182
self.group_names_to_findings_dict = {}
182-
async_task_ids = []
183+
# Progressive batching for chord execution
184+
post_processing_task_signatures = []
185+
current_batch_number = 1
186+
max_batch_size = 1024
187+
pending_grade_calculations = []
183188

184189
logger.debug(f"starting reimport of {len(parsed_findings) if parsed_findings else 0} items.")
185190
logger.debug("STEP 1: looping over findings from the reimported report and trying to match them to existing findings")
@@ -254,9 +259,24 @@ def process_findings(
254259
push_to_jira=push_to_jira,
255260
)
256261
if we_want_async(async_user=self.user):
257-
# Execute task immediately and collect task ID
258-
result = post_processing_task_signature.apply_async()
259-
async_task_ids.append(result.id)
262+
# Collect signatures for progressive batch execution
263+
post_processing_task_signatures.append(post_processing_task_signature)
264+
265+
# Calculate current batch size: 2^batch_number, capped at max_batch_size
266+
current_batch_size = min(2 ** current_batch_number, max_batch_size)
267+
268+
# Launch chord when batch is full
269+
if len(post_processing_task_signatures) >= current_batch_size:
270+
product = self.test.engagement.product
271+
calculate_grade_signature = utils.calculate_grade_signature(product)
272+
chord_result = chord(post_processing_task_signatures)(calculate_grade_signature)
273+
pending_grade_calculations.append(chord_result)
274+
275+
logger.debug(f"Launched chord with {len(post_processing_task_signatures)} tasks (batch #{current_batch_number}, size: {current_batch_size})")
276+
277+
# Reset for next batch
278+
post_processing_task_signatures = []
279+
current_batch_number += 1
260280
else:
261281
# Execute task immediately for synchronous processing
262282
post_processing_task_signature()
@@ -272,12 +292,17 @@ def process_findings(
272292
# Process groups
273293
self.process_groups_for_all_findings(**kwargs)
274294

275-
# Calculate product grade once after all findings are processed
295+
# Handle any remaining signatures in the final batch
276296
product = self.test.engagement.product
277297

278-
if we_want_async(async_user=self.user) and async_task_ids:
279-
# Tasks were executed immediately during processing, now coordinate final grade calculation
280-
wait_for_tasks_and_calculate_grade.delay(async_task_ids, product.id)
298+
if we_want_async(async_user=self.user):
299+
if post_processing_task_signatures:
300+
# Launch final chord with remaining signatures
301+
calculate_grade_signature = utils.calculate_grade_signature(product)
302+
chord_result = chord(post_processing_task_signatures)(calculate_grade_signature)
303+
pending_grade_calculations.append(chord_result)
304+
logger.debug(f"Launched final chord with {len(post_processing_task_signatures)} remaining tasks")
305+
281306
# Synchronous tasks were already executed during processing, just calculate grade
282307
calculate_grade(product)
283308

dojo/tasks.py

Lines changed: 0 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
from datetime import date, timedelta
33

44
from auditlog.models import LogEntry
5-
from celery.result import AsyncResult
65
from celery.utils.log import get_task_logger
76
from dateutil.relativedelta import relativedelta
87
from django.conf import settings
@@ -193,36 +192,6 @@ def fix_loop_duplicates_task(*args, **kwargs):
193192
return fix_loop_duplicates()
194193

195194

196-
@app.task
197-
def wait_for_tasks_and_calculate_grade(task_ids, product_id, *args, **kwargs):
198-
"""
199-
Wait for all specified tasks to complete, then calculate product grade.
200-
This provides coordination for immediate task execution without using chord.
201-
"""
202-
logger.info(f"Waiting for {len(task_ids)} tasks to complete before calculating grade for product {product_id}")
203-
204-
# Wait for all tasks to complete
205-
results = [AsyncResult(task_id) for task_id in task_ids]
206-
207-
# This will block until all tasks are done
208-
for result in results:
209-
try:
210-
result.get(timeout=300) # 5 minute timeout per task
211-
except Exception as e:
212-
logger.warning(f"Task {result.id} failed: {e}")
213-
# Continue waiting for other tasks even if one fails
214-
215-
# All tasks completed, now calculate grade
216-
try:
217-
product = Product.objects.get(id=product_id)
218-
logger.info(f"All post-processing tasks completed, calculating grade for product {product.name}")
219-
calculate_grade(product)
220-
except Product.DoesNotExist:
221-
logger.error(f"Product {product_id} not found for grade calculation")
222-
except Exception as e:
223-
logger.error(f"Error calculating grade for product {product_id}: {e}")
224-
225-
226195
@app.task
227196
def evaluate_pro_proposition(*args, **kwargs):
228197
# Ensure we should be doing this

0 commit comments

Comments
 (0)