Skip to content

Commit ff4b9a8

Browse files
Import performance: reduce number of finding.save() calls (#12900)
* test cases: fix caching of system settings * fix tests * fix caching for github * fix caching for github * simplify cache loading * post process only when needed * set tags on (re)import * rebase set tags * reduce save with options * update counts, reduce saves with options * importers: do not save again, but postprocess directly * update counts * optimize hash_code setting * fix counts * set hash code for new findings in reimport * make smaller second save work * make smaller second save work - add no_options * update query counts * improve we_want_async decorator * test performance: force async * fix async stuff in perf test * fix async stuff in perf test * fix async stuff in perf test * update counts * remove logging * perf3b: compute hash_code on first save * fix cve for reimport * ruff * fix no async * Merge remote-tracking branch 'upstream/dev' into perf3-reduce-saves
1 parent 7db9e6b commit ff4b9a8

6 files changed

Lines changed: 97 additions & 41 deletions

File tree

dojo/decorators.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -64,13 +64,17 @@ def we_want_async(*args, func=None, **kwargs):
6464
return False
6565

6666
user = kwargs.get("async_user", get_current_user())
67-
logger.debug("user: %s", user)
67+
logger.debug("async user: %s", user)
68+
69+
if not user:
70+
logger.debug("dojo_async_task %s: no current user, running task in the background", func)
71+
return True
6872

6973
if Dojo_User.wants_block_execution(user):
7074
logger.debug("dojo_async_task %s: running task in the foreground as block_execution is set to True for %s", func, user)
7175
return False
7276

73-
logger.debug("dojo_async_task %s: no current user, running task in the background", func)
77+
logger.debug("dojo_async_task %s: running task in the background as user has not set block_execution to True for %s", func, user)
7478
return True
7579

7680

dojo/importers/base_importer.py

Lines changed: 12 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -682,14 +682,11 @@ def process_endpoints(
682682
logger.debug("endpoints_to_add: %s", endpoints_to_add)
683683
self.endpoint_manager.chunk_endpoints_and_disperse(finding, endpoints_to_add)
684684

685-
def process_vulnerability_ids(
685+
def process_cve(
686686
self,
687687
finding: Finding,
688688
) -> Finding:
689-
"""
690-
Parse the `unsaved_vulnerability_ids` field from findings after they are parsed
691-
to create `Vulnerability_Id` objects with the finding associated correctly
692-
"""
689+
"""Ensure cve is set from the unsaved_vulnerability_ids field, or vice versa."""
693690
# Synchronize the cve field with the unsaved_vulnerability_ids
694691
# We do this to be as flexible as possible to handle the fields until
695692
# the cve field is not needed anymore and can be removed.
@@ -703,6 +700,16 @@ def process_vulnerability_ids(
703700
# If there is no list, make one with the value of the cve field
704701
finding.unsaved_vulnerability_ids = [finding.cve]
705702

703+
return finding
704+
705+
def process_vulnerability_ids(
706+
self,
707+
finding: Finding,
708+
) -> Finding:
709+
"""
710+
Parse the `unsaved_vulnerability_ids` field from findings after they are parsed
711+
to create `Vulnerability_Id` objects with the finding associated correctly
712+
"""
706713
if finding.unsaved_vulnerability_ids:
707714
# Remove old vulnerability ids - keeping this call only because of flake8
708715
Vulnerability_Id.objects.filter(finding=finding).delete()

dojo/importers/default_importer.py

Lines changed: 11 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -183,7 +183,7 @@ def process_findings(
183183
unsaved_finding.reporter = self.user
184184
unsaved_finding.last_reviewed_by = self.user
185185
unsaved_finding.last_reviewed = self.now
186-
logger.debug("process_parsed_findings: active from report: %s, verified from report: %s", unsaved_finding.active, unsaved_finding.verified)
186+
logger.debug("process_parsed_findings: unique_id_from_tool: %s, hash_code: %s, active from report: %s, verified from report: %s", unsaved_finding.unique_id_from_tool, unsaved_finding.hash_code, unsaved_finding.active, unsaved_finding.verified)
187187
# indicates an override. Otherwise, do not change the value of unsaved_finding.active
188188
if self.active is not None:
189189
unsaved_finding.active = self.active
@@ -198,8 +198,13 @@ def process_findings(
198198

199199
# Force parsers to use unsaved_tags (stored in below after saving)
200200
unsaved_finding.tags = None
201-
# postprocessing will be done on next save.
201+
finding = self.process_cve(unsaved_finding)
202+
# Calculate hash_code before saving based on unsaved_endpoints and unsaved_vulnerability_ids
203+
finding.set_hash_code(True)
204+
205+
# postprocessing will be done after processing related fields like endpoints, vulnerability ids, etc.
202206
unsaved_finding.save_no_options()
207+
203208
finding = unsaved_finding
204209
# Determine how the finding should be grouped
205210
self.process_finding_groups(
@@ -218,11 +223,11 @@ def process_findings(
218223
finding = self.process_vulnerability_ids(finding)
219224
# Categorize this finding as a new one
220225
new_findings.append(finding)
226+
# all data is already saved on the finding, we only need to trigger post processing
227+
221228
# to avoid pushing a finding group multiple times, we push those outside of the loop
222-
if self.findings_groups_enabled and self.group_by:
223-
finding.save()
224-
else:
225-
finding.save(push_to_jira=self.push_to_jira)
229+
push_to_jira = self.push_to_jira and (not self.findings_groups_enabled or not self.group_by)
230+
finding_helper.post_process_finding_save(finding, dedupe_option=True, rules_option=True, product_grading_option=True, issue_updater_option=True, push_to_jira=push_to_jira)
226231

227232
for (group_name, findings) in group_names_to_findings_dict.items():
228233
finding_helper.add_findings_to_auto_group(

dojo/importers/default_reimporter.py

Lines changed: 21 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -236,12 +236,11 @@ def process_findings(
236236
finding,
237237
unsaved_finding,
238238
)
239-
# finding = new finding or existing finding still in the upload report
239+
# all data is already saved on the finding, we only need to trigger post processing
240+
240241
# to avoid pushing a finding group multiple times, we push those outside of the loop
241-
if self.findings_groups_enabled and self.group_by:
242-
finding.save()
243-
else:
244-
finding.save(push_to_jira=self.push_to_jira)
242+
push_to_jira = self.push_to_jira and (not self.findings_groups_enabled or not self.group_by)
243+
finding_helper.post_process_finding_save(finding, dedupe_option=True, rules_option=True, product_grading_option=True, issue_updater_option=True, push_to_jira=push_to_jira)
245244

246245
self.to_mitigate = (set(self.original_items) - set(self.reactivated_items) - set(self.unchanged_items))
247246
# due to #3958 we can have duplicates inside the same report
@@ -336,11 +335,14 @@ def match_new_finding_to_existing_finding(
336335
hash_code=unsaved_finding.hash_code,
337336
).exclude(hash_code=None).order_by("id")
338337
if self.deduplication_algorithm == "unique_id_from_tool":
338+
deduplicationLogger.debug(f"unique_id_from_tool: {unsaved_finding.unique_id_from_tool}")
339339
return Finding.objects.filter(
340340
test=self.test,
341341
unique_id_from_tool=unsaved_finding.unique_id_from_tool,
342342
).exclude(unique_id_from_tool=None).order_by("id")
343343
if self.deduplication_algorithm == "unique_id_from_tool_or_hash_code":
344+
deduplicationLogger.debug(f"unique_id_from_tool: {unsaved_finding.unique_id_from_tool}")
345+
deduplicationLogger.debug(f"hash_code: {unsaved_finding.hash_code}")
344346
query = Finding.objects.filter(
345347
Q(test=self.test),
346348
(Q(hash_code__isnull=False) & Q(hash_code=unsaved_finding.hash_code))
@@ -500,10 +502,13 @@ def process_matched_mitigated_finding(
500502
if existing_finding.get_sla_configuration().restart_sla_on_reactivation:
501503
# restart the sla start date to the current date, finding.save() will set new sla_expiration_date
502504
existing_finding.sla_start_date = self.now
505+
existing_finding = self.process_cve(existing_finding)
506+
if existing_finding.get_sla_configuration().restart_sla_on_reactivation:
507+
# restart the sla start date to the current date, finding.save() will set new sla_expiration_date
508+
existing_finding.sla_start_date = self.now
509+
# don't dedupe before endpoints are added, postprocessing will be done on next save (in calling method)
510+
existing_finding.save_no_options()
503511

504-
existing_finding.save(dedupe_option=False)
505-
# don't dedupe before endpoints are added
506-
existing_finding.save(dedupe_option=False)
507512
note = Notes(entry=f"Re-activated by {self.scan_type} re-upload.", author=self.user)
508513
note.save()
509514
endpoint_statuses = existing_finding.status_finding.exclude(
@@ -551,6 +556,9 @@ def process_matched_active_finding(
551556
existing_finding.active = False
552557
if self.verified is not None:
553558
existing_finding.verified = self.verified
559+
existing_finding = self.process_cve(existing_finding)
560+
existing_finding.save_no_options()
561+
554562
elif unsaved_finding.risk_accepted or unsaved_finding.false_p or unsaved_finding.out_of_scope:
555563
logger.debug("Reimported mitigated item matches a finding that is currently open, closing.")
556564
logger.debug(
@@ -563,6 +571,8 @@ def process_matched_active_finding(
563571
existing_finding.active = False
564572
if self.verified is not None:
565573
existing_finding.verified = self.verified
574+
existing_finding = self.process_cve(existing_finding)
575+
existing_finding.save_no_options()
566576
else:
567577
# if finding is the same but list of affected was changed, finding is marked as unchanged. This is a known issue
568578
self.unchanged_items.append(existing_finding)
@@ -597,6 +607,8 @@ def process_finding_that_was_not_matched(
597607
# scan_date was provided, override value from parser
598608
if self.scan_date_override:
599609
unsaved_finding.date = self.scan_date.date()
610+
unsaved_finding = self.process_cve(unsaved_finding)
611+
# Hash code is already calculated earlier as it's the primary matching criteria for reimport
600612
# Save it. Don't dedupe before endpoints are added.
601613
unsaved_finding.save_no_options()
602614
finding = unsaved_finding
@@ -640,7 +652,7 @@ def finding_post_processing(
640652
# Process vulnerability IDs
641653
if finding_from_report.unsaved_vulnerability_ids:
642654
finding.unsaved_vulnerability_ids = finding_from_report.unsaved_vulnerability_ids
643-
655+
# legacy cve field has already been processed/set earlier
644656
return self.process_vulnerability_ids(finding)
645657

646658
def process_groups_for_all_findings(

unittests/test_import_reimport.py

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -658,7 +658,8 @@ def test_import_veracode_reimport_veracode_same_hash_code_different_unique_id(se
658658
def test_import_veracode_reimport_veracode_same_unique_id_different_hash_code(self):
659659
logger.debug("reimporting report with one finding having same unique_id_from_tool but different hash_code, verified=False")
660660

661-
import_veracode_many_findings = self.import_scan_with_params(self.veracode_many_findings, scan_type=self.scan_type_veracode)
661+
with assertTestImportModelsCreated(self, imports=1, created=4, affected_findings=4, closed=0, reactivated=0, untouched=0):
662+
import_veracode_many_findings = self.import_scan_with_params(self.veracode_many_findings, scan_type=self.scan_type_veracode)
662663

663664
test_id = import_veracode_many_findings["test"]
664665

@@ -1748,6 +1749,9 @@ def __init__(self, *args, **kwargs):
17481749

17491750
def setUp(self):
17501751
testuser = User.objects.get(username="admin")
1752+
testuser.usercontactinfo.block_execution = True
1753+
testuser.usercontactinfo.save()
1754+
17511755
token = Token.objects.get(user=testuser)
17521756
self.client = APIClient()
17531757
self.client.credentials(HTTP_AUTHORIZATION="Token " + token.key)
@@ -2023,6 +2027,9 @@ def __init__(self, *args, **kwargs):
20232027
def setUp(self):
20242028
# still using the API to verify results
20252029
testuser = User.objects.get(username="admin")
2030+
testuser.usercontactinfo.block_execution = True
2031+
testuser.usercontactinfo.save()
2032+
20262033
token = Token.objects.get(user=testuser)
20272034
self.client = APIClient()
20282035
self.client.credentials(HTTP_AUTHORIZATION="Token " + token.key)

unittests/test_importers_performance.py

Lines changed: 39 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,5 @@
11
import logging
22
from contextlib import contextmanager
3-
from unittest.mock import patch
43

54
from crum import impersonate
65
from django.contrib.contenttypes.models import ContentType
@@ -20,12 +19,15 @@
2019
Product_Type,
2120
Test,
2221
User,
22+
UserContactInfo,
2323
)
2424

2525
from .dojo_test_case import DojoTestCase, get_unit_tests_scans_path
2626

27+
logging.basicConfig(level=logging.DEBUG)
2728
logger = logging.getLogger(__name__)
2829

30+
2931
STACK_HAWK_FILENAME = get_unit_tests_scans_path("stackhawk") / "stackhawk_many_vul_without_duplicated_findings.json"
3032
STACK_HAWK_SUBSET_FILENAME = get_unit_tests_scans_path("stackhawk") / "stackhawk_many_vul_without_duplicated_findings_subset.json"
3133
STACK_HAWK_SCAN_TYPE = "StackHawk HawkScan"
@@ -39,6 +41,9 @@ class TestDojoImporterPerformance(DojoTestCase):
3941
def setUp(self):
4042
super().setUp()
4143

44+
testuser = User.objects.create(username="admin")
45+
UserContactInfo.objects.create(user=testuser, block_execution=False)
46+
4247
self.system_settings(enable_webhooks_notifications=False)
4348
self.system_settings(enable_product_grade=False)
4449
self.system_settings(enable_github=False)
@@ -67,6 +72,14 @@ def assertNumAsyncTask(self, num):
6772
)
6873
raise self.failureException(msg)
6974

75+
tasks = dojo_async_task_counter.get_tasks()
76+
tasks_str = "\n".join(str(task) for task in tasks)
77+
msg = (
78+
f"Correct number of {num} celery tasks were created.\n"
79+
f"Tasks created:\n{tasks_str}"
80+
)
81+
logger.debug(msg)
82+
7083
def import_reimport_performance(self, expected_num_queries1, expected_num_async_tasks1, expected_num_queries2, expected_num_async_tasks2, expected_num_queries3, expected_num_async_tasks3):
7184
"""
7285
Log output can be quite large as when the assertNumQueries fails, all queries are printed.
@@ -158,53 +171,61 @@ def import_reimport_performance(self, expected_num_queries1, expected_num_async_
158171
reimporter = DefaultReImporter(**reimport_options)
159172
test, _, _len_new_findings, _len_closed_findings, _, _, _ = reimporter.process_scan(scan)
160173

161-
def test_import_reimport_reimport_performance(self):
174+
# patch the we_want_async decorator to always return True so we don't depend on block_execution flag shenanigans
175+
# @patch("dojo.decorators.we_want_async", return_value=True)
176+
# def test_import_reimport_reimport_performance_async(self, mock):
177+
def test_import_reimport_reimport_performance_async(self):
162178
self.import_reimport_performance(
163-
expected_num_queries1=712,
179+
expected_num_queries1=682,
164180
expected_num_async_tasks1=10,
165-
expected_num_queries2=656,
181+
expected_num_queries2=610,
166182
expected_num_async_tasks2=22,
167-
expected_num_queries3=332,
183+
expected_num_queries3=292,
168184
expected_num_async_tasks3=20,
169185
)
170186

171-
@patch("dojo.decorators.we_want_async", return_value=False)
172-
def test_import_reimport_reimport_performance_no_async(self, mock):
187+
# @patch("dojo.decorators.we_want_async", return_value=False)
188+
# def test_import_reimport_reimport_performance_no_async(self, mock):
189+
def test_import_reimport_reimport_performance_no_async(self):
173190
"""
174191
This test checks the performance of the importers when they are run in sync mode.
175192
The reason for this is that we also want to be aware of when a PR affects the number of queries
176193
or async tasks created by a background task.
177194
The impersonate context manager above does not work as expected for disabling async,
178195
so we patch the we_want_async decorator to always return False.
179196
"""
197+
testuser = User.objects.get(username="admin")
198+
testuser.usercontactinfo.block_execution = True
199+
testuser.usercontactinfo.save()
180200
self.import_reimport_performance(
181-
expected_num_queries1=712,
201+
expected_num_queries1=682,
182202
expected_num_async_tasks1=10,
183-
expected_num_queries2=656,
203+
expected_num_queries2=615,
184204
expected_num_async_tasks2=22,
185-
expected_num_queries3=332,
205+
expected_num_queries3=297,
186206
expected_num_async_tasks3=20,
187207
)
188208

189-
@patch("dojo.decorators.we_want_async", return_value=False)
190-
def test_import_reimport_reimport_performance_no_async_with_product_grading(self, mock):
209+
# @patch("dojo.decorators.we_want_async", return_value=False)
210+
# def test_import_reimport_reimport_performance_no_async_with_product_grading(self, mock):
211+
def test_import_reimport_reimport_performance_no_async_with_product_grading(self):
191212
"""
192213
This test checks the performance of the importers when they are run in sync mode.
193214
The reason for this is that we also want to be aware of when a PR affects the number of queries
194215
or async tasks created by a background task.
195216
The impersonate context manager above does not work as expected for disabling async,
196217
so we patch the we_want_async decorator to always return False.
197218
"""
219+
testuser = User.objects.get(username="admin")
220+
testuser.usercontactinfo.block_execution = True
221+
testuser.usercontactinfo.save()
198222
self.system_settings(enable_product_grade=True)
199-
# Refresh the cache with the new settings
200-
from dojo.middleware import DojoSytemSettingsMiddleware
201-
DojoSytemSettingsMiddleware.load()
202223

203224
self.import_reimport_performance(
204-
expected_num_queries1=732,
225+
expected_num_queries1=702,
205226
expected_num_async_tasks1=15,
206-
expected_num_queries2=686,
227+
expected_num_queries2=645,
207228
expected_num_async_tasks2=28,
208-
expected_num_queries3=357,
229+
expected_num_queries3=322,
209230
expected_num_async_tasks3=25,
210231
)

0 commit comments

Comments
 (0)