12 changes: 8 additions & 4 deletions docs/content/en/open_source/upgrading/2.52.md
@@ -5,6 +5,10 @@ weight: -20251006
description: MobSF parsers & Helm chart changes.
---

## Deduplication fix of `UNIQUE_ID_OR_HASH_CODE`
A bug was fixed in the `UNIQUE_ID_OR_HASH_CODE` deduplication algorithm: it stopped processing candidate findings after the first candidate with an equal `unique_id_from_tool` or `hash_code` value, even when that candidate was not a duplicate.
Strictly speaking this is not a breaking change, but we want to make you aware that you may see more (and better) deduplication for parsers using this algorithm.
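
In practice the candidate loop now only stops once a duplicate has actually been recorded, instead of after the first candidate it inspects. Below is a minimal sketch of the corrected behaviour, simplified from the `dojo/utils.py` change in this pull request; the helper names come from that file, while the queryset construction and most logging are omitted, so treat it as an illustrative excerpt rather than standalone runnable code.

```python
# Simplified excerpt of the fixed loop in deduplicate_uid_or_hash_code() (dojo/utils.py).
# existing_findings holds the candidates sharing new_finding's unique_id_from_tool or hash_code.
for find in existing_findings:
    if is_deduplication_on_engagement_mismatch(new_finding, find):
        # candidate lives on another engagement while per-engagement dedupe is enabled: skip it
        continue
    try:
        if are_endpoints_duplicates(new_finding, find):
            set_duplicate(new_finding, find)
            # stop only once a duplicate has actually been set; before the fix the loop
            # stopped after the first candidate it inspected, duplicate or not
            break
    except Exception as e:
        deduplicationLogger.debug(str(e))
        continue
```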

## Merge of MobSF parsers

Mobsfscan Scan" has been merged into the "MobSF Scan" parser. The "Mobsfscan Scan" scan_type has been retained to keep deduplication working for existing Tests, but users are encouraged to move to the "MobSF Scan" scan_type.
@@ -17,16 +21,16 @@ This release introduces more important changes to the Helm chart configuration:

#### Tags

`tag` and `repositoryPrefix` fields have been deprecated. Currently, image tags used in containers are derived by default from the `appVersion` defined in the Chart.
This behavior can be overridden by setting the `tag` value in `images.django` and `images.nginx`.
If fine-tuning is necessary, each container’s image value can also be customized individually (`celery.beat.image`, `celery.worker.image`, `django.nginx.image`, `django.uwsgi.image`, `initializer.image`, and `dbMigrationChecker.image`).
Digest pinning is now supported as well.

#### Security context

This Helm chart extends security context capabilities to all deployed pods and containers.
You can define a default pod and container security context globally using `securityContext.podSecurityContext` and `securityContext.containerSecurityContext` keys.
Additionally, each deployment can specify its own pod and container security contexts, which will override or merge with the global ones.

#### Fine-grained resources

4 changes: 2 additions & 2 deletions dojo/utils.py
@@ -508,7 +508,7 @@ def deduplicate_uid_or_hash_code(new_finding):
id=new_finding.id).exclude(
duplicate=True).order_by("id")
deduplicationLogger.debug("Found "
+ str(len(existing_findings)) + " findings with either the same unique_id_from_tool or hash_code")
+ str(len(existing_findings)) + " findings with either the same unique_id_from_tool or hash_code: " + str([find.id for find in existing_findings]))
for find in existing_findings:
if is_deduplication_on_engagement_mismatch(new_finding, find):
deduplicationLogger.debug(
@@ -517,10 +517,10 @@
try:
if are_endpoints_duplicates(new_finding, find):
set_duplicate(new_finding, find)
break
except Exception as e:
deduplicationLogger.debug(str(e))
continue
break


def set_duplicate(new_finding, existing_finding):
133 changes: 107 additions & 26 deletions unittests/test_deduplication_logic.py
@@ -1181,55 +1181,136 @@ def test_dedupe_same_id_different_test_type_unique_id_or_hash_code(self):
# expect not duplicate as the matching finding is from another test_type, hash_code is also different
self.assert_finding(finding_new, not_pk=224, duplicate=True, duplicate_finding_id=224, hash_code=finding_224.hash_code)

def test_identical_different_endpoints_unique_id_or_hash_code(self):
def test_identical_different_endpoints_unique_id_or_hash_code_dynamic(self):
# create identical copy, so unique id is the same
finding_new, finding_224 = self.copy_and_reset_finding(find_id=224)
finding_new1, finding_224 = self.copy_and_reset_finding(find_id=224)

finding_new.save(dedupe_option=False)
ep1 = Endpoint(product=finding_new.test.engagement.product, finding=finding_new, host="myhost.com", protocol="https")
finding_new1.save(dedupe_option=False)
ep1 = Endpoint(product=finding_new1.test.engagement.product, finding=finding_new1, host="myhost.com", protocol="https")
ep1.save()
finding_new.endpoints.add(ep1)
finding_new.save()
finding_new1.endpoints.add(ep1)
finding_new1.save()

if settings.DEDUPE_ALGO_ENDPOINT_FIELDS == []:
# expect duplicate, as endpoints shouldn't affect dedupe and hash_code due to unique_id
self.assert_finding(finding_new, not_pk=224, duplicate=True, duplicate_finding_id=224, hash_code=finding_224.hash_code)
self.assert_finding(finding_new1, not_pk=224, duplicate=True, duplicate_finding_id=224, hash_code=finding_224.hash_code)
else:
self.assert_finding(finding_new, not_pk=224, duplicate=False, duplicate_finding_id=None, hash_code=finding_224.hash_code)
# endpoints don't match with 224, so not a duplicate
self.assert_finding(finding_new1, not_pk=224, duplicate=False, duplicate_finding_id=None, hash_code=finding_224.hash_code)

# remove the finding to prevent it from being duplicated by the next finding we create
finding_new1.delete()

# same scenario, now with different uid. and different endpoints, but hash will be different due to the endpoints because we set dynamic_finding to True
finding_new, finding_224 = self.copy_and_reset_finding(find_id=224)
finding_new2, finding_224 = self.copy_and_reset_finding(find_id=224)

finding_new.save(dedupe_option=False)
ep1 = Endpoint(product=finding_new.test.engagement.product, finding=finding_new, host="myhost.com", protocol="https")
finding_new2.save(dedupe_option=False)
ep1 = Endpoint(product=finding_new1.test.engagement.product, finding=finding_new2, host="myhost.com", protocol="https")
ep1.save()
finding_new.endpoints.add(ep1)
finding_new.unique_id_from_tool = 1
finding_new.dynamic_finding = True
finding_new.save()
finding_new2.endpoints.add(ep1)
finding_new2.unique_id_from_tool = 1
finding_new2.dynamic_finding = True
finding_new2.save()

if settings.DEDUPE_ALGO_ENDPOINT_FIELDS == []:
# different uid. and different endpoints, but endpoints not used for hash anymore -> duplicate
self.assert_finding(finding_new, not_pk=224, duplicate=True, hash_code=finding_224.hash_code)
self.assert_finding(finding_new2, not_pk=224, duplicate=True, duplicate_finding_id=224, hash_code=finding_224.hash_code)
else:
# endpoints do not match with 224
self.assert_finding(finding_new1, not_pk=224, duplicate=False, duplicate_finding_id=None, hash_code=finding_224.hash_code)

def test_identical_different_endpoints_unique_id_or_hash_code_static(self):
# create identical copy, so unique id is the same
finding_new1, finding_224 = self.copy_and_reset_finding(find_id=224)

finding_new1.save(dedupe_option=False)
ep1 = Endpoint(product=finding_new1.test.engagement.product, finding=finding_new1, host="myhost.com", protocol="https")
ep1.save()
finding_new1.endpoints.add(ep1)
finding_new1.save()

if settings.DEDUPE_ALGO_ENDPOINT_FIELDS == []:
# expect duplicate, as endpoints shouldn't affect dedupe and hash_code due to unique_id
self.assert_finding(finding_new1, not_pk=224, duplicate=True, duplicate_finding_id=224, hash_code=finding_224.hash_code)
else:
self.assert_finding(finding_new, not_pk=224, duplicate=False, hash_code=finding_224.hash_code)
# endpoints don't match with 224, so not a duplicate
self.assert_finding(finding_new1, not_pk=224, duplicate=False, duplicate_finding_id=None, hash_code=finding_224.hash_code)

# remove the finding to prevent it from being duplicated by the next finding we create
finding_new1.delete()

# same scenario, now with different uid. and different endpoints
finding_new, finding_224 = self.copy_and_reset_finding(find_id=224)
finding_new3, finding_224 = self.copy_and_reset_finding(find_id=224)

finding_new.save(dedupe_option=False)
ep1 = Endpoint(product=finding_new.test.engagement.product, finding=finding_new, host="myhost.com", protocol="https")
finding_new3.save(dedupe_option=False)
ep1 = Endpoint(product=finding_new3.test.engagement.product, finding=finding_new3, host="myhost.com", protocol="https")
ep1.save()
finding_new.endpoints.add(ep1)
finding_new.unique_id_from_tool = 1
finding_new.dynamic_finding = False
finding_new.save()
finding_new3.endpoints.add(ep1)
finding_new3.unique_id_from_tool = 1
finding_new3.dynamic_finding = False
finding_new3.save()

if settings.DEDUPE_ALGO_ENDPOINT_FIELDS == []:
# different uid. and different endpoints; dynamic_finding is set to False, so hash_code is still not affected by endpoints
self.assert_finding(finding_new3, not_pk=224, duplicate=True, duplicate_finding_id=224, hash_code=finding_224.hash_code)
else:
# endpoints do not match with 224
self.assert_finding(finding_new1, not_pk=224, duplicate=False, duplicate_finding_id=None, hash_code=finding_224.hash_code)

def test_identical_different_endpoints_unique_id_or_hash_code_multiple(self):
# create identical copy, so unique id is the same
finding_new1, finding_224 = self.copy_and_reset_finding(find_id=224)

finding_new1.save(dedupe_option=False)
ep1 = Endpoint(product=finding_new1.test.engagement.product, finding=finding_new1, host="myhost.com", protocol="https")
ep1.save()
finding_new1.endpoints.add(ep1)
finding_new1.save()

if settings.DEDUPE_ALGO_ENDPOINT_FIELDS == []:
# expect duplicate, as endpoints shouldn't affect dedupe and hash_code due to unique_id
self.assert_finding(finding_new1, not_pk=224, duplicate=True, duplicate_finding_id=224, hash_code=finding_224.hash_code)
else:
# endpoints don't match with 224, so not a duplicate
self.assert_finding(finding_new1, not_pk=224, duplicate=False, duplicate_finding_id=None, hash_code=finding_224.hash_code)

# same scenario, now with different uid. and different endpoints, but hash will be different due to the endpoints because we set dynamic_finding to True
finding_new2, finding_224 = self.copy_and_reset_finding(find_id=224)

finding_new2.save(dedupe_option=False)
ep1 = Endpoint(product=finding_new1.test.engagement.product, finding=finding_new2, host="myhost.com", protocol="https")
ep1.save()
finding_new2.endpoints.add(ep1)
finding_new2.unique_id_from_tool = 1
finding_new2.dynamic_finding = True
finding_new2.save()

if settings.DEDUPE_ALGO_ENDPOINT_FIELDS == []:
# different uid. and different endpoints, but endpoints not used for hash anymore -> duplicate
self.assert_finding(finding_new2, not_pk=224, duplicate=True, duplicate_finding_id=224, hash_code=finding_224.hash_code)
else:
# endpoints do not match with 224, but they do match with the finding just created. this proves that the dedupe algo considers more than only the first
# candidate https://github.com/DefectDojo/django-DefectDojo/issues/13497
self.assert_finding(finding_new2, not_pk=224, duplicate=True, duplicate_finding_id=finding_new1.pk, hash_code=finding_224.hash_code)

# same scenario, now with different uid. and different endpoints
finding_new3, finding_224 = self.copy_and_reset_finding(find_id=224)

finding_new3.save(dedupe_option=False)
ep1 = Endpoint(product=finding_new3.test.engagement.product, finding=finding_new3, host="myhost.com", protocol="https")
ep1.save()
finding_new3.endpoints.add(ep1)
finding_new3.unique_id_from_tool = 1
finding_new3.dynamic_finding = False
finding_new3.save()

if settings.DEDUPE_ALGO_ENDPOINT_FIELDS == []:
# different uid. and different endpoints; dynamic_finding is set to False, so hash_code is still not affected by endpoints
self.assert_finding(finding_new, not_pk=224, duplicate=True, duplicate_finding_id=224, hash_code=finding_224.hash_code)
self.assert_finding(finding_new3, not_pk=224, duplicate=True, duplicate_finding_id=224, hash_code=finding_224.hash_code)
else:
self.assert_finding(finding_new, not_pk=224, duplicate=False, duplicate_finding_id=None, hash_code=finding_224.hash_code)
# endpoints do not match with 224, but they do match with the finding just created. this proves that the dedupe algo considers more than only the first
# candidate https://github.com/DefectDojo/django-DefectDojo/issues/13497
self.assert_finding(finding_new3, not_pk=224, duplicate=True, duplicate_finding_id=finding_new1.pk, hash_code=finding_224.hash_code)

# # some extra tests
