diff --git a/docs/content/en/open_source/upgrading/2.52.md b/docs/content/en/open_source/upgrading/2.52.md
index c9f6b38418f..c15bad237f8 100644
--- a/docs/content/en/open_source/upgrading/2.52.md
+++ b/docs/content/en/open_source/upgrading/2.52.md
@@ -5,6 +5,10 @@
 weight: -20251006
 description: MobSF parsers & Helm chart changes.
 ---
 
+## Deduplication fix for `UNIQUE_ID_OR_HASH_CODE`
+A bug was fixed in the `UNIQUE_ID_OR_HASH_CODE` algorithm where it stopped processing candidate findings with an equal `unique_id_from_tool` or `hash_code` value after the first candidate, even when that candidate did not qualify as a duplicate.
+Strictly speaking this is not a breaking change, but we want to make you aware that you may see more (better) deduplication for parsers using this algorithm.
+
 ## Merge of MobSF parsers
 
 "Mobsfscan Scan" has been merged into the "MobSF Scan" parser. The "Mobsfscan Scan" scan_type has been retained to keep deduplication working for existing Tests, but users are encouraged to move to the "MobSF Scan" scan_type.
 
@@ -17,16 +21,16 @@ This release introduces more important changes to the Helm chart configuration:
 
 #### Tags
 
-`tag` and `repositoryPrefix` fields have been deprecated. Currently, image tags used in containers are derived by default from the `appVersion` defined in the Chart.
-This behavior can be overridden by setting the `tag` value in `images.django` and `images.nginx`.
-If fine-tuning is necessary, each container’s image value can also be customized individually (`celery.beat.image`, `celery.worker.image`, `django.nginx.image`, `django.uwsgi.image`, `initializer.image`, and `dbMigrationChecker.image`).
+`tag` and `repositoryPrefix` fields have been deprecated. Currently, image tags used in containers are derived by default from the `appVersion` defined in the Chart.
+This behavior can be overridden by setting the `tag` value in `images.django` and `images.nginx`.
+If fine-tuning is necessary, each container’s image value can also be customized individually (`celery.beat.image`, `celery.worker.image`, `django.nginx.image`, `django.uwsgi.image`, `initializer.image`, and `dbMigrationChecker.image`). Digest pinning is now supported as well.
 
 #### Security context
 
 This Helm chart extends security context capabilities to all deployed pods and containers.
 You can define a default pod and container security context globally using `securityContext.podSecurityContext` and `securityContext.containerSecurityContext` keys.
-Additionally, each deployment can specify its own pod and container security contexts, which will override or merge with the global ones.
+Additionally, each deployment can specify its own pod and container security contexts, which will override or merge with the global ones.
 
 #### Fine-grained resources
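The documentation note above talks about "candidate findings". To make that concrete: the algorithm collects every non-duplicate finding that shares a `unique_id_from_tool` or a `hash_code` with the incoming finding and walks them oldest-first. The minimal sketch below mirrors the query visible in `dojo/utils.py` in the next file of this patch; `FindingStub` and `candidate_findings` are hypothetical stand-ins, not DefectDojo models or helpers, and the real query additionally scopes candidates to the product or engagement and the test type.

```python
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class FindingStub:
    """Hypothetical stand-in for dojo.models.Finding; only the fields the sketch needs."""
    id: int
    unique_id_from_tool: Optional[str] = None
    hash_code: Optional[str] = None
    duplicate: bool = False


def candidate_findings(new: FindingStub, all_findings: List[FindingStub]) -> List[FindingStub]:
    """Mimic the candidate query in deduplicate_uid_or_hash_code():
    same unique_id_from_tool OR same hash_code, excluding the finding
    itself and known duplicates, ordered oldest id first (order_by("id"))."""
    matches = [
        f for f in all_findings
        if f.id != new.id
        and not f.duplicate
        and ((new.unique_id_from_tool is not None and f.unique_id_from_tool == new.unique_id_from_tool)
             or (new.hash_code is not None and f.hash_code == new.hash_code))
    ]
    return sorted(matches, key=lambda f: f.id)
```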
diff --git a/dojo/utils.py b/dojo/utils.py
index 07709c4bbbf..7469ee0ffa5 100644
--- a/dojo/utils.py
+++ b/dojo/utils.py
@@ -508,7 +508,7 @@ def deduplicate_uid_or_hash_code(new_finding):
         id=new_finding.id).exclude(
             duplicate=True).order_by("id")
     deduplicationLogger.debug("Found "
-        + str(len(existing_findings)) + " findings with either the same unique_id_from_tool or hash_code")
+        + str(len(existing_findings)) + " findings with either the same unique_id_from_tool or hash_code: " + str([find.id for find in existing_findings]))
     for find in existing_findings:
         if is_deduplication_on_engagement_mismatch(new_finding, find):
             deduplicationLogger.debug(
@@ -517,10 +517,10 @@ def deduplicate_uid_or_hash_code(new_finding):
         try:
             if are_endpoints_duplicates(new_finding, find):
                 set_duplicate(new_finding, find)
+                break
         except Exception as e:
             deduplicationLogger.debug(str(e))
             continue
-        break
 
 
 def set_duplicate(new_finding, existing_finding):
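The practical effect of moving the `break` in the hunk above is easiest to see with stubs. This is a minimal sketch, not the real implementation: `F` replaces the Finding model, `endpoints_match` replaces `are_endpoints_duplicates()`, and the two attribute assignments replace `set_duplicate()`. It demonstrates the fixed control flow: the loop only stops once a candidate actually passes the endpoint check, instead of unconditionally after the first candidate.

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class F:
    """Hypothetical minimal Finding stand-in."""
    id: int
    duplicate: bool = False
    duplicate_finding_id: Optional[int] = None


def deduplicate_fixed(new, candidates, endpoints_match):
    """Simplified version of the fixed loop in deduplicate_uid_or_hash_code()."""
    for find in candidates:
        try:
            if endpoints_match(new, find):
                new.duplicate = True               # stand-in for set_duplicate(new, find)
                new.duplicate_finding_id = find.id
                break                              # the fix: stop only on a real match
        except Exception:
            continue
        # before the fix, an unconditional `break` here ended the loop after the
        # first candidate, even when endpoints_match() had returned False


# candidate 224 has different endpoints; candidate 300 matches
new = F(id=301)
deduplicate_fixed(new, [F(id=224), F(id=300)], lambda a, b: b.id == 300)
assert new.duplicate and new.duplicate_finding_id == 300  # pre-fix, this assert failed
```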
diff --git a/unittests/test_deduplication_logic.py b/unittests/test_deduplication_logic.py
index c9e8e26e53d..c7683a23b46 100644
--- a/unittests/test_deduplication_logic.py
+++ b/unittests/test_deduplication_logic.py
@@ -1181,55 +1181,136 @@ def test_dedupe_same_id_different_test_type_unique_id_or_hash_code(self):
         # expect not duplicate as the mathcing finding is from another test_type, hash_code is also different
         self.assert_finding(finding_new, not_pk=224, duplicate=True, duplicate_finding_id=224, hash_code=finding_224.hash_code)
 
-    def test_identical_different_endpoints_unique_id_or_hash_code(self):
+    def test_identical_different_endpoints_unique_id_or_hash_code_dynamic(self):
         # create identical copy, so unique id is the same
-        finding_new, finding_224 = self.copy_and_reset_finding(find_id=224)
+        finding_new1, finding_224 = self.copy_and_reset_finding(find_id=224)
 
-        finding_new.save(dedupe_option=False)
-        ep1 = Endpoint(product=finding_new.test.engagement.product, finding=finding_new, host="myhost.com", protocol="https")
+        finding_new1.save(dedupe_option=False)
+        ep1 = Endpoint(product=finding_new1.test.engagement.product, finding=finding_new1, host="myhost.com", protocol="https")
         ep1.save()
-        finding_new.endpoints.add(ep1)
-        finding_new.save()
+        finding_new1.endpoints.add(ep1)
+        finding_new1.save()
 
         if settings.DEDUPE_ALGO_ENDPOINT_FIELDS == []:
             # expect duplicate, as endpoints shouldn't affect dedupe and hash_code due to unique_id
-            self.assert_finding(finding_new, not_pk=224, duplicate=True, duplicate_finding_id=224, hash_code=finding_224.hash_code)
+            self.assert_finding(finding_new1, not_pk=224, duplicate=True, duplicate_finding_id=224, hash_code=finding_224.hash_code)
         else:
-            self.assert_finding(finding_new, not_pk=224, duplicate=False, duplicate_finding_id=None, hash_code=finding_224.hash_code)
+            # endpoints don't match with 224, so not a duplicate
+            self.assert_finding(finding_new1, not_pk=224, duplicate=False, duplicate_finding_id=None, hash_code=finding_224.hash_code)
+
+        # remove the finding to prevent it from being duplicated by the next finding we create
+        finding_new1.delete()
 
         # same scenario, now with different uid. and different endpoints, but hash will be different due the endpoints because we set dynamic_finding to True
-        finding_new, finding_224 = self.copy_and_reset_finding(find_id=224)
+        finding_new2, finding_224 = self.copy_and_reset_finding(find_id=224)
 
-        finding_new.save(dedupe_option=False)
-        ep1 = Endpoint(product=finding_new.test.engagement.product, finding=finding_new, host="myhost.com", protocol="https")
+        finding_new2.save(dedupe_option=False)
+        ep1 = Endpoint(product=finding_new2.test.engagement.product, finding=finding_new2, host="myhost.com", protocol="https")
         ep1.save()
-        finding_new.endpoints.add(ep1)
-        finding_new.unique_id_from_tool = 1
-        finding_new.dynamic_finding = True
-        finding_new.save()
+        finding_new2.endpoints.add(ep1)
+        finding_new2.unique_id_from_tool = 1
+        finding_new2.dynamic_finding = True
+        finding_new2.save()
 
         if settings.DEDUPE_ALGO_ENDPOINT_FIELDS == []:
             # different uid. and different endpoints, but endpoints not used for hash anymore -> duplicate
-            self.assert_finding(finding_new, not_pk=224, duplicate=True, hash_code=finding_224.hash_code)
+            self.assert_finding(finding_new2, not_pk=224, duplicate=True, duplicate_finding_id=224, hash_code=finding_224.hash_code)
+        else:
+            # endpoints do not match with 224, so not a duplicate
+            self.assert_finding(finding_new2, not_pk=224, duplicate=False, duplicate_finding_id=None, hash_code=finding_224.hash_code)
+
+    def test_identical_different_endpoints_unique_id_or_hash_code_static(self):
+        # create identical copy, so unique id is the same
+        finding_new1, finding_224 = self.copy_and_reset_finding(find_id=224)
+
+        finding_new1.save(dedupe_option=False)
+        ep1 = Endpoint(product=finding_new1.test.engagement.product, finding=finding_new1, host="myhost.com", protocol="https")
+        ep1.save()
+        finding_new1.endpoints.add(ep1)
+        finding_new1.save()
+
+        if settings.DEDUPE_ALGO_ENDPOINT_FIELDS == []:
+            # expect duplicate, as endpoints shouldn't affect dedupe and hash_code due to unique_id
+            self.assert_finding(finding_new1, not_pk=224, duplicate=True, duplicate_finding_id=224, hash_code=finding_224.hash_code)
         else:
-            self.assert_finding(finding_new, not_pk=224, duplicate=False, hash_code=finding_224.hash_code)
+            # endpoints don't match with 224, so not a duplicate
+            self.assert_finding(finding_new1, not_pk=224, duplicate=False, duplicate_finding_id=None, hash_code=finding_224.hash_code)
+
+        # remove the finding to prevent it from being duplicated by the next finding we create
+        finding_new1.delete()
 
         # same scenario, now with different uid. and different endpoints
-        finding_new, finding_224 = self.copy_and_reset_finding(find_id=224)
+        finding_new3, finding_224 = self.copy_and_reset_finding(find_id=224)
 
-        finding_new.save(dedupe_option=False)
-        ep1 = Endpoint(product=finding_new.test.engagement.product, finding=finding_new, host="myhost.com", protocol="https")
+        finding_new3.save(dedupe_option=False)
+        ep1 = Endpoint(product=finding_new3.test.engagement.product, finding=finding_new3, host="myhost.com", protocol="https")
         ep1.save()
-        finding_new.endpoints.add(ep1)
-        finding_new.unique_id_from_tool = 1
-        finding_new.dynamic_finding = False
-        finding_new.save()
+        finding_new3.endpoints.add(ep1)
+        finding_new3.unique_id_from_tool = 1
+        finding_new3.dynamic_finding = False
+        finding_new3.save()
+
+        if settings.DEDUPE_ALGO_ENDPOINT_FIELDS == []:
+            # different uid. and different endpoints, dynamic_finding is set to False, hash_code still not affected by endpoints
+            self.assert_finding(finding_new3, not_pk=224, duplicate=True, duplicate_finding_id=224, hash_code=finding_224.hash_code)
+        else:
+            # endpoints do not match with 224
+            self.assert_finding(finding_new3, not_pk=224, duplicate=False, duplicate_finding_id=None, hash_code=finding_224.hash_code)
+
+    def test_identical_different_endpoints_unique_id_or_hash_code_multiple(self):
+        # create identical copy, so unique id is the same
+        finding_new1, finding_224 = self.copy_and_reset_finding(find_id=224)
+
+        finding_new1.save(dedupe_option=False)
+        ep1 = Endpoint(product=finding_new1.test.engagement.product, finding=finding_new1, host="myhost.com", protocol="https")
+        ep1.save()
+        finding_new1.endpoints.add(ep1)
+        finding_new1.save()
+
+        if settings.DEDUPE_ALGO_ENDPOINT_FIELDS == []:
+            # expect duplicate, as endpoints shouldn't affect dedupe and hash_code due to unique_id
+            self.assert_finding(finding_new1, not_pk=224, duplicate=True, duplicate_finding_id=224, hash_code=finding_224.hash_code)
+        else:
+            # endpoints don't match with 224, so not a duplicate
+            self.assert_finding(finding_new1, not_pk=224, duplicate=False, duplicate_finding_id=None, hash_code=finding_224.hash_code)
+
+        # same scenario, now with different uid. and different endpoints, but hash will be different due to the endpoints because we set dynamic_finding to True
+        finding_new2, finding_224 = self.copy_and_reset_finding(find_id=224)
+
+        finding_new2.save(dedupe_option=False)
+        ep1 = Endpoint(product=finding_new2.test.engagement.product, finding=finding_new2, host="myhost.com", protocol="https")
+        ep1.save()
+        finding_new2.endpoints.add(ep1)
+        finding_new2.unique_id_from_tool = 1
+        finding_new2.dynamic_finding = True
+        finding_new2.save()
+
+        if settings.DEDUPE_ALGO_ENDPOINT_FIELDS == []:
+            # different uid. and different endpoints, but endpoints not used for hash anymore -> duplicate
+            self.assert_finding(finding_new2, not_pk=224, duplicate=True, duplicate_finding_id=224, hash_code=finding_224.hash_code)
+        else:
+            # endpoints do not match with 224, but they do match with the finding just created. this proves that the dedupe algo considers more than only the first
+            # candidate https://github.com/DefectDojo/django-DefectDojo/issues/13497
+            self.assert_finding(finding_new2, not_pk=224, duplicate=True, duplicate_finding_id=finding_new1.pk, hash_code=finding_224.hash_code)
+
+        # same scenario, now with different uid. and different endpoints
+        finding_new3, finding_224 = self.copy_and_reset_finding(find_id=224)
+
+        finding_new3.save(dedupe_option=False)
+        ep1 = Endpoint(product=finding_new3.test.engagement.product, finding=finding_new3, host="myhost.com", protocol="https")
+        ep1.save()
+        finding_new3.endpoints.add(ep1)
+        finding_new3.unique_id_from_tool = 1
+        finding_new3.dynamic_finding = False
+        finding_new3.save()
 
         if settings.DEDUPE_ALGO_ENDPOINT_FIELDS == []:
             # different uid. and different endpoints, dynamic_finding is set to False hash_code still not affected by endpoints
-            self.assert_finding(finding_new, not_pk=224, duplicate=True, duplicate_finding_id=224, hash_code=finding_224.hash_code)
+            self.assert_finding(finding_new3, not_pk=224, duplicate=True, duplicate_finding_id=224, hash_code=finding_224.hash_code)
         else:
-            self.assert_finding(finding_new, not_pk=224, duplicate=False, duplicate_finding_id=None, hash_code=finding_224.hash_code)
+            # endpoints do not match with 224, but they do match with the finding just created. this proves that the dedupe algo considers more than only the first
+            # candidate https://github.com/DefectDojo/django-DefectDojo/issues/13497
+            self.assert_finding(finding_new3, not_pk=224, duplicate=True, duplicate_finding_id=finding_new1.pk, hash_code=finding_224.hash_code)
 
     #
     # some extra tests
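For the endpoint-sensitive configuration (`DEDUPE_ALGO_ENDPOINT_FIELDS` non-empty), the new `_multiple` test above is the regression test for issue #13497: finding 224 is the first candidate but fails the endpoint comparison, while `finding_new1` (same `myhost.com` endpoint) passes, so `finding_new2` must end up as a duplicate of `finding_new1` rather than staying non-duplicate. Reusing the hypothetical `F` and `deduplicate_fixed` stubs from the sketch after `dojo/utils.py` (the ids below are made up):

```python
f224 = F(id=224)          # original finding, non-matching endpoints
finding_new1 = F(id=998)  # sibling finding that shares the myhost.com endpoint
finding_new2 = F(id=999)  # incoming finding being deduplicated


def same_host(new, candidate):
    # hypothetical endpoint check: only finding_new1 matches finding_new2
    return candidate.id == finding_new1.id


deduplicate_fixed(finding_new2, [f224, finding_new1], same_host)
# mirrors the test's assertion duplicate_finding_id=finding_new1.pk
assert finding_new2.duplicate_finding_id == finding_new1.id
```

Before the fix the loop broke out at 224 and left `finding_new2.duplicate_finding_id` unset, which is exactly the behavior the test pins down.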