Skip to content

Commit 49d020d

Browse files
committed
rework url cleaning
1 parent d32f9b6 commit 49d020d

3 files changed

Lines changed: 61 additions & 55 deletions

File tree

dojo/management/commands/migrate_endpoints_to_locations.py

Lines changed: 8 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,13 @@
11
import datetime
22
import logging
33

4-
from django.core.exceptions import ValidationError
54
from django.core.management.base import BaseCommand
65
from django.utils import timezone
76

87
from dojo.location.models import Location
98
from dojo.location.status import FindingLocationStatus
109
from dojo.models import DojoMeta, Endpoint, Endpoint_Status
1110
from dojo.url.models import URL
12-
from dojo.url.validators import validate_host_or_ip
1311

1412
logger = logging.getLogger(__name__)
1513

@@ -27,24 +25,17 @@ class Command(BaseCommand):
2725
help = "Usage: manage.py migrate_endpoints_to_locations"
2826

2927
def _endpoint_to_url(self, endpoint: Endpoint) -> Location:
30-
# First determine if we will have a problem with the endpoint
31-
try:
32-
validate_host_or_ip(endpoint.host)
33-
host_validation_failure = False
34-
except ValidationError:
35-
host_validation_failure = True
3628
# Create the raw URL object first
3729
# This should create the location object as well
38-
url = URL.objects.get_or_create(
39-
protocol=(endpoint.protocol or "").lower(),
40-
user_info=endpoint.userinfo or "",
41-
host=(endpoint.host or "").lower(),
30+
url = URL.get_or_create_from_values(
31+
protocol=endpoint.protocol,
32+
user_info=endpoint.userinfo,
33+
host=endpoint.host,
4234
port=endpoint.port,
43-
path=endpoint.path or "",
44-
query=endpoint.query or "",
45-
fragment=endpoint.fragment or "",
46-
host_validation_failure=host_validation_failure,
47-
)[0]
35+
path=endpoint.path,
36+
query=endpoint.query,
37+
fragment=endpoint.fragment,
38+
)
4839
# Add the endpoint tags to the location tags
4940
if endpoint.tags:
5041
[url.location.tags.add(tag) for tag in set(endpoint.tags.values_list("name", flat=True))]

dojo/tools/stackhawk/parser.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ def __extract_finding(
7979
+ "\n"
8080
)
8181
if settings.V3_FEATURE_LOCATIONS:
82-
location = URL(host=host, path=path["path"])
82+
location = URL.from_value(host + path["path"])
8383
else:
8484
# TODO: Delete this after the move to Locations
8585
location = Endpoint.from_uri(host + path["path"])

dojo/url/models.py

Lines changed: 52 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
from __future__ import annotations
22

33
import hashlib
4+
import ipaddress
45
from contextlib import suppress
56
from dataclasses import dataclass
67
from urllib.parse import unquote_plus, urlsplit
78

9+
import idna
810
from django.core.exceptions import ValidationError
911
from django.core.validators import MaxValueValidator, MinLengthValidator, MinValueValidator
1012
from django.db import IntegrityError, transaction
@@ -85,17 +87,32 @@ def parse(self, value: str) -> ParsedUrl:
8587
def unparse(self, url: URL) -> str:
8688
# path/query are stored as flat text; parse them with Hyperlink
8789
parsed_path_and_query = HyperlinkURL.from_text(f"{url.path}?{url.query}").normalize()
88-
return HyperlinkURL(
90+
91+
# Hyperlink assumes the host field is a domain name, and explodes when encoding an IP or something that's not
92+
# quite a valid hostname but Dojo allows anyway. Check if it's one of such explosion-causing cases to determine
93+
# whether we should be sneaky and substitute in the hostname manually after the fact.
94+
unparse_host = True
95+
try:
96+
idna.encode(url.host, uts46=True)
97+
except idna.IDNAError:
98+
unparse_host = False
99+
100+
normalized = HyperlinkURL(
89101
scheme=url.protocol,
90102
userinfo=url.user_info,
91-
host=url.host,
103+
host=url.host if unparse_host else "",
92104
port=url.port,
93105
path=parsed_path_and_query.path,
94106
rooted=False,
95107
query=parsed_path_and_query.query,
96108
fragment=url.fragment,
97109
# path not normalized if empty, in line with the way Endpoints worked
98-
).normalize(path=bool(url.path)).to_uri().to_text().removeprefix("//")
110+
).normalize(path=bool(url.path)).to_uri()
111+
112+
if not unparse_host:
113+
normalized = normalized.replace(host=url.host)
114+
115+
return normalized.to_text().removeprefix("//")
99116

100117

101118
class URL(AbstractLocation):
@@ -227,30 +244,21 @@ def get_location_type(cls) -> str:
227244
def get_location_value(self) -> str:
228245
return str(self)[:2048]
229246

230-
def normalize_url_parts(self):
247+
@staticmethod
248+
def _parse_string_value(value: str) -> ParsedUrl:
249+
"""Internal method to parse the string representation of the model"""
250+
return URL.URL_PARSING_CLASS().parse(value)
251+
252+
def clean(self, *args: list, **kwargs: dict) -> None:
253+
"""Validate the input supplied."""
231254
self.clean_protocol()
232255
self.clean_user_info()
233256
self.clean_host()
234257
self.clean_port()
235258
self.clean_path()
236259
self.clean_query()
237260
self.clean_fragment()
238-
self.clean_host_validation_failure()
239261
self.set_db_hash()
240-
241-
def pre_save_logic(self) -> None:
242-
"""Allow for some pre save operations by other classes."""
243-
self.normalize_url_parts()
244-
super().pre_save_logic()
245-
246-
@staticmethod
247-
def _parse_string_value(value: str) -> ParsedUrl:
248-
"""Internal method to parse the string representation of the model"""
249-
return URL.URL_PARSING_CLASS().parse(value)
250-
251-
def clean(self, *args: list, **kwargs: dict) -> None:
252-
"""Validate the input supplied."""
253-
self.normalize_url_parts()
254262
super().clean(*args, **kwargs)
255263

256264
def clean_protocol(self) -> None:
@@ -263,13 +271,24 @@ def clean_user_info(self):
263271
if not self.user_info:
264272
self.user_info = ""
265273
else:
266-
self.user_info = self.remove_null_bytes(self.user_info.strip())
274+
self.user_info = self.replace_null_bytes(self.user_info.strip())
267275

268276
def clean_host(self) -> None:
277+
self.host_validation_failure = False
269278
if not self.host:
270279
self.host = ""
271280
else:
272-
self.host = self.host.lower()
281+
try:
282+
# Check if it's a valid IP address first
283+
self.host = ipaddress.ip_address(self.host).compressed
284+
except ValueError:
285+
try:
286+
# Attempt to depunify the hostname
287+
self.host = idna.encode(self.host, uts46=True).decode("ascii")
288+
except idna.IDNAError:
289+
# Some issue with the hostname exists. We'll store it, but are DEFINITELY making a note of this.
290+
self.host = self.replace_null_bytes(self.host.lower())
291+
self.host_validation_failure = True
273292

274293
def clean_port(self) -> None:
275294
if not bool(self.port):
@@ -286,32 +305,29 @@ def clean_path(self):
286305
if not self.path:
287306
self.path = ""
288307
else:
289-
self.path = self.remove_null_bytes(self.path.strip().removeprefix("/"))
308+
self.path = self.replace_null_bytes(self.path.strip().removeprefix("/"))
290309

291310
def clean_fragment(self) -> None:
292311
if not self.fragment:
293312
self.fragment = ""
294313
else:
295-
self.fragment = self.remove_null_bytes(self.fragment.strip().removeprefix("#"))
314+
self.fragment = self.replace_null_bytes(self.fragment.strip().removeprefix("#"))
296315

297316
def clean_query(self) -> None:
298317
if not self.query:
299318
self.query = ""
300319
else:
301-
self.query = self.remove_null_bytes(self.query.strip().removeprefix("?"))
302-
303-
def clean_host_validation_failure(self):
304-
self.host_validation_failure = bool(self.host_validation_failure)
320+
self.query = self.replace_null_bytes(self.query.strip().removeprefix("?"))
305321

306322
def set_db_hash(self):
307323
self.hash = hashlib.blake2b(str(self).encode(), digest_size=32).hexdigest()
308324

309-
def remove_null_bytes(self, value: str) -> str:
325+
def replace_null_bytes(self, value: str) -> str:
310326
return value.replace("\x00", "%00")
311327

312328
@staticmethod
313329
def get_or_create_from_object(url: URL) -> URL:
314-
url.normalize_url_parts()
330+
url.clean()
315331
with transaction.atomic():
316332
try:
317333
return URL.objects.get_or_create(
@@ -339,18 +355,17 @@ def get_or_create_from_values(
339355
path=None,
340356
query=None,
341357
fragment=None,
342-
host_validation_failure=None,
343358
) -> URL:
344-
return URL.get_or_create_from_object(URL(
359+
url = URL(
345360
protocol=protocol,
346361
user_info=user_info,
347362
host=host,
348363
port=port,
349364
path=path,
350365
query=query,
351366
fragment=fragment,
352-
host_validation_failure=host_validation_failure,
353-
))
367+
)
368+
return URL.get_or_create_from_object(url)
354369

355370
@staticmethod
356371
def create_location_from_value(value: str) -> URL:
@@ -365,9 +380,9 @@ def from_value(value: str) -> URL:
365380

366381
parsed_url = URL._parse_string_value(value)
367382

368-
path = parsed_url.path.lstrip("/")[:2048]
369-
query = parsed_url.query[:2048]
370-
fragment = parsed_url.fragment[:2048]
383+
path = parsed_url.path.removeprefix("/")[:2048]
384+
query = parsed_url.query.removeprefix("?")[:2048]
385+
fragment = parsed_url.fragment.removeprefix("#")[:2048]
371386

372387
# Create the initial object, assuming no exceptions are thrown
373388
url = URL(
@@ -379,5 +394,5 @@ def from_value(value: str) -> URL:
379394
query=query,
380395
fragment=fragment,
381396
)
382-
url.normalize_url_parts()
397+
url.clean()
383398
return url

0 commit comments

Comments
 (0)