11from __future__ import annotations
22
33import hashlib
4+ import ipaddress
45from contextlib import suppress
56from dataclasses import dataclass
67from urllib .parse import unquote_plus , urlsplit
78
9+ import idna
810from django .core .exceptions import ValidationError
911from django .core .validators import MaxValueValidator , MinLengthValidator , MinValueValidator
1012from django .db import IntegrityError , transaction
@@ -85,17 +87,32 @@ def parse(self, value: str) -> ParsedUrl:
8587 def unparse (self , url : URL ) -> str :
8688 # path/query are stored as flat text; parse them with Hyperlink
8789 parsed_path_and_query = HyperlinkURL .from_text (f"{ url .path } ?{ url .query } " ).normalize ()
88- return HyperlinkURL (
90+
91+ # Hyperlink assumes the host field is a domain name, and explodes when encoding an IP or something that's not
92+ # quite a valid hostname but Dojo allows anyway. Check if it's one of such explosion-causing cases to determine
93+ # whether we should be sneaky and substitute in the hostname manually after the fact.
94+ unparse_host = True
95+ try :
96+ idna .encode (url .host , uts46 = True )
97+ except idna .IDNAError :
98+ unparse_host = False
99+
100+ normalized = HyperlinkURL (
89101 scheme = url .protocol ,
90102 userinfo = url .user_info ,
91- host = url .host ,
103+ host = url .host if unparse_host else "" ,
92104 port = url .port ,
93105 path = parsed_path_and_query .path ,
94106 rooted = False ,
95107 query = parsed_path_and_query .query ,
96108 fragment = url .fragment ,
97109 # path not normalized if empty, in line with the way Endpoints worked
98- ).normalize (path = bool (url .path )).to_uri ().to_text ().removeprefix ("//" )
110+ ).normalize (path = bool (url .path )).to_uri ()
111+
112+ if not unparse_host :
113+ normalized = normalized .replace (host = url .host )
114+
115+ return normalized .to_text ().removeprefix ("//" )
99116
100117
101118class URL (AbstractLocation ):
@@ -227,30 +244,21 @@ def get_location_type(cls) -> str:
227244 def get_location_value (self ) -> str :
228245 return str (self )[:2048 ]
229246
230- def normalize_url_parts (self ):
247+ @staticmethod
248+ def _parse_string_value (value : str ) -> ParsedUrl :
249+ """Internal method to parse the string representation of the model"""
250+ return URL .URL_PARSING_CLASS ().parse (value )
251+
252+ def clean (self , * args : list , ** kwargs : dict ) -> None :
253+ """Validate the input supplied."""
231254 self .clean_protocol ()
232255 self .clean_user_info ()
233256 self .clean_host ()
234257 self .clean_port ()
235258 self .clean_path ()
236259 self .clean_query ()
237260 self .clean_fragment ()
238- self .clean_host_validation_failure ()
239261 self .set_db_hash ()
240-
241- def pre_save_logic (self ) -> None :
242- """Allow for some pre save operations by other classes."""
243- self .normalize_url_parts ()
244- super ().pre_save_logic ()
245-
246- @staticmethod
247- def _parse_string_value (value : str ) -> ParsedUrl :
248- """Internal method to parse the string representation of the model"""
249- return URL .URL_PARSING_CLASS ().parse (value )
250-
251- def clean (self , * args : list , ** kwargs : dict ) -> None :
252- """Validate the input supplied."""
253- self .normalize_url_parts ()
254262 super ().clean (* args , ** kwargs )
255263
256264 def clean_protocol (self ) -> None :
@@ -263,13 +271,24 @@ def clean_user_info(self):
263271 if not self .user_info :
264272 self .user_info = ""
265273 else :
266- self .user_info = self .remove_null_bytes (self .user_info .strip ())
274+ self .user_info = self .replace_null_bytes (self .user_info .strip ())
267275
268276 def clean_host (self ) -> None :
277+ self .host_validation_failure = False
269278 if not self .host :
270279 self .host = ""
271280 else :
272- self .host = self .host .lower ()
281+ try :
282+ # Check if it's a valid IP address first
283+ self .host = ipaddress .ip_address (self .host ).compressed
284+ except ValueError :
285+ try :
286+ # Attempt to depunify the hostname
287+ self .host = idna .encode (self .host , uts46 = True ).decode ("ascii" )
288+ except idna .IDNAError :
289+ # Some issue with the hostname exists. We'll store it, but are DEFINITELY making a note of this.
290+ self .host = self .replace_null_bytes (self .host .lower ())
291+ self .host_validation_failure = True
273292
274293 def clean_port (self ) -> None :
275294 if not bool (self .port ):
@@ -286,32 +305,29 @@ def clean_path(self):
286305 if not self .path :
287306 self .path = ""
288307 else :
289- self .path = self .remove_null_bytes (self .path .strip ().removeprefix ("/" ))
308+ self .path = self .replace_null_bytes (self .path .strip ().removeprefix ("/" ))
290309
291310 def clean_fragment (self ) -> None :
292311 if not self .fragment :
293312 self .fragment = ""
294313 else :
295- self .fragment = self .remove_null_bytes (self .fragment .strip ().removeprefix ("#" ))
314+ self .fragment = self .replace_null_bytes (self .fragment .strip ().removeprefix ("#" ))
296315
297316 def clean_query (self ) -> None :
298317 if not self .query :
299318 self .query = ""
300319 else :
301- self .query = self .remove_null_bytes (self .query .strip ().removeprefix ("?" ))
302-
303- def clean_host_validation_failure (self ):
304- self .host_validation_failure = bool (self .host_validation_failure )
320+ self .query = self .replace_null_bytes (self .query .strip ().removeprefix ("?" ))
305321
306322 def set_db_hash (self ):
307323 self .hash = hashlib .blake2b (str (self ).encode (), digest_size = 32 ).hexdigest ()
308324
309- def remove_null_bytes (self , value : str ) -> str :
325+ def replace_null_bytes (self , value : str ) -> str :
310326 return value .replace ("\x00 " , "%00" )
311327
312328 @staticmethod
313329 def get_or_create_from_object (url : URL ) -> URL :
314- url .normalize_url_parts ()
330+ url .clean ()
315331 with transaction .atomic ():
316332 try :
317333 return URL .objects .get_or_create (
@@ -339,18 +355,17 @@ def get_or_create_from_values(
339355 path = None ,
340356 query = None ,
341357 fragment = None ,
342- host_validation_failure = None ,
343358 ) -> URL :
344- return URL . get_or_create_from_object ( URL (
359+ url = URL (
345360 protocol = protocol ,
346361 user_info = user_info ,
347362 host = host ,
348363 port = port ,
349364 path = path ,
350365 query = query ,
351366 fragment = fragment ,
352- host_validation_failure = host_validation_failure ,
353- ) )
367+ )
368+ return URL . get_or_create_from_object ( url )
354369
355370 @staticmethod
356371 def create_location_from_value (value : str ) -> URL :
@@ -365,9 +380,9 @@ def from_value(value: str) -> URL:
365380
366381 parsed_url = URL ._parse_string_value (value )
367382
368- path = parsed_url .path .lstrip ("/" )[:2048 ]
369- query = parsed_url .query [:2048 ]
370- fragment = parsed_url .fragment [:2048 ]
383+ path = parsed_url .path .removeprefix ("/" )[:2048 ]
384+ query = parsed_url .query . removeprefix ( "?" ) [:2048 ]
385+ fragment = parsed_url .fragment . removeprefix ( "#" ) [:2048 ]
371386
372387 # Create the initial object, assuming no exceptions are thrown
373388 url = URL (
@@ -379,5 +394,5 @@ def from_value(value: str) -> URL:
379394 query = query ,
380395 fragment = fragment ,
381396 )
382- url .normalize_url_parts ()
397+ url .clean ()
383398 return url
0 commit comments