Skip to content

Commit 42de008

Browse files
authored
Add file:// URL protocol support for $ref resolution (#2601)
* Add support for file URL scheme in schema fetching and validation
* Add support for UNC paths in file URL handling
* Add support for relative path resolution in file URL handling
* Add fragment handling and improve file URL joining logic
* Add tests for handling file URLs with UNC and local paths
* Refactor tests to assert path components for file URL handling
* Refactor tests to assert path components for file URL handling
1 parent 52288ff commit 42de008

7 files changed

Lines changed: 255 additions & 18 deletions

File tree

src/datamodel_code_generator/__main__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@ def validate_url(cls, value: Any) -> ParseResult | None: # noqa: N805
166166
return urlparse(value)
167167
if value is None: # pragma: no cover
168168
return None
169-
msg = f"This protocol doesn't support only http/https. --input={value}" # pragma: no cover
169+
msg = f"Unsupported URL scheme. Supported: http, https, file. --input={value}" # pragma: no cover
170170
raise Error(msg) # pragma: no cover
171171

172172
# Pydantic 1.5.1 doesn't support each_item=True correctly
Lines changed: 59 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,26 @@
11
"""HTTP utilities for fetching remote schema files.
22
33
Provides functions to fetch schema content from URLs and join URL references.
4-
Requires the 'http' extra: `pip install 'datamodel-code-generator[http]'`.
4+
HTTP(S) URLs require the 'http' extra: `pip install 'datamodel-code-generator[http]'`.
5+
file:// URLs are handled without additional dependencies.
56
"""
67

78
from __future__ import annotations
89

9-
from typing import TYPE_CHECKING
10+
from typing import TYPE_CHECKING, Any
1011

1112
if TYPE_CHECKING:
1213
from collections.abc import Sequence
1314

14-
try:
15-
import httpx
16-
except ImportError as exc: # pragma: no cover
17-
msg = "Please run `$pip install 'datamodel-code-generator[http]`' to resolve URL Reference"
18-
raise Exception(msg) from exc # noqa: TRY002
15+
16+
def _get_httpx() -> Any:
    """Import and return the optional ``httpx`` module on first use.

    The import happens at call time rather than at module import time, so
    code paths that never touch HTTP(S) do not require ``httpx``.

    Returns:
        The imported ``httpx`` module.

    Raises:
        Exception: If ``httpx`` is not installed. The message names the extra
            to install and the original ``ImportError`` is chained as the cause.
    """
    try:
        import httpx  # noqa: PLC0415
    except ImportError as exc:  # pragma: no cover
        # The closing backtick and quote used to be interleaved (`...[http]`'),
        # which made the suggested command impossible to copy verbatim.
        msg = "Please run `pip install 'datamodel-code-generator[http]'` to resolve HTTP(S) URL references"
        raise Exception(msg) from exc  # noqa: TRY002
    return httpx
1924

2025

2126
def get_body(
@@ -25,6 +30,7 @@ def get_body(
2530
query_parameters: Sequence[tuple[str, str]] | None = None,
2631
) -> str:
2732
"""Fetch content from a URL with optional headers and query parameters."""
33+
httpx = _get_httpx()
2834
return httpx.get(
2935
url,
3036
headers=headers,
@@ -35,6 +41,51 @@ def get_body(
3541
).text
3642

3743

38-
def join_url(url: str, ref: str = ".") -> str:
    """Join a base URL with a relative reference.

    ``file://`` bases are resolved with the standard library only; every
    other base is delegated to ``httpx.URL.join``.

    Args:
        url: Base URL of the referring document.
        ref: Reference to resolve against ``url``. May be relative, absolute,
            fragment-only, or empty.

    Returns:
        The resolved URL as a string.
    """
    if url.startswith("file://"):
        return _join_file_url(url, ref)
    httpx = _get_httpx()
    return str(httpx.URL(url).join(ref))


def _join_file_url(url: str, ref: str) -> str:
    """Resolve ``ref`` against a ``file://`` base URL without third-party dependencies."""
    from urllib.parse import urlparse  # noqa: PLC0415

    # A reference that carries its own supported scheme is already absolute;
    # it must not be appended to the base path as if it were a relative segment.
    if ref.startswith(("file://", "http://", "https://")):
        return ref

    ref_path, hash_sep, fragment = ref.partition("#")
    suffix = f"#{fragment}" if hash_sep else ""

    # Fragment-only (or empty) ref: keep the base document, replacing its fragment.
    if not ref_path:
        return url.split("#", maxsplit=1)[0] + suffix

    parsed = urlparse(url)
    if ref_path.startswith("/"):
        # Absolute path: replaces the base path but keeps the base's netloc.
        joined_path = ref_path
    else:
        segments = parsed.path.lstrip("/").split("/")
        if not segments[0]:
            # Base path was just "/", so there is no directory to start from.
            segments = []
        # Drop the base document's file name (a no-op on an empty list).
        segments = segments[:-1]

        # With a netloc (UNC-style URL) the first path segment is kept:
        # ".." may never climb above it.
        min_depth = 1 if parsed.netloc else 0
        for segment in ref_path.split("/"):
            if segment in {"", "."}:
                continue
            if segment == "..":
                if len(segments) > min_depth:
                    segments.pop()
                continue
            segments.append(segment)

        joined_path = "/" + "/".join(segments)
        # Preserve a directory-style trailing slash. When everything collapsed
        # to the root, joined_path is already "/" and appending another slash
        # would yield "file:////".
        if ref_path.endswith("/") and not joined_path.endswith("/"):
            joined_path += "/"

    return f"file://{parsed.netloc}{joined_path}{suffix}"

src/datamodel_code_generator/parser/jsonschema.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1964,8 +1964,21 @@ def _get_ref_body(self, resolved_ref: str) -> dict[str, YamlValue]:
19641964
return self._get_ref_body_from_remote(resolved_ref)
19651965

19661966
def _get_ref_body_from_url(self, ref: str) -> dict[str, YamlValue]:
    """Get reference body from a URL (HTTP, HTTPS, or file scheme).

    ``file://`` references are converted to a filesystem path and loaded with
    ``load_yaml_dict_from_path``; any other reference is fetched as text via
    ``self._get_text_from_url`` and parsed with ``load_yaml_dict``. Both paths
    go through ``self.remote_object_cache.get_or_put`` with ``ref`` as the key.

    Args:
        ref: The resolved URL of the referenced document.

    Returns:
        The parsed document body.
    """
    if ref.startswith("file://"):
        from urllib.parse import urlparse  # noqa: PLC0415
        from urllib.request import url2pathname  # noqa: PLC0415

        parsed = urlparse(ref)
        # url2pathname handles percent-decoding and Windows drive letters
        path = url2pathname(parsed.path)
        # Handle UNC paths (file://server/share/path). The host "localhost"
        # denotes the local machine (RFC 8089), so it must stay a plain local
        # path instead of becoming //localhost/...
        if parsed.netloc and parsed.netloc.lower() != "localhost":
            path = f"//{parsed.netloc}{path}"
        file_path = Path(path)
        return self.remote_object_cache.get_or_put(
            ref, default_factory=lambda _: load_yaml_dict_from_path(file_path, self.encoding)
        )
    # URL Reference: $ref: 'http://path/to/your/resource' uses the whole document located on a different server.
    return self.remote_object_cache.get_or_put(
        ref, default_factory=lambda key: load_yaml_dict(self._get_text_from_url(key))
    )

src/datamodel_code_generator/reference.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -481,7 +481,7 @@ def base_url_context(self, base_url: str | None) -> Generator[None, None, None]:
481481
"""Temporarily set the base URL within a context.
482482
483483
Only sets the base_url if:
484-
- The new value is actually a URL (http:// or https://)
484+
- The new value is actually a URL (http://, https://, or file://)
485485
- OR _base_url was already set (switching between URLs)
486486
This preserves backward compatibility for local file parsing where
487487
this method was previously a no-op.
@@ -858,5 +858,5 @@ def snake_to_upper_camel(word: str, delimiter: str = "_") -> str:
858858

859859

860860
def is_url(ref: str) -> bool:
    """Return True when ``ref`` starts with a supported URL scheme prefix.

    Supported prefixes are ``https://``, ``http://`` and ``file://``; the
    comparison is a plain, case-sensitive prefix match.
    """
    supported_prefixes = ("https://", "http://", "file://")
    return ref.startswith(supported_prefixes)

tests/main/jsonschema/test_main_jsonschema.py

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3085,3 +3085,86 @@ def test_main_jsonschema_collapse_root_models_with_optional(output_file: Path) -
30853085
assert_func=assert_file_content,
30863086
extra_args=["--collapse-root-models"],
30873087
)
3088+
3089+
3090+
def test_main_jsonschema_file_url_ref(tmp_path: Path) -> None:
    """A ``$ref`` written as an absolute ``file://`` URI resolves to the schema on disk."""
    # Referenced document: a standalone Pet schema.
    referenced_path = tmp_path / "pet.json"
    referenced_path.write_text(
        json.dumps({
            "$schema": "https://json-schema.org/draft/2020-12/schema",
            "type": "object",
            "properties": {"name": {"type": "string"}, "age": {"type": "integer"}},
            "required": ["name"],
        })
    )

    # Root document points at the Pet schema through its file:// URI.
    root_path = tmp_path / "main.json"
    root_path.write_text(
        json.dumps({
            "$schema": "https://json-schema.org/draft/2020-12/schema",
            "type": "object",
            "properties": {"pet": {"$ref": referenced_path.as_uri()}},
        })
    )

    generated = (
        "# generated by datamodel-codegen:\n"
        "# filename: main.json\n\n"
        "from __future__ import annotations\n\n"
        "from typing import Optional\n\n"
        "from pydantic import BaseModel\n\n\n"
        "class Pet(BaseModel):\n"
        " name: str\n"
        " age: Optional[int] = None\n\n\n"
        "class Model(BaseModel):\n"
        " pet: Optional[Pet] = None\n"
    )
    run_main_and_assert(
        input_path=root_path,
        output_path=tmp_path / "output.py",
        input_file_type="jsonschema",
        expected_output=generated,
        ignore_whitespace=True,
        extra_args=["--disable-timestamp"],
    )
3129+
3130+
3131+
def test_main_jsonschema_file_url_ref_percent_encoded(tmp_path: Path) -> None:
    """A ``file://`` ``$ref`` whose path contains a space is resolved.

    The referenced file lives in a directory named ``my schemas`` and the
    ``$ref`` is produced with ``Path.as_uri()``.
    """
    spaced_dir = tmp_path / "my schemas"
    spaced_dir.mkdir()

    # Referenced document inside the directory with a space in its name.
    referenced_path = spaced_dir / "pet.json"
    referenced_path.write_text(
        json.dumps({
            "$schema": "https://json-schema.org/draft/2020-12/schema",
            "type": "object",
            "properties": {"name": {"type": "string"}},
        })
    )

    # Root document references it by file:// URI.
    root_path = tmp_path / "main.json"
    root_path.write_text(
        json.dumps({
            "$schema": "https://json-schema.org/draft/2020-12/schema",
            "type": "object",
            "properties": {"pet": {"$ref": referenced_path.as_uri()}},
        })
    )

    generated = (
        "# generated by datamodel-codegen:\n"
        "# filename: main.json\n\n"
        "from __future__ import annotations\n\n"
        "from typing import Optional\n\n"
        "from pydantic import BaseModel\n\n\n"
        "class Pet(BaseModel):\n"
        " name: Optional[str] = None\n\n\n"
        "class Model(BaseModel):\n"
        " pet: Optional[Pet] = None\n"
    )
    run_main_and_assert(
        input_path=root_path,
        output_path=tmp_path / "output.py",
        input_file_type="jsonschema",
        expected_output=generated,
        ignore_whitespace=True,
        extra_args=["--disable-timestamp"],
    )

tests/parser/test_jsonschema.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -807,3 +807,40 @@ def test_create_data_model_dataclass_arguments(
807807
result = parser._create_data_model(**kwargs)
808808
assert isinstance(result, DataClass)
809809
assert result.dataclass_arguments == expected
810+
811+
812+
def test_get_ref_body_from_url_file_unc_path(mocker: MockerFixture) -> None:
    """``_get_ref_body_from_url`` passes a UNC-style path to the loader for ``file://server/...`` URLs."""
    loader = mocker.patch(
        "datamodel_code_generator.parser.jsonschema.load_yaml_dict_from_path",
        return_value={"type": "object"},
    )
    parser = JsonSchemaParser("")

    body = parser._get_ref_body_from_url("file://server/share/schemas/pet.json")

    assert body == {"type": "object"}
    loader.assert_called_once()
    positional_args = loader.call_args[0]
    loaded_path = positional_args[0]
    # On Windows, UNC paths have \\server\share\ as a single "drive" part
    # On POSIX, they're separate: /, server, share, schemas, pet.json
    # so only the string form and the trailing components are checked.
    rendered = str(loaded_path)
    assert "server" in rendered
    assert "share" in rendered
    assert loaded_path.parts[-2:] == ("schemas", "pet.json")
831+
832+
833+
def test_get_ref_body_from_url_file_local_path(mocker: MockerFixture) -> None:
    """``_get_ref_body_from_url`` passes a local path to the loader for ``file:///...`` URLs (no netloc)."""
    loader = mocker.patch(
        "datamodel_code_generator.parser.jsonschema.load_yaml_dict_from_path",
        return_value={"type": "string"},
    )
    parser = JsonSchemaParser("")

    body = parser._get_ref_body_from_url("file:///home/user/schemas/pet.json")

    assert body == {"type": "string"}
    loader.assert_called_once()
    positional_args = loader.call_args[0]
    loaded_path = positional_args[0]
    # Compare trailing components only, so the platform-specific root is ignored.
    assert loaded_path.parts[-4:] == ("home", "user", "schemas", "pet.json")

tests/test_reference.py

Lines changed: 57 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
import pytest
88

9+
from datamodel_code_generator.http import join_url
910
from datamodel_code_generator.reference import ModelResolver, get_relative_path, is_url
1011

1112

@@ -171,12 +172,15 @@ def test_resolve_ref_local_fragment_with_base_url() -> None:
171172
@pytest.mark.parametrize(
172173
("ref", "expected"),
173174
[
174-
# HTTP/HTTPS URLs (only supported schemes)
175+
# HTTP/HTTPS URLs
175176
("https://example.com/schema.json", True),
176177
("http://example.com/schema.json", True),
177178
("https://example.com/path/to/schema.json", True),
178-
# file:// URLs - NOT recognized (fetcher only supports HTTP)
179-
("file:///home/user/schema.json", False),
179+
# file:// URLs - recognized and handled via filesystem
180+
("file:///home/user/schema.json", True),
181+
("file:///C:/path/to/schema.json", True),
182+
("file://server/share/schema.json", True),
183+
# file:/ (single slash) - NOT recognized as valid file URL
180184
("file:/home/user/schema.json", False),
181185
# Other URL schemes - NOT recognized
182186
("ftp://example.com/schema.json", False),
@@ -194,7 +198,7 @@ def test_resolve_ref_local_fragment_with_base_url() -> None:
194198
],
195199
)
196200
def test_is_url(ref: str, expected: bool) -> None:
197-
"""Test is_url correctly identifies HTTP(S) URLs only."""
201+
"""Test is_url correctly identifies HTTP(S) and file:// URLs."""
198202
assert is_url(ref) == expected
199203

200204

@@ -207,3 +211,52 @@ def test_resolve_ref_with_root_id_differs_from_base_url() -> None:
207211
result = resolver.resolve_ref("../common/types.json")
208212

209213
assert result == "https://example.com/common/types.json#"
214+
215+
216+
# (base_url, ref, expected) triples exercised by test_join_url_file_scheme.
_FILE_URL_JOIN_CASES = [
    # Relative refs
    ("file:///home/user/schemas/main.json", "../common/types.json", "file:///home/user/common/types.json"),
    ("file:///home/user/schemas/main.json", "other.json", "file:///home/user/schemas/other.json"),
    ("file:///home/user/schemas/main.json", "./sub/schema.json", "file:///home/user/schemas/sub/schema.json"),
    # Absolute file:// refs are returned unchanged
    ("file:///home/user/schemas/main.json", "file:///other/schema.json", "file:///other/schema.json"),
    # Absolute path refs (leading "/") replace the path and keep the netloc
    ("file:///home/user/schemas/main.json", "/absolute/path.json", "file:///absolute/path.json"),
    ("file://server/share/main.json", "/absolute/path.json", "file://server/absolute/path.json"),
    # Windows-style drive letter in the path
    ("file:///C:/schemas/main.json", "../common/types.json", "file:///C:/common/types.json"),
    # UNC bases
    ("file://server/share/main.json", "../common/types.json", "file://server/share/common/types.json"),
    ("file://server/share/main.json", "child.json", "file://server/share/child.json"),
    # Fragments
    (
        "file:///home/user/schemas/main.json",
        "other.json#/definitions/Foo",
        "file:///home/user/schemas/other.json#/definitions/Foo",
    ),
    (
        "file:///home/user/schemas/main.json",
        "#/definitions/Bar",
        "file:///home/user/schemas/main.json#/definitions/Bar",
    ),
    # Repeated ".." stops at the root for non-UNC bases
    ("file:///a/b/main.json", "../../../other.json", "file:///other.json"),
    # Repeated ".." stops at the share level for UNC bases (min_depth=1)
    ("file://server/share/a/b/main.json", "../../../../other.json", "file://server/share/other.json"),
    # Empty and dot segments are skipped
    ("file:///home/user/schemas/main.json", "./", "file:///home/user/schemas/"),
    ("file:///home/user/schemas/main.json", "a//b/./c.json", "file:///home/user/schemas/a/b/c.json"),
    # Bare "#" keeps the base and appends an empty fragment
    ("file:///home/user/schemas/main.json", "#", "file:///home/user/schemas/main.json#"),
    # Empty ref leaves the base URL unchanged
    ("file:///home/user/schemas/main.json", "", "file:///home/user/schemas/main.json"),
    # Base at (or directly under) the root directory
    ("file:///", "schema.json", "file:///schema.json"),
    ("file:///main.json", "../other.json", "file:///other.json"),
]


@pytest.mark.parametrize(("base_url", "ref", "expected"), _FILE_URL_JOIN_CASES)
def test_join_url_file_scheme(base_url: str, ref: str, expected: str) -> None:
    """``join_url`` resolves ``ref`` against a ``file://`` base to the expected URL."""
    resolved = join_url(base_url, ref)
    assert resolved == expected

0 commit comments

Comments
 (0)