Skip to content

Commit 48ae05b

Browse files
koxudaxi, github-actions[bot], pre-commit-ci[bot]
authored
Add --reuse-scope option for cross-file model deduplication (#2573)
* feat: add reuse scope for model deduplication across modules * fix: update default value for reuse scope argument to None * docs: update command help in README 🤖 Generated by GitHub Actions * test: add tests for --reuse-scope=tree functionality in JSON schema processing * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * feat: add shared module name option for reuse scope tree functionality * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * docs: update command help in README 🤖 Generated by GitHub Actions * feat: implement reuse scope tree functionality with model deduplication and shared module validation * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * refactor: streamline function definitions and improve code readability * fix: update shared module name help text for --reuse-scope=tree * fix: add error handling for duplicate and canonical models in reuse scope tree * fix: update documentation for reuse scope tree to reflect shared.py changes * feat: enhance reuse scope tree support for dataclasses and TypedDict --------- Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent ba757db commit 48ae05b

75 files changed

Lines changed: 1227 additions & 2 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

README.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -478,6 +478,13 @@ Model customization:
478478
Set name of models defined inline from the parent model
479479
--reuse-model Reuse models on the field when a module has the model with the same
480480
content
481+
--reuse-scope {module,tree}
482+
Scope for model reuse deduplication: module (per-file, default) or
483+
tree (cross-file with shared module). Only effective when --reuse-
484+
model is set.
485+
--shared-module-name SHARED_MODULE_NAME
486+
Name of the shared module for --reuse-scope=tree (default:
487+
"shared"). Use this option if your schema has a file named "shared".
481488
--skip-root-model Skip generating the model for the root schema element
482489
--target-python-version {3.9,3.10,3.11,3.12,3.13,3.14}
483490
target python version

docs/index.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -470,6 +470,13 @@ Model customization:
470470
Set name of models defined inline from the parent model
471471
--reuse-model Reuse models on the field when a module has the model with the same
472472
content
473+
--reuse-scope {module,tree}
474+
Scope for model reuse deduplication: module (per-file, default) or
475+
tree (cross-file with shared module). Only effective when --reuse-
476+
model is set.
477+
--shared-module-name SHARED_MODULE_NAME
478+
Name of the shared module for --reuse-scope=tree (default:
479+
"shared"). Use this option if your schema has a file named "shared".
473480
--skip-root-model Skip generating the model for the root schema element
474481
--target-python-version {3.9,3.10,3.11,3.12,3.13,3.14}
475482
target python version

src/datamodel_code_generator/__init__.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@
5353

5454
MIN_VERSION: Final[int] = 9
5555
MAX_VERSION: Final[int] = 13
56+
DEFAULT_SHARED_MODULE_NAME: Final[str] = "shared"
5657

5758
T = TypeVar("T")
5859

@@ -224,6 +225,17 @@ class DataModelType(Enum):
224225
MsgspecStruct = "msgspec.Struct"
225226

226227

228+
class ReuseScope(Enum):
    """Scope for model reuse deduplication.

    module: Deduplicate identical models within each module (default).
    tree: Deduplicate identical models across all modules, placing shared models in a
        dedicated shared module (``shared.py`` unless a different shared module name
        is configured).
    """

    # Values are the literal strings accepted by the ``--reuse-scope`` CLI option.
    Module = "module"
    Tree = "tree"
237+
238+
227239
class OpenAPIScope(Enum):
228240
"""Scopes for OpenAPI model generation."""
229241

@@ -306,6 +318,8 @@ def generate( # noqa: PLR0912, PLR0913, PLR0914, PLR0915
306318
use_inline_field_description: bool = False,
307319
use_default_kwarg: bool = False,
308320
reuse_model: bool = False,
321+
reuse_scope: ReuseScope = ReuseScope.Module,
322+
shared_module_name: str = DEFAULT_SHARED_MODULE_NAME,
309323
encoding: str = "utf-8",
310324
enum_field_as_literal: LiteralType | None = None,
311325
use_one_literal_as_default: bool = False,
@@ -537,6 +551,8 @@ def get_header_and_first_line(csv_file: IO[str]) -> dict[str, Any]:
537551
use_inline_field_description=use_inline_field_description,
538552
use_default_kwarg=use_default_kwarg,
539553
reuse_model=reuse_model,
554+
reuse_scope=reuse_scope,
555+
shared_module_name=shared_module_name,
540556
enum_field_as_literal=LiteralType.All
541557
if output_model_type == DataModelType.TypingTypedDict
542558
else enum_field_as_literal,

src/datamodel_code_generator/__main__.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,14 @@
1919
from typing_extensions import TypeAlias
2020

2121
from datamodel_code_generator import (
22+
DEFAULT_SHARED_MODULE_NAME,
2223
DataclassArguments,
2324
DataModelType,
2425
Error,
2526
InputFileType,
2627
InvalidClassNameError,
2728
OpenAPIScope,
29+
ReuseScope,
2830
enable_debug_message,
2931
generate,
3032
)
@@ -357,6 +359,8 @@ def validate_root(cls, values: dict[str, Any]) -> dict[str, Any]: # noqa: N805
357359
use_inline_field_description: bool = False
358360
use_default_kwarg: bool = False
359361
reuse_model: bool = False
362+
reuse_scope: ReuseScope = ReuseScope.Module
363+
shared_module_name: str = DEFAULT_SHARED_MODULE_NAME
360364
encoding: str = DEFAULT_ENCODING
361365
enum_field_as_literal: Optional[LiteralType] = None # noqa: UP045
362366
use_one_literal_as_default: bool = False
@@ -535,6 +539,13 @@ def main(args: Sequence[str] | None = None) -> Exit: # noqa: PLR0911, PLR0912,
535539

536540
if config.disable_warnings:
537541
warnings.simplefilter("ignore")
542+
543+
if config.reuse_scope == ReuseScope.Tree and not config.reuse_model:
544+
print( # noqa: T201
545+
"Warning: --reuse-scope=tree has no effect without --reuse-model",
546+
file=sys.stderr,
547+
)
548+
538549
extra_template_data: defaultdict[str, dict[str, Any]] | None
539550
if config.extra_template_data is None:
540551
extra_template_data = None
@@ -616,6 +627,8 @@ def main(args: Sequence[str] | None = None) -> Exit: # noqa: PLR0911, PLR0912,
616627
use_inline_field_description=config.use_inline_field_description,
617628
use_default_kwarg=config.use_default_kwarg,
618629
reuse_model=config.reuse_model,
630+
reuse_scope=config.reuse_scope,
631+
shared_module_name=config.shared_module_name,
619632
encoding=config.encoding,
620633
enum_field_as_literal=config.enum_field_as_literal,
621634
use_one_literal_as_default=config.use_one_literal_as_default,

src/datamodel_code_generator/arguments.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,14 @@
1414
from pathlib import Path
1515
from typing import TYPE_CHECKING, cast
1616

17-
from datamodel_code_generator import DataclassArguments, DataModelType, InputFileType, OpenAPIScope
17+
from datamodel_code_generator import (
18+
DEFAULT_SHARED_MODULE_NAME,
19+
DataclassArguments,
20+
DataModelType,
21+
InputFileType,
22+
OpenAPIScope,
23+
ReuseScope,
24+
)
1825
from datamodel_code_generator.format import DatetimeClassType, Formatter, PythonVersion
1926
from datamodel_code_generator.model.pydantic_v2 import UnionMode
2027
from datamodel_code_generator.parser import LiteralType
@@ -224,6 +231,19 @@ def start_section(self, heading: str | None) -> None:
224231
action="store_true",
225232
default=None,
226233
)
234+
# Scope of --reuse-model deduplication; the accepted values mirror the ReuseScope enum.
model_options.add_argument(
    "--reuse-scope",
    help="Scope for model reuse deduplication: module (per-file, default) or tree (cross-file with shared module). "
    "Only effective when --reuse-model is set.",
    choices=[s.value for s in ReuseScope],
    # NOTE(review): None means "not given on the command line"; presumably the
    # Config default (ReuseScope.Module) then applies -- confirm in __main__.
    default=None,
)
# Name of the module that receives deduplicated models when --reuse-scope=tree.
model_options.add_argument(
    "--shared-module-name",
    help=f'Name of the shared module for --reuse-scope=tree (default: "{DEFAULT_SHARED_MODULE_NAME}"). '
    f'Use this option if your schema has a file named "{DEFAULT_SHARED_MODULE_NAME}".',
    # NOTE(review): None here as well; the documented default is presumably
    # supplied by the Config model rather than by argparse -- confirm.
    default=None,
)
227247
model_options.add_argument(
228248
"--target-python-version",
229249
help="target python version",

src/datamodel_code_generator/parser/base.py

Lines changed: 137 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121
from pydantic import BaseModel
2222

23+
from datamodel_code_generator import DEFAULT_SHARED_MODULE_NAME, Error, ReuseScope
2324
from datamodel_code_generator.format import (
2425
DEFAULT_FORMATTERS,
2526
CodeFormatter,
@@ -405,6 +406,8 @@ def __init__( # noqa: PLR0913, PLR0915
405406
use_inline_field_description: bool = False,
406407
use_default_kwarg: bool = False,
407408
reuse_model: bool = False,
409+
reuse_scope: ReuseScope | None = None,
410+
shared_module_name: str = DEFAULT_SHARED_MODULE_NAME,
408411
encoding: str = "utf-8",
409412
enum_field_as_literal: LiteralType | None = None,
410413
set_default_enum_member: bool = False,
@@ -495,6 +498,8 @@ def __init__( # noqa: PLR0913, PLR0915
495498
self.use_inline_field_description: bool = use_inline_field_description
496499
self.use_default_kwarg: bool = use_default_kwarg
497500
self.reuse_model: bool = reuse_model
501+
self.reuse_scope: ReuseScope | None = reuse_scope
502+
self.shared_module_name: str = shared_module_name
498503
self.encoding: str = encoding
499504
self.enum_field_as_literal: LiteralType | None = enum_field_as_literal
500505
self.set_default_enum_member: bool = set_default_enum_member
@@ -1002,7 +1007,7 @@ def __set_reference_default_value_to_field(cls, models: list[DataModel]) -> None
10021007
model_field.default = model_field.data_type.reference.source.default
10031008

10041009
def __reuse_model(self, models: list[DataModel], require_update_action_models: list[str]) -> None:
1005-
if not self.reuse_model:
1010+
if not self.reuse_model or self.reuse_scope == ReuseScope.Tree:
10061011
return
10071012
model_cache: dict[tuple[HashableComparable, ...], Reference] = {}
10081013
duplicates = []
@@ -1041,6 +1046,133 @@ def __reuse_model(self, models: list[DataModel], require_update_action_models: l
10411046
for duplicate in duplicates:
10421047
models.remove(duplicate)
10431048

1049+
def __find_duplicate_models_across_modules(  # noqa: PLR6301
    self,
    module_models: list[tuple[tuple[str, ...], list[DataModel]]],
) -> list[tuple[tuple[str, ...], DataModel, tuple[str, ...], DataModel]]:
    """Collect models whose rendered output and imports match an earlier model.

    Every model is rendered under the placeholder class name ``"M"`` so that the
    comparison ignores the model's own name. The first model seen for a given
    (render, imports) key is the canonical one; each later model with the same key
    is reported as ``(module, model, canonical_module, canonical_model)``.
    """
    first_seen: dict[tuple[HashableComparable, ...], tuple[tuple[str, ...], DataModel]] = {}
    found: list[tuple[tuple[str, ...], DataModel, tuple[str, ...], DataModel]] = []

    for module, models in module_models:
        for model in models:
            rendered = model.render(class_name="M")
            key = tuple(to_hashable(part) for part in (rendered, model.imports))
            original = first_seen.get(key)
            if original is None:
                # First occurrence of this content: remember it as the canonical model.
                first_seen[key] = (module, model)
                continue
            original_module, original_model = original
            found.append((module, model, original_module, original_model))

    return found
1071+
1072+
def __validate_shared_module_name(
    self,
    module_models: list[tuple[tuple[str, ...], list[DataModel]]],
) -> None:
    """Validate that the shared module name doesn't conflict with existing modules.

    Only the first path component of each module key is compared against
    ``self.shared_module_name``.

    Raises:
        Error: If an existing top-level module or package already uses the shared
            module name.
    """
    shared_module = self.shared_module_name
    # A module key may be the empty tuple; it has no first component and cannot
    # collide with the shared module, so skip it instead of raising IndexError.
    existing_module_names = {module[0] for module, _ in module_models if module}
    if shared_module in existing_module_names:
        msg = (
            f"Schema file or directory '{shared_module}' conflicts with the shared module name. "
            f"Use --shared-module-name to specify a different name."
        )
        raise Error(msg)
1085+
1086+
def __create_shared_module_from_duplicates(  # noqa: PLR0912
    self,
    module_models: list[tuple[tuple[str, ...], list[DataModel]]],
    duplicates: list[tuple[tuple[str, ...], DataModel, tuple[str, ...], DataModel]],
    require_update_action_models: list[str],
) -> tuple[tuple[str, ...], list[DataModel]]:
    """Move canonical models into the shared module and rewire their duplicates.

    Each canonical model named in ``duplicates`` is re-homed to
    ``<shared_module_name>.py`` and removed from its original module list.
    Each duplicate is then handled in one of two ways:

    * Enum models, or output model types outside the pydantic v1 / pydantic v2 /
      dataclass families: references held by models of the same module are
      repointed at the canonical model and the duplicate is dropped.
    * Otherwise: the duplicate is replaced in place by an empty model that
      inherits from the canonical model.

    The lists inside ``module_models`` are mutated in place. Returns the
    ``(module key, models)`` entry for the new shared module.
    """
    shared_module = self.shared_module_name

    shared_models: list[DataModel] = []
    shared_ref_by_canonical: dict[DataModel, Reference] = {}
    promoted: set[DataModel] = set()

    # Walk duplicates in order so shared models keep first-appearance ordering.
    for *_, canonical in duplicates:
        if canonical not in promoted:
            promoted.add(canonical)
            canonical.file_path = Path(f"{shared_module}.py")
            shared_ref_by_canonical[canonical] = canonical.reference
            shared_models.append(canonical)

    inheritable_bases = (
        pydantic_model.BaseModel,
        pydantic_model_v2.BaseModel,
        dataclass_model.DataClass,
    )
    supports_inheritance = issubclass(self.data_model_type, inheritable_bases)

    for duplicate_module, duplicate_model, _, canonical_model in duplicates:
        shared_ref = shared_ref_by_canonical[canonical_model]
        # Locate the first module list that owns this duplicate.
        owner = next(
            (models for module, models in module_models if module == duplicate_module and duplicate_model in models),
            None,
        )
        if owner is None:  # pragma: no cover
            msg = f"Duplicate model {duplicate_model.name} not found in module {duplicate_module}"
            raise RuntimeError(msg)

        if isinstance(duplicate_model, Enum) or not supports_inheritance:
            # No subclass stub possible: repoint same-module references and drop the duplicate.
            for child in duplicate_model.reference.children[:]:
                parent_model = get_most_of_parent(child)
                if parent_model in owner and isinstance(child, DataType):
                    child.replace_reference(shared_ref)
            owner.remove(duplicate_model)
            continue

        position = owner.index(duplicate_model)
        inherited_model = duplicate_model.__class__(
            fields=[],
            base_classes=[shared_ref],
            description=duplicate_model.description,
            reference=Reference(
                name=duplicate_model.name,
                path=duplicate_model.reference.path + "/reuse",
            ),
            custom_template_dir=duplicate_model._custom_template_dir,  # noqa: SLF001
        )
        if shared_ref.path in require_update_action_models:
            add_model_path_to_list(require_update_action_models, inherited_model)
        # Insert the stub at the duplicate's position, then drop the duplicate.
        owner.insert(position, inherited_model)
        owner.remove(duplicate_model)

    # Canonical models now live in the shared module; remove them from their old homes.
    for canonical in promoted:
        home = next((models for _module, models in module_models if canonical in models), None)
        if home is None:  # pragma: no cover
            msg = f"Canonical model {canonical.name} not found in any module"
            raise RuntimeError(msg)
        home.remove(canonical)

    return (shared_module,), shared_models
1159+
1160+
def __reuse_model_tree_scope(
    self,
    module_models: list[tuple[tuple[str, ...], list[DataModel]]],
    require_update_action_models: list[str],
) -> tuple[tuple[str, ...], list[DataModel]] | None:
    """Deduplicate models across all modules into the shared module.

    Does nothing and returns ``None`` unless model reuse is enabled and the reuse
    scope is ``ReuseScope.Tree``, or when no cross-module duplicates exist.
    Otherwise validates the shared module name and returns the
    ``(module key, models)`` entry for the shared module.
    """
    tree_scope_enabled = self.reuse_model and self.reuse_scope == ReuseScope.Tree
    if not tree_scope_enabled:
        return None

    duplicates = self.__find_duplicate_models_across_modules(module_models)
    if duplicates:
        # Only check for a name clash when a shared module will actually be created.
        self.__validate_shared_module_name(module_models)
        return self.__create_shared_module_from_duplicates(module_models, duplicates, require_update_action_models)
    return None
1175+
10441176
def __collapse_root_models( # noqa: PLR0912
10451177
self,
10461178
models: list[DataModel],
@@ -1499,6 +1631,10 @@ def sort_key(data_model: DataModel) -> tuple[int, tuple[str, ...]]:
14991631
))
15001632
previous_module = module
15011633

1634+
shared_module_entry = self.__reuse_model_tree_scope(module_models, require_update_action_models)
1635+
if shared_module_entry:
1636+
module_models.insert(0, shared_module_entry)
1637+
15021638
class Processed(NamedTuple):
15031639
module: tuple[str, ...]
15041640
models: list[DataModel]

src/datamodel_code_generator/parser/graphql.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,13 @@
1515
from urllib.parse import ParseResult
1616

1717
from datamodel_code_generator import (
18+
DEFAULT_SHARED_MODULE_NAME,
1819
DataclassArguments,
1920
DefaultPutDict,
2021
LiteralType,
2122
PythonVersion,
2223
PythonVersionMin,
24+
ReuseScope,
2325
snooper_to_methods,
2426
)
2527
from datamodel_code_generator.format import DEFAULT_FORMATTERS, DatetimeClassType, Formatter
@@ -126,6 +128,8 @@ def __init__( # noqa: PLR0913
126128
use_inline_field_description: bool = False,
127129
use_default_kwarg: bool = False,
128130
reuse_model: bool = False,
131+
reuse_scope: ReuseScope | None = None,
132+
shared_module_name: str = DEFAULT_SHARED_MODULE_NAME,
129133
encoding: str = "utf-8",
130134
enum_field_as_literal: LiteralType | None = None,
131135
set_default_enum_member: bool = False,
@@ -211,6 +215,8 @@ def __init__( # noqa: PLR0913
211215
use_inline_field_description=use_inline_field_description,
212216
use_default_kwarg=use_default_kwarg,
213217
reuse_model=reuse_model,
218+
reuse_scope=reuse_scope,
219+
shared_module_name=shared_module_name,
214220
encoding=encoding,
215221
enum_field_as_literal=enum_field_as_literal,
216222
use_one_literal_as_default=use_one_literal_as_default,

src/datamodel_code_generator/parser/jsonschema.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,10 @@
2020
)
2121

2222
from datamodel_code_generator import (
23+
DEFAULT_SHARED_MODULE_NAME,
2324
DataclassArguments,
2425
InvalidClassNameError,
26+
ReuseScope,
2527
YamlValue,
2628
load_yaml,
2729
load_yaml_dict,
@@ -478,6 +480,8 @@ def __init__( # noqa: PLR0913
478480
use_inline_field_description: bool = False,
479481
use_default_kwarg: bool = False,
480482
reuse_model: bool = False,
483+
reuse_scope: ReuseScope | None = None,
484+
shared_module_name: str = DEFAULT_SHARED_MODULE_NAME,
481485
encoding: str = "utf-8",
482486
enum_field_as_literal: LiteralType | None = None,
483487
use_one_literal_as_default: bool = False,
@@ -564,6 +568,8 @@ def __init__( # noqa: PLR0913
564568
use_inline_field_description=use_inline_field_description,
565569
use_default_kwarg=use_default_kwarg,
566570
reuse_model=reuse_model,
571+
reuse_scope=reuse_scope,
572+
shared_module_name=shared_module_name,
567573
encoding=encoding,
568574
enum_field_as_literal=enum_field_as_literal,
569575
use_one_literal_as_default=use_one_literal_as_default,

0 commit comments

Comments
 (0)