From c39e0b3407a4cf6dccc7c28314733b3191388ab7 Mon Sep 17 00:00:00 2001 From: Pkcha Date: Sun, 12 Apr 2026 15:59:52 +0800 Subject: [PATCH 1/5] feat: initial release --- Dockerfile | 21 +++ src/ydata_profiling/config.py | 142 ++++++++---------- src/ydata_profiling/model/handler.py | 20 +-- src/ydata_profiling/model/summarizer.py | 31 ++-- src/ydata_profiling/profile_report.py | 6 +- .../report/structure/__init__.py | 22 +++ .../report/structure/report.py | 2 +- 7 files changed, 130 insertions(+), 114 deletions(-) create mode 100644 Dockerfile diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 000000000..7bb15bf5d --- /dev/null +++ b/Dockerfile @@ -0,0 +1,21 @@ +FROM python:3.10-slim + +WORKDIR /app + +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + && rm -rf /var/lib/apt/lists/* + +COPY . . + +RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir "setuptools>=72.0.0,<80.0.0" wheel && \ + pip install --no-cache-dir . && \ + pip install --no-cache-dir "setuptools>=72.0.0,<80.0.0" && \ + pip install --no-cache-dir jupyter + +EXPOSE 8888 + +CMD ["jupyter", "notebook", "--ip=0.0.0.0", "--port=8888", "--no-browser", "--allow-root"] + + diff --git a/src/ydata_profiling/config.py b/src/ydata_profiling/config.py index 09dbecdde..2bb934ed1 100644 --- a/src/ydata_profiling/config.py +++ b/src/ydata_profiling/config.py @@ -6,24 +6,7 @@ import yaml from pydantic.v1 import BaseModel, BaseSettings, Field, PrivateAttr - -def _merge_dictionaries(dict1: dict, dict2: dict) -> dict: - """ - Recursive merge dictionaries. - - :param dict1: Base dictionary to merge. - :param dict2: Dictionary to merge on top of base dictionary. - :return: Merged dictionary - """ - for key, val in dict1.items(): - if isinstance(val, dict): - dict2_node = dict2.setdefault(key, {}) - _merge_dictionaries(val, dict2_node) - else: - if key not in dict2: - dict2[key] = val - - return dict2 +from ydata_profiling.utils.common import update class Dataset(BaseModel): @@ -355,60 +338,7 @@ class Config: html: Html = Html() notebook: Notebook = Notebook() - def update(self, updates: dict) -> "Settings": - update = _merge_dictionaries(self.dict(), updates) - return self.parse_obj(self.copy(update=update)) - - @staticmethod - def from_file(config_file: Union[Path, str]) -> "Settings": - """Create a Settings object from a yaml file. - - Args: - config_file: yaml file path - Returns: - Settings - """ - with open(config_file) as f: - data = yaml.safe_load(f) - - return Settings.parse_obj(data) - - -class SparkSettings(Settings): - """ - Setting class with the standard report configuration for Spark DataFrames - All the supported analysis are set to true - """ - - vars: Univariate = Univariate() - - vars.num.low_categorical_threshold = 0 - - infer_dtypes: bool = False - - correlations: Dict[str, Correlation] = { - "spearman": Correlation(key="spearman", calculate=True), - "pearson": Correlation(key="pearson", calculate=True), - } - - correlation_table: bool = True - - interactions: Interactions = Interactions() - interactions.continuous = False - - missing_diagrams: Dict[str, bool] = { - "bar": False, - "matrix": False, - "dendrogram": False, - "heatmap": False, - } - samples: Samples = Samples() - samples.tail = 0 - samples.random = 0 - - -class Config: - arg_groups: Dict[str, Any] = { + _arg_groups: Dict[str, Any] = { "sensitive": { "samples": None, "duplicates": None, @@ -475,8 +405,8 @@ class Config: @staticmethod def get_arg_groups(key: str) -> dict: - kwargs = Config.arg_groups[key] - shorthand_args, _ = Config.shorthands(kwargs, split=False) + kwargs = Settings._arg_groups[key] + shorthand_args, _ = Settings.shorthands(kwargs, split=False) return shorthand_args @staticmethod @@ -485,8 +415,8 @@ def shorthands(kwargs: dict, split: bool = True) -> Tuple[dict, dict]: if not split: shorthand_args = kwargs for key, value in list(kwargs.items()): - if value is None and key in Config._shorthands: - shorthand_args[key] = Config._shorthands[key] + if value is None and key in Settings._shorthands: + shorthand_args[key] = Settings._shorthands[key] if split: del kwargs[key] @@ -494,3 +424,63 @@ def shorthands(kwargs: dict, split: bool = True) -> Tuple[dict, dict]: return shorthand_args, kwargs else: return shorthand_args, {} + + def update(self, updates: dict) -> "Settings": + merged = update(self.dict().copy(), updates) + return self.parse_obj(self.copy(update=merged)) + + @staticmethod + def from_file(config_file: Union[Path, str]) -> "Settings": + """Create a Settings object from a yaml file. + + Args: + config_file: yaml file path + Returns: + Settings + """ + with open(config_file) as f: + data = yaml.safe_load(f) + + return Settings.parse_obj(data) + + +class SparkSettings(Settings): + """ + Setting class with the standard report configuration for Spark DataFrames + All the supported analysis are set to true + """ + + vars: Univariate = Univariate() + + vars.num.low_categorical_threshold = 0 + + infer_dtypes: bool = False + + correlations: Dict[str, Correlation] = { + "spearman": Correlation(key="spearman", calculate=True), + "pearson": Correlation(key="pearson", calculate=True), + } + + correlation_table: bool = True + + interactions: Interactions = Interactions() + interactions.continuous = False + + missing_diagrams: Dict[str, bool] = { + "bar": False, + "matrix": False, + "dendrogram": False, + "heatmap": False, + } + samples: Samples = Samples() + samples.tail = 0 + samples.random = 0 + + +class Config(Settings): + """ + Deprecated: Use Settings instead. + Backward compatibility alias for Settings class. + """ + + pass diff --git a/src/ydata_profiling/model/handler.py b/src/ydata_profiling/model/handler.py index 992c1840c..e983ce2a1 100644 --- a/src/ydata_profiling/model/handler.py +++ b/src/ydata_profiling/model/handler.py @@ -60,22 +60,6 @@ def handle(self, dtype: str, *args, **kwargs) -> dict: return summary -def get_render_map() -> Dict[str, Callable]: - import ydata_profiling.report.structure.variables as render_algorithms +from ydata_profiling.report.structure import get_render_map - render_map = { - "Boolean": render_algorithms.render_boolean, - "Numeric": render_algorithms.render_real, - "Complex": render_algorithms.render_complex, - "Text": render_algorithms.render_text, - "DateTime": render_algorithms.render_date, - "Categorical": render_algorithms.render_categorical, - "URL": render_algorithms.render_url, - "Path": render_algorithms.render_path, - "File": render_algorithms.render_file, - "Image": render_algorithms.render_image, - "Unsupported": render_algorithms.render_generic, - "TimeSeries": render_algorithms.render_timeseries, - } - - return render_map +__all__ = ["compose", "Handler", "get_render_map"] diff --git a/src/ydata_profiling/model/summarizer.py b/src/ydata_profiling/model/summarizer.py index d733a7d36..a57ed1c97 100644 --- a/src/ydata_profiling/model/summarizer.py +++ b/src/ydata_profiling/model/summarizer.py @@ -50,9 +50,8 @@ def summarize( return self.handle(str(dtype), config, series, {"type": str(dtype)}) -# Revisit this with the correct support for Spark as well. class ProfilingSummarizer(BaseSummarizer): - """A summarizer for Pandas DataFrames.""" + """A summarizer supporting both Pandas and Spark DataFrames.""" def __init__(self, typeset: VisionsTypeset, use_spark: bool = False): self.use_spark = use_spark and is_pyspark_installed() @@ -65,7 +64,15 @@ def summary_map(self) -> Dict[str, List[Callable]]: return self._summary_map def _create_summary_map(self) -> Dict[str, List[Callable]]: - """Creates the summary map for Pandas summarization.""" + """Creates the summary map based on the backend.""" + common_map = { + "URL": [describe_url_1d], + "Path": [describe_path_1d], + "File": [describe_file_1d], + "Image": [describe_image_1d], + "TimeSeries": [describe_timeseries_1d], + } + if self.use_spark: from ydata_profiling.model.spark import ( describe_boolean_1d_spark, @@ -78,7 +85,7 @@ def _create_summary_map(self) -> Dict[str, List[Callable]]: describe_text_1d_spark, ) - summary_map = { + base_map = { "Unsupported": [ describe_counts_spark, describe_generic_spark, @@ -89,14 +96,9 @@ def _create_summary_map(self) -> Dict[str, List[Callable]]: "Text": [describe_text_1d_spark], "Categorical": [describe_categorical_1d_spark], "Boolean": [describe_boolean_1d_spark], - "URL": [describe_url_1d], - "Path": [describe_path_1d], - "File": [describe_file_1d], - "Image": [describe_image_1d], - "TimeSeries": [describe_timeseries_1d], } else: - summary_map = { + base_map = { "Unsupported": [ pandas_describe_counts, pandas_describe_generic, @@ -107,13 +109,10 @@ def _create_summary_map(self) -> Dict[str, List[Callable]]: "Text": [pandas_describe_text_1d], "Categorical": [pandas_describe_categorical_1d], "Boolean": [pandas_describe_boolean_1d], - "URL": [pandas_describe_url_1d], - "Path": [pandas_describe_path_1d], - "File": [pandas_describe_file_1d], - "Image": [pandas_describe_image_1d], - "TimeSeries": [pandas_describe_timeseries_1d], } - return summary_map + + base_map.update(common_map) + return base_map def format_summary(summary: Union[BaseDescription, dict]) -> dict: diff --git a/src/ydata_profiling/profile_report.py b/src/ydata_profiling/profile_report.py index a7d6d9134..916b4681e 100644 --- a/src/ydata_profiling/profile_report.py +++ b/src/ydata_profiling/profile_report.py @@ -25,7 +25,7 @@ from typeguard import typechecked from visions import VisionsTypeset -from ydata_profiling.config import Config, Settings, SparkSettings +from ydata_profiling.config import Settings, SparkSettings from ydata_profiling.expectations_report import ExpectationsReport from ydata_profiling.model import BaseDescription from ydata_profiling.model.alerts import AlertType @@ -132,11 +132,11 @@ def __init__( cfg = Settings() for condition, key in groups: if condition: - cfg = cfg.update(Config.get_arg_groups(key)) + cfg = cfg.update(Settings.get_arg_groups(key)) report_config = report_config.update(cfg.dict(exclude_defaults=True)) if len(kwargs) > 0: - shorthands, kwargs = Config.shorthands(kwargs) + shorthands, kwargs = Settings.shorthands(kwargs) report_config = report_config.update( Settings().update(shorthands).dict(exclude_defaults=True) ) diff --git a/src/ydata_profiling/report/structure/__init__.py b/src/ydata_profiling/report/structure/__init__.py index 8324d248d..a2efd029a 100644 --- a/src/ydata_profiling/report/structure/__init__.py +++ b/src/ydata_profiling/report/structure/__init__.py @@ -1 +1,23 @@ """Data structure for the report""" +from typing import Callable, Dict + + +def get_render_map() -> Dict[str, Callable]: + import ydata_profiling.report.structure.variables as render_algorithms + + render_map = { + "Boolean": render_algorithms.render_boolean, + "Numeric": render_algorithms.render_real, + "Complex": render_algorithms.render_complex, + "Text": render_algorithms.render_text, + "DateTime": render_algorithms.render_date, + "Categorical": render_algorithms.render_categorical, + "URL": render_algorithms.render_url, + "Path": render_algorithms.render_path, + "File": render_algorithms.render_file, + "Image": render_algorithms.render_image, + "Unsupported": render_algorithms.render_generic, + "TimeSeries": render_algorithms.render_timeseries, + } + + return render_map diff --git a/src/ydata_profiling/report/structure/report.py b/src/ydata_profiling/report/structure/report.py index 482b410b2..b64a41aae 100644 --- a/src/ydata_profiling/report/structure/report.py +++ b/src/ydata_profiling/report/structure/report.py @@ -7,7 +7,7 @@ from ydata_profiling.config import Settings from ydata_profiling.model import BaseDescription from ydata_profiling.model.alerts import AlertType -from ydata_profiling.model.handler import get_render_map +from ydata_profiling.report.structure import get_render_map from ydata_profiling.report.presentation.core import ( HTML, Collapse, From 27a314be64b586f58de6a2956d456e2a3d03da1f Mon Sep 17 00:00:00 2001 From: Pkcha Date: Sun, 12 Apr 2026 17:14:23 +0800 Subject: [PATCH 2/5] feat: initial release --- src/ydata_profiling/config.py | 142 +++++++++++++----------- src/ydata_profiling/model/handler.py | 2 - src/ydata_profiling/model/summarizer.py | 31 +++--- src/ydata_profiling/profile_report.py | 6 +- 4 files changed, 95 insertions(+), 86 deletions(-) diff --git a/src/ydata_profiling/config.py b/src/ydata_profiling/config.py index 2bb934ed1..09dbecdde 100644 --- a/src/ydata_profiling/config.py +++ b/src/ydata_profiling/config.py @@ -6,7 +6,24 @@ import yaml from pydantic.v1 import BaseModel, BaseSettings, Field, PrivateAttr -from ydata_profiling.utils.common import update + +def _merge_dictionaries(dict1: dict, dict2: dict) -> dict: + """ + Recursive merge dictionaries. + + :param dict1: Base dictionary to merge. + :param dict2: Dictionary to merge on top of base dictionary. + :return: Merged dictionary + """ + for key, val in dict1.items(): + if isinstance(val, dict): + dict2_node = dict2.setdefault(key, {}) + _merge_dictionaries(val, dict2_node) + else: + if key not in dict2: + dict2[key] = val + + return dict2 class Dataset(BaseModel): @@ -338,7 +355,60 @@ class Config: html: Html = Html() notebook: Notebook = Notebook() - _arg_groups: Dict[str, Any] = { + def update(self, updates: dict) -> "Settings": + update = _merge_dictionaries(self.dict(), updates) + return self.parse_obj(self.copy(update=update)) + + @staticmethod + def from_file(config_file: Union[Path, str]) -> "Settings": + """Create a Settings object from a yaml file. + + Args: + config_file: yaml file path + Returns: + Settings + """ + with open(config_file) as f: + data = yaml.safe_load(f) + + return Settings.parse_obj(data) + + +class SparkSettings(Settings): + """ + Setting class with the standard report configuration for Spark DataFrames + All the supported analysis are set to true + """ + + vars: Univariate = Univariate() + + vars.num.low_categorical_threshold = 0 + + infer_dtypes: bool = False + + correlations: Dict[str, Correlation] = { + "spearman": Correlation(key="spearman", calculate=True), + "pearson": Correlation(key="pearson", calculate=True), + } + + correlation_table: bool = True + + interactions: Interactions = Interactions() + interactions.continuous = False + + missing_diagrams: Dict[str, bool] = { + "bar": False, + "matrix": False, + "dendrogram": False, + "heatmap": False, + } + samples: Samples = Samples() + samples.tail = 0 + samples.random = 0 + + +class Config: + arg_groups: Dict[str, Any] = { "sensitive": { "samples": None, "duplicates": None, @@ -405,8 +475,8 @@ class Config: @staticmethod def get_arg_groups(key: str) -> dict: - kwargs = Settings._arg_groups[key] - shorthand_args, _ = Settings.shorthands(kwargs, split=False) + kwargs = Config.arg_groups[key] + shorthand_args, _ = Config.shorthands(kwargs, split=False) return shorthand_args @staticmethod @@ -415,8 +485,8 @@ def shorthands(kwargs: dict, split: bool = True) -> Tuple[dict, dict]: if not split: shorthand_args = kwargs for key, value in list(kwargs.items()): - if value is None and key in Settings._shorthands: - shorthand_args[key] = Settings._shorthands[key] + if value is None and key in Config._shorthands: + shorthand_args[key] = Config._shorthands[key] if split: del kwargs[key] @@ -424,63 +494,3 @@ def shorthands(kwargs: dict, split: bool = True) -> Tuple[dict, dict]: return shorthand_args, kwargs else: return shorthand_args, {} - - def update(self, updates: dict) -> "Settings": - merged = update(self.dict().copy(), updates) - return self.parse_obj(self.copy(update=merged)) - - @staticmethod - def from_file(config_file: Union[Path, str]) -> "Settings": - """Create a Settings object from a yaml file. - - Args: - config_file: yaml file path - Returns: - Settings - """ - with open(config_file) as f: - data = yaml.safe_load(f) - - return Settings.parse_obj(data) - - -class SparkSettings(Settings): - """ - Setting class with the standard report configuration for Spark DataFrames - All the supported analysis are set to true - """ - - vars: Univariate = Univariate() - - vars.num.low_categorical_threshold = 0 - - infer_dtypes: bool = False - - correlations: Dict[str, Correlation] = { - "spearman": Correlation(key="spearman", calculate=True), - "pearson": Correlation(key="pearson", calculate=True), - } - - correlation_table: bool = True - - interactions: Interactions = Interactions() - interactions.continuous = False - - missing_diagrams: Dict[str, bool] = { - "bar": False, - "matrix": False, - "dendrogram": False, - "heatmap": False, - } - samples: Samples = Samples() - samples.tail = 0 - samples.random = 0 - - -class Config(Settings): - """ - Deprecated: Use Settings instead. - Backward compatibility alias for Settings class. - """ - - pass diff --git a/src/ydata_profiling/model/handler.py b/src/ydata_profiling/model/handler.py index e983ce2a1..bcca12a1c 100644 --- a/src/ydata_profiling/model/handler.py +++ b/src/ydata_profiling/model/handler.py @@ -60,6 +60,4 @@ def handle(self, dtype: str, *args, **kwargs) -> dict: return summary -from ydata_profiling.report.structure import get_render_map -__all__ = ["compose", "Handler", "get_render_map"] diff --git a/src/ydata_profiling/model/summarizer.py b/src/ydata_profiling/model/summarizer.py index a57ed1c97..d733a7d36 100644 --- a/src/ydata_profiling/model/summarizer.py +++ b/src/ydata_profiling/model/summarizer.py @@ -50,8 +50,9 @@ def summarize( return self.handle(str(dtype), config, series, {"type": str(dtype)}) +# Revisit this with the correct support for Spark as well. class ProfilingSummarizer(BaseSummarizer): - """A summarizer supporting both Pandas and Spark DataFrames.""" + """A summarizer for Pandas DataFrames.""" def __init__(self, typeset: VisionsTypeset, use_spark: bool = False): self.use_spark = use_spark and is_pyspark_installed() @@ -64,15 +65,7 @@ def summary_map(self) -> Dict[str, List[Callable]]: return self._summary_map def _create_summary_map(self) -> Dict[str, List[Callable]]: - """Creates the summary map based on the backend.""" - common_map = { - "URL": [describe_url_1d], - "Path": [describe_path_1d], - "File": [describe_file_1d], - "Image": [describe_image_1d], - "TimeSeries": [describe_timeseries_1d], - } - + """Creates the summary map for Pandas summarization.""" if self.use_spark: from ydata_profiling.model.spark import ( describe_boolean_1d_spark, @@ -85,7 +78,7 @@ def _create_summary_map(self) -> Dict[str, List[Callable]]: describe_text_1d_spark, ) - base_map = { + summary_map = { "Unsupported": [ describe_counts_spark, describe_generic_spark, @@ -96,9 +89,14 @@ def _create_summary_map(self) -> Dict[str, List[Callable]]: "Text": [describe_text_1d_spark], "Categorical": [describe_categorical_1d_spark], "Boolean": [describe_boolean_1d_spark], + "URL": [describe_url_1d], + "Path": [describe_path_1d], + "File": [describe_file_1d], + "Image": [describe_image_1d], + "TimeSeries": [describe_timeseries_1d], } else: - base_map = { + summary_map = { "Unsupported": [ pandas_describe_counts, pandas_describe_generic, @@ -109,10 +107,13 @@ def _create_summary_map(self) -> Dict[str, List[Callable]]: "Text": [pandas_describe_text_1d], "Categorical": [pandas_describe_categorical_1d], "Boolean": [pandas_describe_boolean_1d], + "URL": [pandas_describe_url_1d], + "Path": [pandas_describe_path_1d], + "File": [pandas_describe_file_1d], + "Image": [pandas_describe_image_1d], + "TimeSeries": [pandas_describe_timeseries_1d], } - - base_map.update(common_map) - return base_map + return summary_map def format_summary(summary: Union[BaseDescription, dict]) -> dict: diff --git a/src/ydata_profiling/profile_report.py b/src/ydata_profiling/profile_report.py index 916b4681e..a7d6d9134 100644 --- a/src/ydata_profiling/profile_report.py +++ b/src/ydata_profiling/profile_report.py @@ -25,7 +25,7 @@ from typeguard import typechecked from visions import VisionsTypeset -from ydata_profiling.config import Settings, SparkSettings +from ydata_profiling.config import Config, Settings, SparkSettings from ydata_profiling.expectations_report import ExpectationsReport from ydata_profiling.model import BaseDescription from ydata_profiling.model.alerts import AlertType @@ -132,11 +132,11 @@ def __init__( cfg = Settings() for condition, key in groups: if condition: - cfg = cfg.update(Settings.get_arg_groups(key)) + cfg = cfg.update(Config.get_arg_groups(key)) report_config = report_config.update(cfg.dict(exclude_defaults=True)) if len(kwargs) > 0: - shorthands, kwargs = Settings.shorthands(kwargs) + shorthands, kwargs = Config.shorthands(kwargs) report_config = report_config.update( Settings().update(shorthands).dict(exclude_defaults=True) ) From 8d8f6b71b5f46178749d0b100ba9bc8cefbfb261 Mon Sep 17 00:00:00 2001 From: Pkcha Date: Sun, 12 Apr 2026 17:39:22 +0800 Subject: [PATCH 3/5] feat: initial release --- src/ydata_profiling/model/handler.py | 123 +++++++++--------- .../report/structure/__init__.py | 8 ++ src/ydata_profiling/utils/backend.py | 2 +- 3 files changed, 69 insertions(+), 64 deletions(-) diff --git a/src/ydata_profiling/model/handler.py b/src/ydata_profiling/model/handler.py index bcca12a1c..aa36a811c 100644 --- a/src/ydata_profiling/model/handler.py +++ b/src/ydata_profiling/model/handler.py @@ -1,63 +1,60 @@ -""" - Auxiliary handler methods for data summary extraction -""" -from typing import Any, Callable, Dict, List, Sequence - -import networkx as nx -from visions import VisionsTypeset - - -def compose(functions: Sequence[Callable]) -> Callable: - """ - Compose a sequence of functions. - - :param functions: sequence of functions - :return: combined function applying all functions in order. - """ - - def composed_function(*args) -> List[Any]: - result = args # Start with the input arguments - for func in functions: - result = func(*result) if isinstance(result, tuple) else func(result) - return result # type: ignore - - return composed_function # type: ignore - - -class Handler: - """A generic handler - - Allows any custom mapping between data types and functions - """ - - def __init__( - self, - mapping: Dict[str, List[Callable]], - typeset: VisionsTypeset, - *args, - **kwargs - ): - self.mapping = mapping - self.typeset = typeset - self._complete_dag() - - def _complete_dag(self) -> None: - for from_type, to_type in nx.topological_sort( - nx.line_graph(self.typeset.base_graph) - ): - self.mapping[str(to_type)] = ( - self.mapping[str(from_type)] + self.mapping[str(to_type)] - ) - - def handle(self, dtype: str, *args, **kwargs) -> dict: - """ - Returns: - object: a tuple containing the config, the dataset series and the summary extracted - """ - funcs = self.mapping.get(dtype, []) - op = compose(funcs) - summary = op(*args)[-1] - return summary - - - +""" + Auxiliary handler methods for data summary extraction +""" +from typing import Any, Callable, Dict, List, Sequence + +import networkx as nx +from visions import VisionsTypeset + + +def compose(functions: Sequence[Callable]) -> Callable: + """ + Compose a sequence of functions. + + :param functions: sequence of functions + :return: combined function applying all functions in order. + """ + + def composed_function(*args) -> List[Any]: + result = args # Start with the input arguments + for func in functions: + result = func(*result) if isinstance(result, tuple) else func(result) + return result # type: ignore + + return composed_function # type: ignore + + +class Handler: + """A generic handler + + Allows any custom mapping between data types and functions + """ + + def __init__( + self, + mapping: Dict[str, List[Callable]], + typeset: VisionsTypeset, + *args, + **kwargs + ): + self.mapping = mapping + self.typeset = typeset + self._complete_dag() + + def _complete_dag(self) -> None: + for from_type, to_type in nx.topological_sort( + nx.line_graph(self.typeset.base_graph) + ): + self.mapping[str(to_type)] = ( + self.mapping[str(from_type)] + self.mapping[str(to_type)] + ) + + def handle(self, dtype: str, *args, **kwargs) -> dict: + """ + Returns: + object: a tuple containing the config, the dataset series and the summary extracted + """ + funcs = self.mapping.get(dtype, []) + op = compose(funcs) + summary = op(*args)[-1] + return summary diff --git a/src/ydata_profiling/report/structure/__init__.py b/src/ydata_profiling/report/structure/__init__.py index a2efd029a..7ba9c10c9 100644 --- a/src/ydata_profiling/report/structure/__init__.py +++ b/src/ydata_profiling/report/structure/__init__.py @@ -3,6 +3,14 @@ def get_render_map() -> Dict[str, Callable]: + """Get the mapping of variable types to their render functions. + + This function was moved from model.handler to report.structure to eliminate + the reverse dependency from model layer to report layer. + + Returns: + Dictionary mapping type names to render functions. + """ import ydata_profiling.report.structure.variables as render_algorithms render_map = { diff --git a/src/ydata_profiling/utils/backend.py b/src/ydata_profiling/utils/backend.py index e99d91c11..dd12f9fd3 100644 --- a/src/ydata_profiling/utils/backend.py +++ b/src/ydata_profiling/utils/backend.py @@ -1,5 +1,5 @@ """ - File with a function to check the backend being used +Backend detection utilities for pandas and spark. """ import importlib From 307cba98bfab9196a8de5355022f3919539e4520 Mon Sep 17 00:00:00 2001 From: Pkcha Date: Sun, 12 Apr 2026 18:09:37 +0800 Subject: [PATCH 4/5] feat: initial release --- src/ydata_profiling/model/handler.py | 123 +++++++++--------- src/ydata_profiling/model/summarizer.py | 5 +- .../report/structure/__init__.py | 30 ----- .../report/structure/report.py | 2 +- .../report/structure/variables/__init__.py | 23 ++++ src/ydata_profiling/utils/backend.py | 2 +- 6 files changed, 90 insertions(+), 95 deletions(-) diff --git a/src/ydata_profiling/model/handler.py b/src/ydata_profiling/model/handler.py index aa36a811c..bcca12a1c 100644 --- a/src/ydata_profiling/model/handler.py +++ b/src/ydata_profiling/model/handler.py @@ -1,60 +1,63 @@ -""" - Auxiliary handler methods for data summary extraction -""" -from typing import Any, Callable, Dict, List, Sequence - -import networkx as nx -from visions import VisionsTypeset - - -def compose(functions: Sequence[Callable]) -> Callable: - """ - Compose a sequence of functions. - - :param functions: sequence of functions - :return: combined function applying all functions in order. - """ - - def composed_function(*args) -> List[Any]: - result = args # Start with the input arguments - for func in functions: - result = func(*result) if isinstance(result, tuple) else func(result) - return result # type: ignore - - return composed_function # type: ignore - - -class Handler: - """A generic handler - - Allows any custom mapping between data types and functions - """ - - def __init__( - self, - mapping: Dict[str, List[Callable]], - typeset: VisionsTypeset, - *args, - **kwargs - ): - self.mapping = mapping - self.typeset = typeset - self._complete_dag() - - def _complete_dag(self) -> None: - for from_type, to_type in nx.topological_sort( - nx.line_graph(self.typeset.base_graph) - ): - self.mapping[str(to_type)] = ( - self.mapping[str(from_type)] + self.mapping[str(to_type)] - ) - - def handle(self, dtype: str, *args, **kwargs) -> dict: - """ - Returns: - object: a tuple containing the config, the dataset series and the summary extracted - """ - funcs = self.mapping.get(dtype, []) - op = compose(funcs) - summary = op(*args)[-1] - return summary +""" + Auxiliary handler methods for data summary extraction +""" +from typing import Any, Callable, Dict, List, Sequence + +import networkx as nx +from visions import VisionsTypeset + + +def compose(functions: Sequence[Callable]) -> Callable: + """ + Compose a sequence of functions. + + :param functions: sequence of functions + :return: combined function applying all functions in order. + """ + + def composed_function(*args) -> List[Any]: + result = args # Start with the input arguments + for func in functions: + result = func(*result) if isinstance(result, tuple) else func(result) + return result # type: ignore + + return composed_function # type: ignore + + +class Handler: + """A generic handler + + Allows any custom mapping between data types and functions + """ + + def __init__( + self, + mapping: Dict[str, List[Callable]], + typeset: VisionsTypeset, + *args, + **kwargs + ): + self.mapping = mapping + self.typeset = typeset + self._complete_dag() + + def _complete_dag(self) -> None: + for from_type, to_type in nx.topological_sort( + nx.line_graph(self.typeset.base_graph) + ): + self.mapping[str(to_type)] = ( + self.mapping[str(from_type)] + self.mapping[str(to_type)] + ) + + def handle(self, dtype: str, *args, **kwargs) -> dict: + """ + Returns: + object: a tuple containing the config, the dataset series and the summary extracted + """ + funcs = self.mapping.get(dtype, []) + op = compose(funcs) + summary = op(*args)[-1] + return summary + + + diff --git a/src/ydata_profiling/model/summarizer.py b/src/ydata_profiling/model/summarizer.py index d733a7d36..54d839915 100644 --- a/src/ydata_profiling/model/summarizer.py +++ b/src/ydata_profiling/model/summarizer.py @@ -27,7 +27,7 @@ from ydata_profiling.model.pandas.describe_supported_pandas import ( pandas_describe_supported, ) -from ydata_profiling.model.summary_algorithms import ( # Check what is this method used for +from ydata_profiling.model.summary_algorithms import ( describe_file_1d, describe_image_1d, describe_path_1d, @@ -50,9 +50,8 @@ def summarize( return self.handle(str(dtype), config, series, {"type": str(dtype)}) -# Revisit this with the correct support for Spark as well. class ProfilingSummarizer(BaseSummarizer): - """A summarizer for Pandas DataFrames.""" + """A summarizer supporting both Pandas and Spark DataFrames.""" def __init__(self, typeset: VisionsTypeset, use_spark: bool = False): self.use_spark = use_spark and is_pyspark_installed() diff --git a/src/ydata_profiling/report/structure/__init__.py b/src/ydata_profiling/report/structure/__init__.py index 7ba9c10c9..8324d248d 100644 --- a/src/ydata_profiling/report/structure/__init__.py +++ b/src/ydata_profiling/report/structure/__init__.py @@ -1,31 +1 @@ """Data structure for the report""" -from typing import Callable, Dict - - -def get_render_map() -> Dict[str, Callable]: - """Get the mapping of variable types to their render functions. - - This function was moved from model.handler to report.structure to eliminate - the reverse dependency from model layer to report layer. - - Returns: - Dictionary mapping type names to render functions. - """ - import ydata_profiling.report.structure.variables as render_algorithms - - render_map = { - "Boolean": render_algorithms.render_boolean, - "Numeric": render_algorithms.render_real, - "Complex": render_algorithms.render_complex, - "Text": render_algorithms.render_text, - "DateTime": render_algorithms.render_date, - "Categorical": render_algorithms.render_categorical, - "URL": render_algorithms.render_url, - "Path": render_algorithms.render_path, - "File": render_algorithms.render_file, - "Image": render_algorithms.render_image, - "Unsupported": render_algorithms.render_generic, - "TimeSeries": render_algorithms.render_timeseries, - } - - return render_map diff --git a/src/ydata_profiling/report/structure/report.py b/src/ydata_profiling/report/structure/report.py index b64a41aae..0f027f23f 100644 --- a/src/ydata_profiling/report/structure/report.py +++ b/src/ydata_profiling/report/structure/report.py @@ -7,7 +7,7 @@ from ydata_profiling.config import Settings from ydata_profiling.model import BaseDescription from ydata_profiling.model.alerts import AlertType -from ydata_profiling.report.structure import get_render_map +from ydata_profiling.report.structure.variables import get_render_map from ydata_profiling.report.presentation.core import ( HTML, Collapse, diff --git a/src/ydata_profiling/report/structure/variables/__init__.py b/src/ydata_profiling/report/structure/variables/__init__.py index 64f1d6d54..a8aa301b5 100644 --- a/src/ydata_profiling/report/structure/variables/__init__.py +++ b/src/ydata_profiling/report/structure/variables/__init__.py @@ -1,3 +1,5 @@ +from typing import Callable, Dict + from ydata_profiling.report.structure.variables.render_boolean import render_boolean from ydata_profiling.report.structure.variables.render_categorical import ( render_categorical, @@ -17,6 +19,26 @@ ) from ydata_profiling.report.structure.variables.render_url import render_url + +def get_render_map() -> Dict[str, Callable]: + render_map = { + "Boolean": render_boolean, + "Numeric": render_real, + "Complex": render_complex, + "Text": render_text, + "DateTime": render_date, + "Categorical": render_categorical, + "URL": render_url, + "Path": render_path, + "File": render_file, + "Image": render_image, + "Unsupported": render_generic, + "TimeSeries": render_timeseries, + } + + return render_map + + __all__ = [ "render_boolean", "render_categorical", @@ -32,4 +54,5 @@ "render_text", "render_timeseries", "render_url", + "get_render_map", ] diff --git a/src/ydata_profiling/utils/backend.py b/src/ydata_profiling/utils/backend.py index dd12f9fd3..e99d91c11 100644 --- a/src/ydata_profiling/utils/backend.py +++ b/src/ydata_profiling/utils/backend.py @@ -1,5 +1,5 @@ """ -Backend detection utilities for pandas and spark. + File with a function to check the backend being used """ import importlib From 1e2fa10eaf7a951acea663fa270784244ac18404 Mon Sep 17 00:00:00 2001 From: Pkcha Date: Sun, 12 Apr 2026 19:27:37 +0800 Subject: [PATCH 5/5] feat: initial release --- src/ydata_profiling/model/handler.py | 18 +++++++++++++++ src/ydata_profiling/model/summarizer.py | 5 ++-- .../report/structure/report.py | 2 +- .../report/structure/variables/__init__.py | 23 ------------------- 4 files changed, 22 insertions(+), 26 deletions(-) diff --git a/src/ydata_profiling/model/handler.py b/src/ydata_profiling/model/handler.py index bcca12a1c..992c1840c 100644 --- a/src/ydata_profiling/model/handler.py +++ b/src/ydata_profiling/model/handler.py @@ -60,4 +60,22 @@ def handle(self, dtype: str, *args, **kwargs) -> dict: return summary +def get_render_map() -> Dict[str, Callable]: + import ydata_profiling.report.structure.variables as render_algorithms + render_map = { + "Boolean": render_algorithms.render_boolean, + "Numeric": render_algorithms.render_real, + "Complex": render_algorithms.render_complex, + "Text": render_algorithms.render_text, + "DateTime": render_algorithms.render_date, + "Categorical": render_algorithms.render_categorical, + "URL": render_algorithms.render_url, + "Path": render_algorithms.render_path, + "File": render_algorithms.render_file, + "Image": render_algorithms.render_image, + "Unsupported": render_algorithms.render_generic, + "TimeSeries": render_algorithms.render_timeseries, + } + + return render_map diff --git a/src/ydata_profiling/model/summarizer.py b/src/ydata_profiling/model/summarizer.py index 54d839915..d733a7d36 100644 --- a/src/ydata_profiling/model/summarizer.py +++ b/src/ydata_profiling/model/summarizer.py @@ -27,7 +27,7 @@ from ydata_profiling.model.pandas.describe_supported_pandas import ( pandas_describe_supported, ) -from ydata_profiling.model.summary_algorithms import ( +from ydata_profiling.model.summary_algorithms import ( # Check what is this method used for describe_file_1d, describe_image_1d, describe_path_1d, @@ -50,8 +50,9 @@ def summarize( return self.handle(str(dtype), config, series, {"type": str(dtype)}) +# Revisit this with the correct support for Spark as well. class ProfilingSummarizer(BaseSummarizer): - """A summarizer supporting both Pandas and Spark DataFrames.""" + """A summarizer for Pandas DataFrames.""" def __init__(self, typeset: VisionsTypeset, use_spark: bool = False): self.use_spark = use_spark and is_pyspark_installed() diff --git a/src/ydata_profiling/report/structure/report.py b/src/ydata_profiling/report/structure/report.py index 0f027f23f..482b410b2 100644 --- a/src/ydata_profiling/report/structure/report.py +++ b/src/ydata_profiling/report/structure/report.py @@ -7,7 +7,7 @@ from ydata_profiling.config import Settings from ydata_profiling.model import BaseDescription from ydata_profiling.model.alerts import AlertType -from ydata_profiling.report.structure.variables import get_render_map +from ydata_profiling.model.handler import get_render_map from ydata_profiling.report.presentation.core import ( HTML, Collapse, diff --git a/src/ydata_profiling/report/structure/variables/__init__.py b/src/ydata_profiling/report/structure/variables/__init__.py index a8aa301b5..64f1d6d54 100644 --- a/src/ydata_profiling/report/structure/variables/__init__.py +++ b/src/ydata_profiling/report/structure/variables/__init__.py @@ -1,5 +1,3 @@ -from typing import Callable, Dict - from ydata_profiling.report.structure.variables.render_boolean import render_boolean from ydata_profiling.report.structure.variables.render_categorical import ( render_categorical, @@ -19,26 +17,6 @@ ) from ydata_profiling.report.structure.variables.render_url import render_url - -def get_render_map() -> Dict[str, Callable]: - render_map = { - "Boolean": render_boolean, - "Numeric": render_real, - "Complex": render_complex, - "Text": render_text, - "DateTime": render_date, - "Categorical": render_categorical, - "URL": render_url, - "Path": render_path, - "File": render_file, - "Image": render_image, - "Unsupported": render_generic, - "TimeSeries": render_timeseries, - } - - return render_map - - __all__ = [ "render_boolean", "render_categorical", @@ -54,5 +32,4 @@ def get_render_map() -> Dict[str, Callable]: "render_text", "render_timeseries", "render_url", - "get_render_map", ]