From c39e0b3407a4cf6dccc7c28314733b3191388ab7 Mon Sep 17 00:00:00 2001 From: Pkcha Date: Sun, 12 Apr 2026 15:59:52 +0800 Subject: [PATCH] feat: initial release --- Dockerfile | 21 +++ src/ydata_profiling/config.py | 142 ++++++++---------- src/ydata_profiling/model/handler.py | 20 +-- src/ydata_profiling/model/summarizer.py | 31 ++-- src/ydata_profiling/profile_report.py | 6 +- .../report/structure/__init__.py | 22 +++ .../report/structure/report.py | 2 +- 7 files changed, 130 insertions(+), 114 deletions(-) create mode 100644 Dockerfile diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 000000000..7bb15bf5d --- /dev/null +++ b/Dockerfile @@ -0,0 +1,21 @@ +FROM python:3.10-slim + +WORKDIR /app + +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + && rm -rf /var/lib/apt/lists/* + +COPY . . + +RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir "setuptools>=72.0.0,<80.0.0" wheel && \ + pip install --no-cache-dir . && \ + pip install --no-cache-dir "setuptools>=72.0.0,<80.0.0" && \ + pip install --no-cache-dir jupyter + +EXPOSE 8888 + +CMD ["jupyter", "notebook", "--ip=0.0.0.0", "--port=8888", "--no-browser", "--allow-root"] + + diff --git a/src/ydata_profiling/config.py b/src/ydata_profiling/config.py index 09dbecdde..2bb934ed1 100644 --- a/src/ydata_profiling/config.py +++ b/src/ydata_profiling/config.py @@ -6,24 +6,7 @@ import yaml from pydantic.v1 import BaseModel, BaseSettings, Field, PrivateAttr - -def _merge_dictionaries(dict1: dict, dict2: dict) -> dict: - """ - Recursive merge dictionaries. - - :param dict1: Base dictionary to merge. - :param dict2: Dictionary to merge on top of base dictionary. - :return: Merged dictionary - """ - for key, val in dict1.items(): - if isinstance(val, dict): - dict2_node = dict2.setdefault(key, {}) - _merge_dictionaries(val, dict2_node) - else: - if key not in dict2: - dict2[key] = val - - return dict2 +from ydata_profiling.utils.common import update class Dataset(BaseModel): @@ -355,60 +338,7 @@ class Config: html: Html = Html() notebook: Notebook = Notebook() - def update(self, updates: dict) -> "Settings": - update = _merge_dictionaries(self.dict(), updates) - return self.parse_obj(self.copy(update=update)) - - @staticmethod - def from_file(config_file: Union[Path, str]) -> "Settings": - """Create a Settings object from a yaml file. - - Args: - config_file: yaml file path - Returns: - Settings - """ - with open(config_file) as f: - data = yaml.safe_load(f) - - return Settings.parse_obj(data) - - -class SparkSettings(Settings): - """ - Setting class with the standard report configuration for Spark DataFrames - All the supported analysis are set to true - """ - - vars: Univariate = Univariate() - - vars.num.low_categorical_threshold = 0 - - infer_dtypes: bool = False - - correlations: Dict[str, Correlation] = { - "spearman": Correlation(key="spearman", calculate=True), - "pearson": Correlation(key="pearson", calculate=True), - } - - correlation_table: bool = True - - interactions: Interactions = Interactions() - interactions.continuous = False - - missing_diagrams: Dict[str, bool] = { - "bar": False, - "matrix": False, - "dendrogram": False, - "heatmap": False, - } - samples: Samples = Samples() - samples.tail = 0 - samples.random = 0 - - -class Config: - arg_groups: Dict[str, Any] = { + _arg_groups: Dict[str, Any] = { "sensitive": { "samples": None, "duplicates": None, @@ -475,8 +405,8 @@ class Config: @staticmethod def get_arg_groups(key: str) -> dict: - kwargs = Config.arg_groups[key] - shorthand_args, _ = Config.shorthands(kwargs, split=False) + kwargs = Settings._arg_groups[key] + shorthand_args, _ = Settings.shorthands(kwargs, split=False) return shorthand_args @staticmethod @@ -485,8 +415,8 @@ def shorthands(kwargs: dict, split: bool = True) -> Tuple[dict, dict]: if not split: shorthand_args = kwargs for key, value in list(kwargs.items()): - if value is None and key in Config._shorthands: - shorthand_args[key] = Config._shorthands[key] + if value is None and key in Settings._shorthands: + shorthand_args[key] = Settings._shorthands[key] if split: del kwargs[key] @@ -494,3 +424,63 @@ def shorthands(kwargs: dict, split: bool = True) -> Tuple[dict, dict]: return shorthand_args, kwargs else: return shorthand_args, {} + + def update(self, updates: dict) -> "Settings": + merged = update(self.dict().copy(), updates) + return self.parse_obj(self.copy(update=merged)) + + @staticmethod + def from_file(config_file: Union[Path, str]) -> "Settings": + """Create a Settings object from a yaml file. + + Args: + config_file: yaml file path + Returns: + Settings + """ + with open(config_file) as f: + data = yaml.safe_load(f) + + return Settings.parse_obj(data) + + +class SparkSettings(Settings): + """ + Setting class with the standard report configuration for Spark DataFrames + All the supported analysis are set to true + """ + + vars: Univariate = Univariate() + + vars.num.low_categorical_threshold = 0 + + infer_dtypes: bool = False + + correlations: Dict[str, Correlation] = { + "spearman": Correlation(key="spearman", calculate=True), + "pearson": Correlation(key="pearson", calculate=True), + } + + correlation_table: bool = True + + interactions: Interactions = Interactions() + interactions.continuous = False + + missing_diagrams: Dict[str, bool] = { + "bar": False, + "matrix": False, + "dendrogram": False, + "heatmap": False, + } + samples: Samples = Samples() + samples.tail = 0 + samples.random = 0 + + +class Config(Settings): + """ + Deprecated: Use Settings instead. + Backward compatibility alias for Settings class. + """ + + pass diff --git a/src/ydata_profiling/model/handler.py b/src/ydata_profiling/model/handler.py index 992c1840c..e983ce2a1 100644 --- a/src/ydata_profiling/model/handler.py +++ b/src/ydata_profiling/model/handler.py @@ -60,22 +60,6 @@ def handle(self, dtype: str, *args, **kwargs) -> dict: return summary -def get_render_map() -> Dict[str, Callable]: - import ydata_profiling.report.structure.variables as render_algorithms +from ydata_profiling.report.structure import get_render_map - render_map = { - "Boolean": render_algorithms.render_boolean, - "Numeric": render_algorithms.render_real, - "Complex": render_algorithms.render_complex, - "Text": render_algorithms.render_text, - "DateTime": render_algorithms.render_date, - "Categorical": render_algorithms.render_categorical, - "URL": render_algorithms.render_url, - "Path": render_algorithms.render_path, - "File": render_algorithms.render_file, - "Image": render_algorithms.render_image, - "Unsupported": render_algorithms.render_generic, - "TimeSeries": render_algorithms.render_timeseries, - } - - return render_map +__all__ = ["compose", "Handler", "get_render_map"] diff --git a/src/ydata_profiling/model/summarizer.py b/src/ydata_profiling/model/summarizer.py index d733a7d36..a57ed1c97 100644 --- a/src/ydata_profiling/model/summarizer.py +++ b/src/ydata_profiling/model/summarizer.py @@ -50,9 +50,8 @@ def summarize( return self.handle(str(dtype), config, series, {"type": str(dtype)}) -# Revisit this with the correct support for Spark as well. class ProfilingSummarizer(BaseSummarizer): - """A summarizer for Pandas DataFrames.""" + """A summarizer supporting both Pandas and Spark DataFrames.""" def __init__(self, typeset: VisionsTypeset, use_spark: bool = False): self.use_spark = use_spark and is_pyspark_installed() @@ -65,7 +64,15 @@ def summary_map(self) -> Dict[str, List[Callable]]: return self._summary_map def _create_summary_map(self) -> Dict[str, List[Callable]]: - """Creates the summary map for Pandas summarization.""" + """Creates the summary map based on the backend.""" + common_map = { + "URL": [describe_url_1d], + "Path": [describe_path_1d], + "File": [describe_file_1d], + "Image": [describe_image_1d], + "TimeSeries": [describe_timeseries_1d], + } + if self.use_spark: from ydata_profiling.model.spark import ( describe_boolean_1d_spark, @@ -78,7 +85,7 @@ def _create_summary_map(self) -> Dict[str, List[Callable]]: describe_text_1d_spark, ) - summary_map = { + base_map = { "Unsupported": [ describe_counts_spark, describe_generic_spark, @@ -89,14 +96,9 @@ def _create_summary_map(self) -> Dict[str, List[Callable]]: "Text": [describe_text_1d_spark], "Categorical": [describe_categorical_1d_spark], "Boolean": [describe_boolean_1d_spark], - "URL": [describe_url_1d], - "Path": [describe_path_1d], - "File": [describe_file_1d], - "Image": [describe_image_1d], - "TimeSeries": [describe_timeseries_1d], } else: - summary_map = { + base_map = { "Unsupported": [ pandas_describe_counts, pandas_describe_generic, @@ -107,13 +109,10 @@ def _create_summary_map(self) -> Dict[str, List[Callable]]: "Text": [pandas_describe_text_1d], "Categorical": [pandas_describe_categorical_1d], "Boolean": [pandas_describe_boolean_1d], - "URL": [pandas_describe_url_1d], - "Path": [pandas_describe_path_1d], - "File": [pandas_describe_file_1d], - "Image": [pandas_describe_image_1d], - "TimeSeries": [pandas_describe_timeseries_1d], } - return summary_map + + base_map.update(common_map) + return base_map def format_summary(summary: Union[BaseDescription, dict]) -> dict: diff --git a/src/ydata_profiling/profile_report.py b/src/ydata_profiling/profile_report.py index a7d6d9134..916b4681e 100644 --- a/src/ydata_profiling/profile_report.py +++ b/src/ydata_profiling/profile_report.py @@ -25,7 +25,7 @@ from typeguard import typechecked from visions import VisionsTypeset -from ydata_profiling.config import Config, Settings, SparkSettings +from ydata_profiling.config import Settings, SparkSettings from ydata_profiling.expectations_report import ExpectationsReport from ydata_profiling.model import BaseDescription from ydata_profiling.model.alerts import AlertType @@ -132,11 +132,11 @@ def __init__( cfg = Settings() for condition, key in groups: if condition: - cfg = cfg.update(Config.get_arg_groups(key)) + cfg = cfg.update(Settings.get_arg_groups(key)) report_config = report_config.update(cfg.dict(exclude_defaults=True)) if len(kwargs) > 0: - shorthands, kwargs = Config.shorthands(kwargs) + shorthands, kwargs = Settings.shorthands(kwargs) report_config = report_config.update( Settings().update(shorthands).dict(exclude_defaults=True) ) diff --git a/src/ydata_profiling/report/structure/__init__.py b/src/ydata_profiling/report/structure/__init__.py index 8324d248d..a2efd029a 100644 --- a/src/ydata_profiling/report/structure/__init__.py +++ b/src/ydata_profiling/report/structure/__init__.py @@ -1 +1,23 @@ """Data structure for the report""" +from typing import Callable, Dict + + +def get_render_map() -> Dict[str, Callable]: + import ydata_profiling.report.structure.variables as render_algorithms + + render_map = { + "Boolean": render_algorithms.render_boolean, + "Numeric": render_algorithms.render_real, + "Complex": render_algorithms.render_complex, + "Text": render_algorithms.render_text, + "DateTime": render_algorithms.render_date, + "Categorical": render_algorithms.render_categorical, + "URL": render_algorithms.render_url, + "Path": render_algorithms.render_path, + "File": render_algorithms.render_file, + "Image": render_algorithms.render_image, + "Unsupported": render_algorithms.render_generic, + "TimeSeries": render_algorithms.render_timeseries, + } + + return render_map diff --git a/src/ydata_profiling/report/structure/report.py b/src/ydata_profiling/report/structure/report.py index 482b410b2..b64a41aae 100644 --- a/src/ydata_profiling/report/structure/report.py +++ b/src/ydata_profiling/report/structure/report.py @@ -7,7 +7,7 @@ from ydata_profiling.config import Settings from ydata_profiling.model import BaseDescription from ydata_profiling.model.alerts import AlertType -from ydata_profiling.model.handler import get_render_map +from ydata_profiling.report.structure import get_render_map from ydata_profiling.report.presentation.core import ( HTML, Collapse,