From c39e0b3407a4cf6dccc7c28314733b3191388ab7 Mon Sep 17 00:00:00 2001
From: Pkcha <pkcha@PkchadeMacBook-Air.local>
Date: Sun, 12 Apr 2026 15:59:52 +0800
Subject: [PATCH] refactor: merge Config into Settings, move get_render_map, share summary map entries; add Dockerfile

---
 Dockerfile                                    |  21 +++
 src/ydata_profiling/config.py                 | 142 ++++++++----------
 src/ydata_profiling/model/handler.py          |  20 +--
 src/ydata_profiling/model/summarizer.py       |  31 ++--
 src/ydata_profiling/profile_report.py         |   6 +-
 .../report/structure/__init__.py              |  22 +++
 .../report/structure/report.py                |   2 +-
 7 files changed, 130 insertions(+), 114 deletions(-)
 create mode 100644 Dockerfile

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 000000000..7bb15bf5d
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,28 @@
+# Builds an image with this checkout installed via pip and Jupyter Notebook as the default command.
+FROM python:3.10-slim
+
+WORKDIR /app
+
+# build-essential is installed for packages that compile at install time
+# (presumably some dependencies lack wheels for this base image -- confirm).
+# The apt lists are removed in the same layer so they are not kept in the image.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    && rm -rf /var/lib/apt/lists/*
+
+COPY . .
+
+# setuptools is pinned before installing the project and pinned again afterwards
+# (presumably because installing the project's dependencies can move it outside
+# the range -- confirm before removing the second pin).
+RUN pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir "setuptools>=72.0.0,<80.0.0" wheel && \
+    pip install --no-cache-dir . && \
+    pip install --no-cache-dir "setuptools>=72.0.0,<80.0.0" && \
+    pip install --no-cache-dir jupyter
+
+EXPOSE 8888
+
+# NOTE(review): the server listens on all interfaces and, with --allow-root and
+# no USER instruction, runs as root; publish the port only to trusted hosts.
+CMD ["jupyter", "notebook", "--ip=0.0.0.0", "--port=8888", "--no-browser", "--allow-root"]
diff --git a/src/ydata_profiling/config.py b/src/ydata_profiling/config.py
index 09dbecdde..2bb934ed1 100644
--- a/src/ydata_profiling/config.py
+++ b/src/ydata_profiling/config.py
@@ -6,24 +6,7 @@
 import yaml
 from pydantic.v1 import BaseModel, BaseSettings, Field, PrivateAttr
 
-
-def _merge_dictionaries(dict1: dict, dict2: dict) -> dict:
-    """
-    Recursive merge dictionaries.
-
-    :param dict1: Base dictionary to merge.
-    :param dict2: Dictionary to merge on top of base dictionary.
-    :return: Merged dictionary
-    """
-    for key, val in dict1.items():
-        if isinstance(val, dict):
-            dict2_node = dict2.setdefault(key, {})
-            _merge_dictionaries(val, dict2_node)
-        else:
-            if key not in dict2:
-                dict2[key] = val
-
-    return dict2
+from ydata_profiling.utils.common import update
 
 
 class Dataset(BaseModel):
@@ -355,60 +338,7 @@ class Config:
     html: Html = Html()
     notebook: Notebook = Notebook()
 
-    def update(self, updates: dict) -> "Settings":
-        update = _merge_dictionaries(self.dict(), updates)
-        return self.parse_obj(self.copy(update=update))
-
-    @staticmethod
-    def from_file(config_file: Union[Path, str]) -> "Settings":
-        """Create a Settings object from a yaml file.
-
-        Args:
-            config_file: yaml file path
-        Returns:
-            Settings
-        """
-        with open(config_file) as f:
-            data = yaml.safe_load(f)
-
-        return Settings.parse_obj(data)
-
-
-class SparkSettings(Settings):
-    """
-    Setting class with the standard report configuration for Spark DataFrames
-    All the supported analysis are set to true
-    """
-
-    vars: Univariate = Univariate()
-
-    vars.num.low_categorical_threshold = 0
-
-    infer_dtypes: bool = False
-
-    correlations: Dict[str, Correlation] = {
-        "spearman": Correlation(key="spearman", calculate=True),
-        "pearson": Correlation(key="pearson", calculate=True),
-    }
-
-    correlation_table: bool = True
-
-    interactions: Interactions = Interactions()
-    interactions.continuous = False
-
-    missing_diagrams: Dict[str, bool] = {
-        "bar": False,
-        "matrix": False,
-        "dendrogram": False,
-        "heatmap": False,
-    }
-    samples: Samples = Samples()
-    samples.tail = 0
-    samples.random = 0
-
-
-class Config:
-    arg_groups: Dict[str, Any] = {
+    _arg_groups: Dict[str, Any] = {
         "sensitive": {
             "samples": None,
             "duplicates": None,
@@ -475,8 +405,8 @@ class Config:
 
     @staticmethod
     def get_arg_groups(key: str) -> dict:
-        kwargs = Config.arg_groups[key]
-        shorthand_args, _ = Config.shorthands(kwargs, split=False)
+        kwargs = Settings._arg_groups[key]
+        shorthand_args, _ = Settings.shorthands(kwargs, split=False)
         return shorthand_args
 
     @staticmethod
@@ -485,8 +415,8 @@ def shorthands(kwargs: dict, split: bool = True) -> Tuple[dict, dict]:
         if not split:
             shorthand_args = kwargs
         for key, value in list(kwargs.items()):
-            if value is None and key in Config._shorthands:
-                shorthand_args[key] = Config._shorthands[key]
+            if value is None and key in Settings._shorthands:
+                shorthand_args[key] = Settings._shorthands[key]
                 if split:
                     del kwargs[key]
 
@@ -494,3 +424,77 @@ def shorthands(kwargs: dict, split: bool = True) -> Tuple[dict, dict]:
             return shorthand_args, kwargs
         else:
             return shorthand_args, {}
+
+    def update(self, updates: dict) -> "Settings":
+        """Return new settings with ``updates`` merged over the current values.
+
+        Args:
+            updates: values to merge over the current settings
+        Returns:
+            Settings parsed from the merged values
+        """
+        # Inside this method body ``update`` resolves to the module-level helper
+        # imported from ydata_profiling.utils.common, not to this method.
+        merged = update(self.dict().copy(), updates)
+        return self.parse_obj(self.copy(update=merged))
+
+    @staticmethod
+    def from_file(config_file: Union[Path, str]) -> "Settings":
+        """Create a Settings object from a yaml file.
+
+        Args:
+            config_file: yaml file path
+        Returns:
+            Settings
+        """
+        with open(config_file) as f:
+            data = yaml.safe_load(f)
+
+        return Settings.parse_obj(data)
+
+
+class SparkSettings(Settings):
+    """
+    Setting class with the standard report configuration for Spark DataFrames
+    All the supported analysis are set to true
+    """
+
+    # The bare attribute assignments below run once, when this class body is
+    # executed, and modify the default instances declared just above them.
+    vars: Univariate = Univariate()
+
+    vars.num.low_categorical_threshold = 0
+
+    infer_dtypes: bool = False
+
+    correlations: Dict[str, Correlation] = {
+        "spearman": Correlation(key="spearman", calculate=True),
+        "pearson": Correlation(key="pearson", calculate=True),
+    }
+
+    correlation_table: bool = True
+
+    interactions: Interactions = Interactions()
+    interactions.continuous = False
+
+    missing_diagrams: Dict[str, bool] = {
+        "bar": False,
+        "matrix": False,
+        "dendrogram": False,
+        "heatmap": False,
+    }
+    samples: Samples = Samples()
+    samples.tail = 0
+    samples.random = 0
+
+
+class Config(Settings):
+    """
+    Deprecated: Use Settings instead.
+    Backward compatibility alias for Settings class.
+    """
+
+
+# ``arg_groups`` was a public attribute of the former ``Config`` class; keep that
+# name reachable on the alias now that the data lives in ``Settings._arg_groups``.
+Config.arg_groups = Settings._arg_groups  # type: ignore[attr-defined]
diff --git a/src/ydata_profiling/model/handler.py b/src/ydata_profiling/model/handler.py
index 992c1840c..e983ce2a1 100644
--- a/src/ydata_profiling/model/handler.py
+++ b/src/ydata_profiling/model/handler.py
@@ -60,22 +60,16 @@ def handle(self, dtype: str, *args, **kwargs) -> dict:
         return summary
 
 
 def get_render_map() -> Dict[str, Callable]:
-    import ydata_profiling.report.structure.variables as render_algorithms
+    """Return the render map defined in ``ydata_profiling.report.structure``.
+
+    Kept here so ``ydata_profiling.model.handler.get_render_map`` stays importable.
+    """
+    # Imported inside the function, as the previous implementation did for its
+    # report import, so loading this module does not load the report package.
+    from ydata_profiling.report.structure import get_render_map as _get_render_map
 
-    render_map = {
-        "Boolean": render_algorithms.render_boolean,
-        "Numeric": render_algorithms.render_real,
-        "Complex": render_algorithms.render_complex,
-        "Text": render_algorithms.render_text,
-        "DateTime": render_algorithms.render_date,
-        "Categorical": render_algorithms.render_categorical,
-        "URL": render_algorithms.render_url,
-        "Path": render_algorithms.render_path,
-        "File": render_algorithms.render_file,
-        "Image": render_algorithms.render_image,
-        "Unsupported": render_algorithms.render_generic,
-        "TimeSeries": render_algorithms.render_timeseries,
-    }
-
-    return render_map
+    return _get_render_map()
+
+
+__all__ = ["compose", "Handler", "get_render_map"]
diff --git a/src/ydata_profiling/model/summarizer.py b/src/ydata_profiling/model/summarizer.py
index d733a7d36..a57ed1c97 100644
--- a/src/ydata_profiling/model/summarizer.py
+++ b/src/ydata_profiling/model/summarizer.py
@@ -50,9 +50,8 @@ def summarize(
         return self.handle(str(dtype), config, series, {"type": str(dtype)})
 
 
-# Revisit this with the correct support for Spark as well.
 class ProfilingSummarizer(BaseSummarizer):
-    """A summarizer for Pandas DataFrames."""
+    """A summarizer supporting both Pandas and Spark DataFrames."""
 
     def __init__(self, typeset: VisionsTypeset, use_spark: bool = False):
         self.use_spark = use_spark and is_pyspark_installed()
@@ -65,7 +64,15 @@ def summary_map(self) -> Dict[str, List[Callable]]:
         return self._summary_map
 
     def _create_summary_map(self) -> Dict[str, List[Callable]]:
-        """Creates the summary map for Pandas summarization."""
+        """Creates the summary map based on the backend."""
+        common_map = {
+            "URL": [describe_url_1d],
+            "Path": [describe_path_1d],
+            "File": [describe_file_1d],
+            "Image": [describe_image_1d],
+            "TimeSeries": [describe_timeseries_1d],
+        }
+
         if self.use_spark:
             from ydata_profiling.model.spark import (
                 describe_boolean_1d_spark,
@@ -78,7 +85,7 @@ def _create_summary_map(self) -> Dict[str, List[Callable]]:
                 describe_text_1d_spark,
             )
 
-            summary_map = {
+            base_map = {
                 "Unsupported": [
                     describe_counts_spark,
                     describe_generic_spark,
@@ -89,14 +96,9 @@ def _create_summary_map(self) -> Dict[str, List[Callable]]:
                 "Text": [describe_text_1d_spark],
                 "Categorical": [describe_categorical_1d_spark],
                 "Boolean": [describe_boolean_1d_spark],
-                "URL": [describe_url_1d],
-                "Path": [describe_path_1d],
-                "File": [describe_file_1d],
-                "Image": [describe_image_1d],
-                "TimeSeries": [describe_timeseries_1d],
             }
         else:
-            summary_map = {
+            base_map = {
                 "Unsupported": [
                     pandas_describe_counts,
                     pandas_describe_generic,
@@ -107,13 +109,19 @@ def _create_summary_map(self) -> Dict[str, List[Callable]]:
                 "Text": [pandas_describe_text_1d],
                 "Categorical": [pandas_describe_categorical_1d],
                 "Boolean": [pandas_describe_boolean_1d],
                 "URL": [pandas_describe_url_1d],
                 "Path": [pandas_describe_path_1d],
                 "File": [pandas_describe_file_1d],
                 "Image": [pandas_describe_image_1d],
                 "TimeSeries": [pandas_describe_timeseries_1d],
             }
-        return summary_map
+
+        # Only fill in the types the backend map does not define itself: the
+        # pandas map keeps its pandas_describe_* entries, while the Spark map
+        # picks up the shared describe_* ones.
+        for type_name, functions in common_map.items():
+            base_map.setdefault(type_name, functions)
+        return base_map
 
 
 def format_summary(summary: Union[BaseDescription, dict]) -> dict:
diff --git a/src/ydata_profiling/profile_report.py b/src/ydata_profiling/profile_report.py
index a7d6d9134..916b4681e 100644
--- a/src/ydata_profiling/profile_report.py
+++ b/src/ydata_profiling/profile_report.py
@@ -25,7 +25,7 @@
 from typeguard import typechecked
 from visions import VisionsTypeset
 
-from ydata_profiling.config import Config, Settings, SparkSettings
+from ydata_profiling.config import Settings, SparkSettings
 from ydata_profiling.expectations_report import ExpectationsReport
 from ydata_profiling.model import BaseDescription
 from ydata_profiling.model.alerts import AlertType
@@ -132,11 +132,11 @@ def __init__(
             cfg = Settings()
             for condition, key in groups:
                 if condition:
-                    cfg = cfg.update(Config.get_arg_groups(key))
+                    cfg = cfg.update(Settings.get_arg_groups(key))
             report_config = report_config.update(cfg.dict(exclude_defaults=True))
 
         if len(kwargs) > 0:
-            shorthands, kwargs = Config.shorthands(kwargs)
+            shorthands, kwargs = Settings.shorthands(kwargs)
             report_config = report_config.update(
                 Settings().update(shorthands).dict(exclude_defaults=True)
             )
diff --git a/src/ydata_profiling/report/structure/__init__.py b/src/ydata_profiling/report/structure/__init__.py
index 8324d248d..a2efd029a 100644
--- a/src/ydata_profiling/report/structure/__init__.py
+++ b/src/ydata_profiling/report/structure/__init__.py
@@ -1 +1,24 @@
 """Data structure for the report"""
+from typing import Callable, Dict
+
+
+def get_render_map() -> Dict[str, Callable]:
+    """Return a mapping from variable type name to its render function."""
+    # The variables module is imported when the function is called, not when
+    # this package is imported.
+    import ydata_profiling.report.structure.variables as renderers
+
+    return {
+        "Boolean": renderers.render_boolean,
+        "Numeric": renderers.render_real,
+        "Complex": renderers.render_complex,
+        "Text": renderers.render_text,
+        "DateTime": renderers.render_date,
+        "Categorical": renderers.render_categorical,
+        "URL": renderers.render_url,
+        "Path": renderers.render_path,
+        "File": renderers.render_file,
+        "Image": renderers.render_image,
+        "Unsupported": renderers.render_generic,
+        "TimeSeries": renderers.render_timeseries,
+    }
diff --git a/src/ydata_profiling/report/structure/report.py b/src/ydata_profiling/report/structure/report.py
index 482b410b2..b64a41aae 100644
--- a/src/ydata_profiling/report/structure/report.py
+++ b/src/ydata_profiling/report/structure/report.py
@@ -7,7 +7,7 @@
 from ydata_profiling.config import Settings
 from ydata_profiling.model import BaseDescription
 from ydata_profiling.model.alerts import AlertType
-from ydata_profiling.model.handler import get_render_map
+from ydata_profiling.report.structure import get_render_map
 from ydata_profiling.report.presentation.core import (
     HTML,
     Collapse,