From 7775fd0f4db3102b1c02d96db697935702ce71f8 Mon Sep 17 00:00:00 2001 From: Pkcha Date: Sat, 11 Apr 2026 18:36:51 +0800 Subject: [PATCH 1/2] feat: initial release --- src/ydata_profiling/config.py | 40 +++- src/ydata_profiling/model/describe.py | 52 +++-- src/ydata_profiling/model/handler.py | 41 ++-- src/ydata_profiling/model/summarizer.py | 192 ++++++++++-------- .../report/structure/report.py | 27 ++- venv/Dockerfile | 19 ++ 6 files changed, 237 insertions(+), 134 deletions(-) create mode 100644 venv/Dockerfile diff --git a/src/ydata_profiling/config.py b/src/ydata_profiling/config.py index 09dbecdde..ed45110d0 100644 --- a/src/ydata_profiling/config.py +++ b/src/ydata_profiling/config.py @@ -407,8 +407,15 @@ class SparkSettings(Settings): samples.random = 0 -class Config: - arg_groups: Dict[str, Any] = { +class _Config: + """Container for configuration presets and shorthand mappings. + + This class provides predefined configuration groups (sensitive, explorative, themes) + and shorthand mappings for common configuration options. It should be used only + through its static methods. + """ + + arg_groups = { "sensitive": { "samples": None, "duplicates": None, @@ -475,18 +482,36 @@ class Config: @staticmethod def get_arg_groups(key: str) -> dict: - kwargs = Config.arg_groups[key] - shorthand_args, _ = Config.shorthands(kwargs, split=False) + """Get expanded configuration for a preset group. + + Args: + key: Name of preset group (e.g., "sensitive", "explorative") + + Returns: + Expanded configuration dictionary with shorthands resolved + """ + kwargs = _Config.arg_groups[key] + shorthand_args, _ = _Config.shorthands(kwargs, split=False) return shorthand_args @staticmethod def shorthands(kwargs: dict, split: bool = True) -> Tuple[dict, dict]: + """Expand shorthand configuration keys. + + Args: + kwargs: Configuration dictionary potentially containing shorthands + split: If True, remove shorthands from kwargs and return separately. 
+ If False, expand shorthands in-place within kwargs. + + Returns: + Tuple of (shorthand_args, remaining_kwargs) + """ shorthand_args = {} if not split: shorthand_args = kwargs for key, value in list(kwargs.items()): - if value is None and key in Config._shorthands: - shorthand_args[key] = Config._shorthands[key] + if value is None and key in _Config._shorthands: + shorthand_args[key] = _Config._shorthands[key] if split: del kwargs[key] @@ -494,3 +519,6 @@ def shorthands(kwargs: dict, split: bool = True) -> Tuple[dict, dict]: return shorthand_args, kwargs else: return shorthand_args, {} + + +Config = _Config diff --git a/src/ydata_profiling/model/describe.py b/src/ydata_profiling/model/describe.py index 74bdf924a..1aac3747f 100644 --- a/src/ydata_profiling/model/describe.py +++ b/src/ydata_profiling/model/describe.py @@ -27,6 +27,37 @@ from ydata_profiling.version import __version__ +def _validate_inputs( + config: Settings, df: Union[pd.DataFrame, "pyspark.sql.DataFrame"] # type: ignore[name-defined] # noqa: F821 +) -> None: + """Validate input types for profiling. + + Args: + config: Report configuration settings + df: DataFrame to profile + + Raises: + TypeError: If inputs are of incorrect type + """ + if not isinstance(config, Settings): + raise TypeError(f"`config` must be of type `Settings`, got {type(config)}") + + if isinstance(df, pd.DataFrame): + return + + try: + from pyspark.sql import DataFrame as SparkDataFrame + if isinstance(df, SparkDataFrame): + return + except ImportError: + pass + + raise TypeError( + f"`df` must be either a `pandas.DataFrame` or a `pyspark.sql.DataFrame`, but got {type(df)}." + f"If using Spark, make sure PySpark is installed." + ) + + def describe( config: Settings, df: Union[pd.DataFrame, "pyspark.sql.DataFrame"], # type: ignore[name-defined] # noqa: F821 @@ -52,26 +83,7 @@ def describe( - alerts: direct special attention to these patterns in your data. - package: package details. 
""" - # ** Validate Input types ** - if not isinstance(config, Settings): - raise TypeError(f"`config` must be of type `Settings`, got {type(config)}") - - # Validate df input type - - if not isinstance(df, pd.DataFrame): - try: - from pyspark.sql import DataFrame as SparkDataFrame # type: ignore - - if not isinstance(df, SparkDataFrame): # noqa: TC301 - raise TypeError( # noqa: TC301 - f"`df` must be either a `pandas.DataFrame` or a `pyspark.sql.DataFrame`, but got {type(df)}." - ) - except ImportError as ex: - raise TypeError( - f"`df must be either a `pandas.DataFrame` or a `pyspark.sql.DataFrame`, but got {type(df)}." - f"If using Spark, make sure PySpark is installed." - ) from ex - + _validate_inputs(config, df) df = preprocess(config, df) number_of_tasks = 5 diff --git a/src/ydata_profiling/model/handler.py b/src/ydata_profiling/model/handler.py index 992c1840c..91d135973 100644 --- a/src/ydata_profiling/model/handler.py +++ b/src/ydata_profiling/model/handler.py @@ -25,9 +25,10 @@ def composed_function(*args) -> List[Any]: class Handler: - """A generic handler + """Generic handler for data type specific processing pipelines. - Allows any custom mapping between data types and functions + Builds a processing pipeline for each data type by composing functions + along the type hierarchy. Allows custom summarization strategies. """ def __init__( @@ -42,6 +43,11 @@ def __init__( self._complete_dag() def _complete_dag(self) -> None: + """Propagate functions along the type hierarchy DAG. + + Functions defined for parent types are inherited by subtypes, + creating a complete processing pipeline for each type. + """ for from_type, to_type in nx.topological_sort( nx.line_graph(self.typeset.base_graph) ): @@ -50,32 +56,17 @@ def _complete_dag(self) -> None: ) def handle(self, dtype: str, *args, **kwargs) -> dict: - """ + """Execute the processing pipeline for a given data type. 
+ + Args: + dtype: Name of the data type to process + *args: Arguments passed to the processing pipeline + **kwargs: Additional keyword arguments + Returns: - object: a tuple containing the config, the dataset series and the summary extracted + Extracted summary dictionary """ funcs = self.mapping.get(dtype, []) op = compose(funcs) summary = op(*args)[-1] return summary - - -def get_render_map() -> Dict[str, Callable]: - import ydata_profiling.report.structure.variables as render_algorithms - - render_map = { - "Boolean": render_algorithms.render_boolean, - "Numeric": render_algorithms.render_real, - "Complex": render_algorithms.render_complex, - "Text": render_algorithms.render_text, - "DateTime": render_algorithms.render_date, - "Categorical": render_algorithms.render_categorical, - "URL": render_algorithms.render_url, - "Path": render_algorithms.render_path, - "File": render_algorithms.render_file, - "Image": render_algorithms.render_image, - "Unsupported": render_algorithms.render_generic, - "TimeSeries": render_algorithms.render_timeseries, - } - - return render_map diff --git a/src/ydata_profiling/model/summarizer.py b/src/ydata_profiling/model/summarizer.py index d733a7d36..a3665f42b 100644 --- a/src/ydata_profiling/model/summarizer.py +++ b/src/ydata_profiling/model/summarizer.py @@ -10,53 +10,134 @@ from ydata_profiling.config import Settings from ydata_profiling.model import BaseDescription from ydata_profiling.model.handler import Handler -from ydata_profiling.model.pandas import ( - pandas_describe_boolean_1d, - pandas_describe_categorical_1d, - pandas_describe_counts, - pandas_describe_date_1d, - pandas_describe_file_1d, - pandas_describe_generic, - pandas_describe_image_1d, - pandas_describe_numeric_1d, - pandas_describe_path_1d, - pandas_describe_text_1d, - pandas_describe_timeseries_1d, - pandas_describe_url_1d, -) -from ydata_profiling.model.pandas.describe_supported_pandas import ( - pandas_describe_supported, -) -from 
ydata_profiling.model.summary_algorithms import ( # Check what is this method used for - describe_file_1d, - describe_image_1d, - describe_path_1d, - describe_timeseries_1d, - describe_url_1d, -) from ydata_profiling.utils.backend import is_pyspark_installed class BaseSummarizer(Handler): - """A base summarizer + """Base class for data summarization. - Can be used to define custom summarizations + Provides a flexible framework to define custom summarization strategies + for different data types and dataframe backends. """ def summarize( self, config: Settings, series: pd.Series, dtype: Type[VisionsBaseType] ) -> dict: - """Generates the summary for a given series""" + """Generates the summary statistics for a given series. + + Args: + config: Report configuration settings + series: Data series to summarize + dtype: Detected data type from visions typeset + + Returns: + Dictionary containing summary statistics + """ return self.handle(str(dtype), config, series, {"type": str(dtype)}) -# Revisit this with the correct support for Spark as well. 
+def _create_pandas_summary_map() -> Dict[str, List[Callable]]: + """Create summary function mapping for Pandas backend.""" + from ydata_profiling.model.pandas import ( + pandas_describe_boolean_1d, + pandas_describe_categorical_1d, + pandas_describe_counts, + pandas_describe_date_1d, + pandas_describe_file_1d, + pandas_describe_generic, + pandas_describe_image_1d, + pandas_describe_numeric_1d, + pandas_describe_path_1d, + pandas_describe_text_1d, + pandas_describe_timeseries_1d, + pandas_describe_url_1d, + ) + from ydata_profiling.model.pandas.describe_supported_pandas import ( + pandas_describe_supported, + ) + + return { + "Unsupported": [ + pandas_describe_counts, + pandas_describe_generic, + pandas_describe_supported, + ], + "Numeric": [pandas_describe_numeric_1d], + "DateTime": [pandas_describe_date_1d], + "Text": [pandas_describe_text_1d], + "Categorical": [pandas_describe_categorical_1d], + "Boolean": [pandas_describe_boolean_1d], + "URL": [pandas_describe_url_1d], + "Path": [pandas_describe_path_1d], + "File": [pandas_describe_file_1d], + "Image": [pandas_describe_image_1d], + "TimeSeries": [pandas_describe_timeseries_1d], + } + + +def _create_spark_summary_map() -> Dict[str, List[Callable]]: + """Create summary function mapping for Spark backend.""" + from ydata_profiling.model.spark import ( + describe_boolean_1d_spark, + describe_categorical_1d_spark, + describe_counts_spark, + describe_date_1d_spark, + describe_generic_spark, + describe_numeric_1d_spark, + describe_supported_spark, + describe_text_1d_spark, + ) + from ydata_profiling.model.summary_algorithms import ( + describe_file_1d, + describe_image_1d, + describe_path_1d, + describe_timeseries_1d, + describe_url_1d, + ) + + return { + "Unsupported": [ + describe_counts_spark, + describe_generic_spark, + describe_supported_spark, + ], + "Numeric": [describe_numeric_1d_spark], + "DateTime": [describe_date_1d_spark], + "Text": [describe_text_1d_spark], + "Categorical": 
[describe_categorical_1d_spark], + "Boolean": [describe_boolean_1d_spark], + "URL": [describe_url_1d], + "Path": [describe_path_1d], + "File": [describe_file_1d], + "Image": [describe_image_1d], + "TimeSeries": [describe_timeseries_1d], + } + + +def _create_summary_map_factory(use_spark: bool) -> Dict[str, List[Callable]]: + """Factory function to create appropriate summary map based on backend. + + Args: + use_spark: If True, create Spark-compatible summary map + + Returns: + Mapping from data types to summary functions + """ + if use_spark: + return _create_spark_summary_map() + return _create_pandas_summary_map() + + class ProfilingSummarizer(BaseSummarizer): - """A summarizer for Pandas DataFrames.""" + """Standard summarizer for data profiling. + + Supports both Pandas and Spark backends, providing comprehensive + statistical summaries for all standard data types. + """ def __init__(self, typeset: VisionsTypeset, use_spark: bool = False): self.use_spark = use_spark and is_pyspark_installed() - self._summary_map = self._create_summary_map() + self._summary_map = _create_summary_map_factory(self.use_spark) super().__init__(self._summary_map, typeset) @property @@ -64,57 +145,6 @@ def summary_map(self) -> Dict[str, List[Callable]]: """Allows users to modify the summary map after initialization.""" return self._summary_map - def _create_summary_map(self) -> Dict[str, List[Callable]]: - """Creates the summary map for Pandas summarization.""" - if self.use_spark: - from ydata_profiling.model.spark import ( - describe_boolean_1d_spark, - describe_categorical_1d_spark, - describe_counts_spark, - describe_date_1d_spark, - describe_generic_spark, - describe_numeric_1d_spark, - describe_supported_spark, - describe_text_1d_spark, - ) - - summary_map = { - "Unsupported": [ - describe_counts_spark, - describe_generic_spark, - describe_supported_spark, - ], - "Numeric": [describe_numeric_1d_spark], - "DateTime": [describe_date_1d_spark], - "Text": [describe_text_1d_spark], - 
"Categorical": [describe_categorical_1d_spark], - "Boolean": [describe_boolean_1d_spark], - "URL": [describe_url_1d], - "Path": [describe_path_1d], - "File": [describe_file_1d], - "Image": [describe_image_1d], - "TimeSeries": [describe_timeseries_1d], - } - else: - summary_map = { - "Unsupported": [ - pandas_describe_counts, - pandas_describe_generic, - pandas_describe_supported, - ], - "Numeric": [pandas_describe_numeric_1d], - "DateTime": [pandas_describe_date_1d], - "Text": [pandas_describe_text_1d], - "Categorical": [pandas_describe_categorical_1d], - "Boolean": [pandas_describe_boolean_1d], - "URL": [pandas_describe_url_1d], - "Path": [pandas_describe_path_1d], - "File": [pandas_describe_file_1d], - "Image": [pandas_describe_image_1d], - "TimeSeries": [pandas_describe_timeseries_1d], - } - return summary_map - def format_summary(summary: Union[BaseDescription, dict]) -> dict: """Prepare summary for export to json file. diff --git a/src/ydata_profiling/report/structure/report.py b/src/ydata_profiling/report/structure/report.py index 482b410b2..64bec9fd8 100644 --- a/src/ydata_profiling/report/structure/report.py +++ b/src/ydata_profiling/report/structure/report.py @@ -1,5 +1,5 @@ import os -from typing import List, Sequence +from typing import Callable, Dict, List, Sequence import pandas as pd from tqdm.auto import tqdm @@ -7,7 +7,6 @@ from ydata_profiling.config import Settings from ydata_profiling.model import BaseDescription from ydata_profiling.model.alerts import AlertType -from ydata_profiling.model.handler import get_render_map from ydata_profiling.report.presentation.core import ( HTML, Collapse, @@ -24,6 +23,30 @@ from ydata_profiling.utils.dataframe import slugify +def get_render_map() -> Dict[str, Callable]: + """Create mapping from data types to rendering functions. 
+ + Returns: + Dictionary mapping data type names to their respective render functions + """ + import ydata_profiling.report.structure.variables as render_algorithms + + return { + "Boolean": render_algorithms.render_boolean, + "Numeric": render_algorithms.render_real, + "Complex": render_algorithms.render_complex, + "Text": render_algorithms.render_text, + "DateTime": render_algorithms.render_date, + "Categorical": render_algorithms.render_categorical, + "URL": render_algorithms.render_url, + "Path": render_algorithms.render_path, + "File": render_algorithms.render_file, + "Image": render_algorithms.render_image, + "Unsupported": render_algorithms.render_generic, + "TimeSeries": render_algorithms.render_timeseries, + } + + def get_missing_items(config: Settings, summary: BaseDescription) -> list: """Return the missing diagrams diff --git a/venv/Dockerfile b/venv/Dockerfile new file mode 100644 index 000000000..9e0a68801 --- /dev/null +++ b/venv/Dockerfile @@ -0,0 +1,19 @@ +FROM python:3.10-slim + +WORKDIR /app + +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + && rm -rf /var/lib/apt/lists/* + +COPY . . + +RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir "setuptools>=72.0.0,<80.0.0" wheel && \ + pip install --no-cache-dir . 
&& \ + pip install --no-cache-dir "setuptools>=72.0.0,<80.0.0" && \ + pip install --no-cache-dir jupyter + +EXPOSE 8888 + +CMD ["jupyter", "notebook", "--ip=0.0.0.0", "--port=8888", "--no-browser", "--allow-root"] From dbfcf1169a8067aa8e11e462038ebc4b2dd8f828 Mon Sep 17 00:00:00 2001 From: Pkcha Date: Sat, 11 Apr 2026 20:25:37 +0800 Subject: [PATCH 2/2] feat: initial release --- src/ydata_profiling/config.py | 40 +- src/ydata_profiling/model/alerts.py | 19 +- src/ydata_profiling/model/describe.py | 52 +- src/ydata_profiling/model/handler.py | 20 +- src/ydata_profiling/model/summarizer.py | 474 +++++++++--------- .../model/summary_algorithms.py | 6 - src/ydata_profiling/report/formatters.py | 99 +--- src/ydata_profiling/utils/formatters.py | 98 ++++ 8 files changed, 373 insertions(+), 435 deletions(-) create mode 100644 src/ydata_profiling/utils/formatters.py diff --git a/src/ydata_profiling/config.py b/src/ydata_profiling/config.py index ed45110d0..09dbecdde 100644 --- a/src/ydata_profiling/config.py +++ b/src/ydata_profiling/config.py @@ -407,15 +407,8 @@ class SparkSettings(Settings): samples.random = 0 -class _Config: - """Container for configuration presets and shorthand mappings. - - This class provides predefined configuration groups (sensitive, explorative, themes) - and shorthand mappings for common configuration options. It should be used only - through its static methods. - """ - - arg_groups = { +class Config: + arg_groups: Dict[str, Any] = { "sensitive": { "samples": None, "duplicates": None, @@ -482,36 +475,18 @@ class _Config: @staticmethod def get_arg_groups(key: str) -> dict: - """Get expanded configuration for a preset group. 
- - Args: - key: Name of preset group (e.g., "sensitive", "explorative") - - Returns: - Expanded configuration dictionary with shorthands resolved - """ - kwargs = _Config.arg_groups[key] - shorthand_args, _ = _Config.shorthands(kwargs, split=False) + kwargs = Config.arg_groups[key] + shorthand_args, _ = Config.shorthands(kwargs, split=False) return shorthand_args @staticmethod def shorthands(kwargs: dict, split: bool = True) -> Tuple[dict, dict]: - """Expand shorthand configuration keys. - - Args: - kwargs: Configuration dictionary potentially containing shorthands - split: If True, remove shorthands from kwargs and return separately. - If False, expand shorthands in-place within kwargs. - - Returns: - Tuple of (shorthand_args, remaining_kwargs) - """ shorthand_args = {} if not split: shorthand_args = kwargs for key, value in list(kwargs.items()): - if value is None and key in _Config._shorthands: - shorthand_args[key] = _Config._shorthands[key] + if value is None and key in Config._shorthands: + shorthand_args[key] = Config._shorthands[key] if split: del kwargs[key] @@ -519,6 +494,3 @@ def shorthands(kwargs: dict, split: bool = True) -> Tuple[dict, dict]: return shorthand_args, kwargs else: return shorthand_args, {} - - -Config = _Config diff --git a/src/ydata_profiling/model/alerts.py b/src/ydata_profiling/model/alerts.py index 1b16d27a0..6c0bfa193 100644 --- a/src/ydata_profiling/model/alerts.py +++ b/src/ydata_profiling/model/alerts.py @@ -9,27 +9,10 @@ from ydata_profiling.config import Settings from ydata_profiling.model.correlations import perform_check_correlation +from ydata_profiling.utils.formatters import fmt_percent from ydata_profiling.utils.styles import get_alert_styles -def fmt_percent(value: float, edge_cases: bool = True) -> str: - """Format a ratio as a percentage. - - Args: - edge_cases: Check for edge cases? - value: The ratio. - - Returns: - The percentage with 1 point precision. 
- """ - if edge_cases and round(value, 3) == 0 and value > 0: - return "< 0.1%" - if edge_cases and round(value, 3) == 1 and value < 1: - return "> 99.9%" - - return f"{value*100:2.1f}%" - - @unique class AlertType(Enum): """Alert types""" diff --git a/src/ydata_profiling/model/describe.py b/src/ydata_profiling/model/describe.py index 1aac3747f..74bdf924a 100644 --- a/src/ydata_profiling/model/describe.py +++ b/src/ydata_profiling/model/describe.py @@ -27,37 +27,6 @@ from ydata_profiling.version import __version__ -def _validate_inputs( - config: Settings, df: Union[pd.DataFrame, "pyspark.sql.DataFrame"] # type: ignore[name-defined] # noqa: F821 -) -> None: - """Validate input types for profiling. - - Args: - config: Report configuration settings - df: DataFrame to profile - - Raises: - TypeError: If inputs are of incorrect type - """ - if not isinstance(config, Settings): - raise TypeError(f"`config` must be of type `Settings`, got {type(config)}") - - if isinstance(df, pd.DataFrame): - return - - try: - from pyspark.sql import DataFrame as SparkDataFrame - if isinstance(df, SparkDataFrame): - return - except ImportError: - pass - - raise TypeError( - f"`df` must be either a `pandas.DataFrame` or a `pyspark.sql.DataFrame`, but got {type(df)}." - f"If using Spark, make sure PySpark is installed." - ) - - def describe( config: Settings, df: Union[pd.DataFrame, "pyspark.sql.DataFrame"], # type: ignore[name-defined] # noqa: F821 @@ -83,7 +52,26 @@ def describe( - alerts: direct special attention to these patterns in your data. - package: package details. 
""" - _validate_inputs(config, df) + # ** Validate Input types ** + if not isinstance(config, Settings): + raise TypeError(f"`config` must be of type `Settings`, got {type(config)}") + + # Validate df input type + + if not isinstance(df, pd.DataFrame): + try: + from pyspark.sql import DataFrame as SparkDataFrame # type: ignore + + if not isinstance(df, SparkDataFrame): # noqa: TC301 + raise TypeError( # noqa: TC301 + f"`df` must be either a `pandas.DataFrame` or a `pyspark.sql.DataFrame`, but got {type(df)}." + ) + except ImportError as ex: + raise TypeError( + f"`df must be either a `pandas.DataFrame` or a `pyspark.sql.DataFrame`, but got {type(df)}." + f"If using Spark, make sure PySpark is installed." + ) from ex + df = preprocess(config, df) number_of_tasks = 5 diff --git a/src/ydata_profiling/model/handler.py b/src/ydata_profiling/model/handler.py index 91d135973..d4e07418d 100644 --- a/src/ydata_profiling/model/handler.py +++ b/src/ydata_profiling/model/handler.py @@ -25,10 +25,9 @@ def composed_function(*args) -> List[Any]: class Handler: - """Generic handler for data type specific processing pipelines. + """A generic handler - Builds a processing pipeline for each data type by composing functions - along the type hierarchy. Allows custom summarization strategies. + Allows any custom mapping between data types and functions """ def __init__( @@ -43,11 +42,6 @@ def __init__( self._complete_dag() def _complete_dag(self) -> None: - """Propagate functions along the type hierarchy DAG. - - Functions defined for parent types are inherited by subtypes, - creating a complete processing pipeline for each type. - """ for from_type, to_type in nx.topological_sort( nx.line_graph(self.typeset.base_graph) ): @@ -56,15 +50,9 @@ def _complete_dag(self) -> None: ) def handle(self, dtype: str, *args, **kwargs) -> dict: - """Execute the processing pipeline for a given data type. 
- - Args: - dtype: Name of the data type to process - *args: Arguments passed to the processing pipeline - **kwargs: Additional keyword arguments - + """ Returns: - Extracted summary dictionary + object: a tuple containing the config, the dataset series and the summary extracted """ funcs = self.mapping.get(dtype, []) op = compose(funcs) diff --git a/src/ydata_profiling/model/summarizer.py b/src/ydata_profiling/model/summarizer.py index a3665f42b..67bd3e7bf 100644 --- a/src/ydata_profiling/model/summarizer.py +++ b/src/ydata_profiling/model/summarizer.py @@ -1,237 +1,237 @@ -# mypy: ignore-errors - -from dataclasses import asdict -from typing import Any, Callable, Dict, List, Type, Union - -import numpy as np -import pandas as pd -from visions import VisionsBaseType, VisionsTypeset - -from ydata_profiling.config import Settings -from ydata_profiling.model import BaseDescription -from ydata_profiling.model.handler import Handler -from ydata_profiling.utils.backend import is_pyspark_installed - - -class BaseSummarizer(Handler): - """Base class for data summarization. - - Provides a flexible framework to define custom summarization strategies - for different data types and dataframe backends. - """ - - def summarize( - self, config: Settings, series: pd.Series, dtype: Type[VisionsBaseType] - ) -> dict: - """Generates the summary statistics for a given series. 
- - Args: - config: Report configuration settings - series: Data series to summarize - dtype: Detected data type from visions typeset - - Returns: - Dictionary containing summary statistics - """ - return self.handle(str(dtype), config, series, {"type": str(dtype)}) - - -def _create_pandas_summary_map() -> Dict[str, List[Callable]]: - """Create summary function mapping for Pandas backend.""" - from ydata_profiling.model.pandas import ( - pandas_describe_boolean_1d, - pandas_describe_categorical_1d, - pandas_describe_counts, - pandas_describe_date_1d, - pandas_describe_file_1d, - pandas_describe_generic, - pandas_describe_image_1d, - pandas_describe_numeric_1d, - pandas_describe_path_1d, - pandas_describe_text_1d, - pandas_describe_timeseries_1d, - pandas_describe_url_1d, - ) - from ydata_profiling.model.pandas.describe_supported_pandas import ( - pandas_describe_supported, - ) - - return { - "Unsupported": [ - pandas_describe_counts, - pandas_describe_generic, - pandas_describe_supported, - ], - "Numeric": [pandas_describe_numeric_1d], - "DateTime": [pandas_describe_date_1d], - "Text": [pandas_describe_text_1d], - "Categorical": [pandas_describe_categorical_1d], - "Boolean": [pandas_describe_boolean_1d], - "URL": [pandas_describe_url_1d], - "Path": [pandas_describe_path_1d], - "File": [pandas_describe_file_1d], - "Image": [pandas_describe_image_1d], - "TimeSeries": [pandas_describe_timeseries_1d], - } - - -def _create_spark_summary_map() -> Dict[str, List[Callable]]: - """Create summary function mapping for Spark backend.""" - from ydata_profiling.model.spark import ( - describe_boolean_1d_spark, - describe_categorical_1d_spark, - describe_counts_spark, - describe_date_1d_spark, - describe_generic_spark, - describe_numeric_1d_spark, - describe_supported_spark, - describe_text_1d_spark, - ) - from ydata_profiling.model.summary_algorithms import ( - describe_file_1d, - describe_image_1d, - describe_path_1d, - describe_timeseries_1d, - describe_url_1d, - ) - - return 
{ - "Unsupported": [ - describe_counts_spark, - describe_generic_spark, - describe_supported_spark, - ], - "Numeric": [describe_numeric_1d_spark], - "DateTime": [describe_date_1d_spark], - "Text": [describe_text_1d_spark], - "Categorical": [describe_categorical_1d_spark], - "Boolean": [describe_boolean_1d_spark], - "URL": [describe_url_1d], - "Path": [describe_path_1d], - "File": [describe_file_1d], - "Image": [describe_image_1d], - "TimeSeries": [describe_timeseries_1d], - } - - -def _create_summary_map_factory(use_spark: bool) -> Dict[str, List[Callable]]: - """Factory function to create appropriate summary map based on backend. - - Args: - use_spark: If True, create Spark-compatible summary map - - Returns: - Mapping from data types to summary functions - """ - if use_spark: - return _create_spark_summary_map() - return _create_pandas_summary_map() - - -class ProfilingSummarizer(BaseSummarizer): - """Standard summarizer for data profiling. - - Supports both Pandas and Spark backends, providing comprehensive - statistical summaries for all standard data types. - """ - - def __init__(self, typeset: VisionsTypeset, use_spark: bool = False): - self.use_spark = use_spark and is_pyspark_installed() - self._summary_map = _create_summary_map_factory(self.use_spark) - super().__init__(self._summary_map, typeset) - - @property - def summary_map(self) -> Dict[str, List[Callable]]: - """Allows users to modify the summary map after initialization.""" - return self._summary_map - - -def format_summary(summary: Union[BaseDescription, dict]) -> dict: - """Prepare summary for export to json file. 
- - Args: - summary (Union[BaseDescription, dict]): summary to export - - Returns: - dict: summary as dict - """ - - def fmt(v: Any) -> Any: - if isinstance(v, dict): - return {k: fmt(va) for k, va in v.items()} - else: - if isinstance(v, pd.Series): - return fmt(v.to_dict()) - elif ( - isinstance(v, tuple) - and len(v) == 2 - and all(isinstance(x, np.ndarray) for x in v) - ): - return {"counts": v[0].tolist(), "bin_edges": v[1].tolist()} - else: - return v - - if isinstance(summary, BaseDescription): - summary = asdict(summary) - - summary = {k: fmt(v) for k, v in summary.items()} - return summary - - -def _redact_column(column: Dict[str, Any]) -> Dict[str, Any]: - def redact_key(data: Dict[str, Any]) -> Dict[str, Any]: - return {f"REDACTED_{i}": v for i, (_, v) in enumerate(data.items())} - - def redact_value(data: Dict[str, Any]) -> Dict[str, Any]: - return {k: f"REDACTED_{i}" for i, (k, _) in enumerate(data.items())} - - keys_to_redact = [ - "block_alias_char_counts", - "block_alias_values", - "category_alias_char_counts", - "category_alias_values", - "character_counts", - "script_char_counts", - "value_counts_index_sorted", - "value_counts_without_nan", - "word_counts", - ] - - values_to_redact = ["first_rows"] - - for field in keys_to_redact: - if field not in column: - continue - is_dict = (isinstance(v, dict) for v in column[field].values()) - if any(is_dict): - column[field] = {k: redact_key(v) for k, v in column[field].items()} - else: - column[field] = redact_key(column[field]) - - for field in values_to_redact: - if field not in column: - continue - is_dict = (isinstance(v, dict) for v in column[field].values()) - if any(is_dict): - column[field] = {k: redact_value(v) for k, v in column[field].items()} - else: - column[field] = redact_value(column[field]) - - return column - - -def redact_summary(summary: dict, config: Settings) -> dict: - """Redact summary to export to json file. 
- - Args: - summary (dict): summary to redact - - Returns: - dict: redacted summary - """ - for _, col in summary["variables"].items(): - if (config.vars.cat.redact and col["type"] == "Categorical") or ( - config.vars.text.redact and col["type"] == "Text" - ): - col = _redact_column(col) - return summary +# mypy: ignore-errors + +from dataclasses import asdict +from typing import Any, Callable, Dict, List, Type, Union + +import numpy as np +import pandas as pd +from visions import VisionsBaseType, VisionsTypeset + +from ydata_profiling.config import Settings +from ydata_profiling.model import BaseDescription +from ydata_profiling.model.handler import Handler +from ydata_profiling.utils.backend import is_pyspark_installed + + +class BaseSummarizer(Handler): + """Base class for data summarization. + + Provides a flexible framework to define custom summarization strategies + for different data types and dataframe backends. + """ + + def summarize( + self, config: Settings, series: pd.Series, dtype: Type[VisionsBaseType] + ) -> dict: + """Generates the summary statistics for a given series. 
def _create_pandas_summary_map() -> Dict[str, List[Callable]]:
    """Build the data-type -> summary-function mapping for the Pandas backend."""
    from ydata_profiling.model.pandas import (
        pandas_describe_boolean_1d,
        pandas_describe_categorical_1d,
        pandas_describe_counts,
        pandas_describe_date_1d,
        pandas_describe_file_1d,
        pandas_describe_generic,
        pandas_describe_image_1d,
        pandas_describe_numeric_1d,
        pandas_describe_path_1d,
        pandas_describe_text_1d,
        pandas_describe_timeseries_1d,
        pandas_describe_url_1d,
    )
    from ydata_profiling.model.pandas.describe_supported_pandas import (
        pandas_describe_supported,
    )

    # "Unsupported" is the base pipeline applied to every column; the other
    # entries each add a single type-specific describe step.
    summary_map: Dict[str, List[Callable]] = {
        "Unsupported": [
            pandas_describe_counts,
            pandas_describe_generic,
            pandas_describe_supported,
        ],
    }
    type_handlers: Dict[str, Callable] = {
        "Numeric": pandas_describe_numeric_1d,
        "DateTime": pandas_describe_date_1d,
        "Text": pandas_describe_text_1d,
        "Categorical": pandas_describe_categorical_1d,
        "Boolean": pandas_describe_boolean_1d,
        "URL": pandas_describe_url_1d,
        "Path": pandas_describe_path_1d,
        "File": pandas_describe_file_1d,
        "Image": pandas_describe_image_1d,
        "TimeSeries": pandas_describe_timeseries_1d,
    }
    summary_map.update({name: [fn] for name, fn in type_handlers.items()})
    return summary_map


def _create_spark_summary_map() -> Dict[str, List[Callable]]:
    """Build the data-type -> summary-function mapping for the Spark backend."""
    from ydata_profiling.model.spark import (
        describe_boolean_1d_spark,
        describe_categorical_1d_spark,
        describe_counts_spark,
        describe_date_1d_spark,
        describe_generic_spark,
        describe_numeric_1d_spark,
        describe_supported_spark,
        describe_text_1d_spark,
    )
    from ydata_profiling.model.summary_algorithms import (
        describe_file_1d,
        describe_image_1d,
        describe_path_1d,
        describe_timeseries_1d,
        describe_url_1d,
    )

    # Spark has native implementations for the common types; the remaining
    # types fall back to the generic summary algorithms.
    summary_map: Dict[str, List[Callable]] = {
        "Unsupported": [
            describe_counts_spark,
            describe_generic_spark,
            describe_supported_spark,
        ],
    }
    type_handlers: Dict[str, Callable] = {
        "Numeric": describe_numeric_1d_spark,
        "DateTime": describe_date_1d_spark,
        "Text": describe_text_1d_spark,
        "Categorical": describe_categorical_1d_spark,
        "Boolean": describe_boolean_1d_spark,
        "URL": describe_url_1d,
        "Path": describe_path_1d,
        "File": describe_file_1d,
        "Image": describe_image_1d,
        "TimeSeries": describe_timeseries_1d,
    }
    summary_map.update({name: [fn] for name, fn in type_handlers.items()})
    return summary_map


def _create_summary_map_factory(use_spark: bool) -> Dict[str, List[Callable]]:
    """Select and build the summary map for the requested backend.

    Args:
        use_spark: If True, build the Spark-compatible summary map.

    Returns:
        Mapping from data-type name to its list of summary functions.
    """
    builder = _create_spark_summary_map if use_spark else _create_pandas_summary_map
    return builder()


class ProfilingSummarizer(BaseSummarizer):
    """Standard summarizer for data profiling.

    Works with both the Pandas and the Spark backend, providing
    statistical summaries for all standard data types.
    """

    def __init__(self, typeset: VisionsTypeset, use_spark: bool = False):
        # Only enable Spark mode when it is both requested AND available.
        self.use_spark = use_spark and is_pyspark_installed()
        self._summary_map = _create_summary_map_factory(self.use_spark)
        super().__init__(self._summary_map, typeset)

    @property
    def summary_map(self) -> Dict[str, List[Callable]]:
        """The live type -> summary-function mapping; callers may mutate its entries."""
        return self._summary_map
def format_summary(summary: Union["BaseDescription", dict]) -> dict:
    """Prepare a profiling summary for export to a json file.

    Args:
        summary (Union[BaseDescription, dict]): summary to export

    Returns:
        dict: summary as dict, with Series converted to plain dicts and
        histogram tuples converted to {"counts", "bin_edges"} lists
    """

    def fmt(v: Any) -> Any:
        if isinstance(v, dict):
            return {k: fmt(va) for k, va in v.items()}
        if isinstance(v, pd.Series):
            return fmt(v.to_dict())
        if (
            isinstance(v, tuple)
            and len(v) == 2
            and all(isinstance(x, np.ndarray) for x in v)
        ):
            # np.histogram output: (counts, bin_edges)
            return {"counts": v[0].tolist(), "bin_edges": v[1].tolist()}
        return v

    if isinstance(summary, BaseDescription):
        summary = asdict(summary)

    return {k: fmt(v) for k, v in summary.items()}


def _redact_column(column: Dict[str, Any]) -> Dict[str, Any]:
    """Redact sensitive information from one variable's summary, in place.

    Fields listed in ``keys_to_redact`` have their keys replaced with
    ``REDACTED_<i>`` placeholders; fields in ``values_to_redact`` have their
    values replaced instead. For nested mappings each sub-dict is redacted
    individually (non-dict entries are left untouched; the previous
    implementation raised AttributeError on mixed dict/non-dict values).

    Args:
        column: Summary dictionary for one variable; mutated in place.

    Returns:
        The same dictionary, with sensitive fields redacted.
    """

    def redact_key(data: Dict[str, Any]) -> Dict[str, Any]:
        return {f"REDACTED_{i}": v for i, (_, v) in enumerate(data.items())}

    def redact_value(data: Dict[str, Any]) -> Dict[str, Any]:
        return {k: f"REDACTED_{i}" for i, (k, _) in enumerate(data.items())}

    keys_to_redact = [
        "block_alias_char_counts",
        "block_alias_values",
        "category_alias_char_counts",
        "category_alias_values",
        "character_counts",
        "script_char_counts",
        "value_counts_index_sorted",
        "value_counts_without_nan",
        "word_counts",
    ]

    values_to_redact = ["first_rows"]

    def apply_redaction(
        field: str, redactor: Callable[[Dict[str, Any]], Dict[str, Any]]
    ) -> None:
        # One level of nesting at most: if any value is itself a mapping,
        # redact the nested mappings entry by entry; otherwise redact the
        # field's top-level mapping directly.
        data = column[field]
        if any(isinstance(v, dict) for v in data.values()):
            column[field] = {
                k: redactor(v) if isinstance(v, dict) else v
                for k, v in data.items()
            }
        else:
            column[field] = redactor(data)

    for field in keys_to_redact:
        if field in column:
            apply_redaction(field, redact_key)

    for field in values_to_redact:
        if field in column:
            apply_redaction(field, redact_value)

    return column


def redact_summary(summary: dict, config: "Settings") -> dict:
    """Redact summary to export to json file.

    Args:
        summary (dict): summary to redact
        config (Settings): report settings; ``vars.cat.redact`` and
            ``vars.text.redact`` select which variable types are redacted

    Returns:
        dict: redacted summary (same object, mutated in place)
    """
    for col in summary["variables"].values():
        if (config.vars.cat.redact and col["type"] == "Categorical") or (
            config.vars.text.redact and col["type"] == "Text"
        ):
            _redact_column(col)
    return summary
list_args(func: Callable) -> Callable: - """Extend the function to allow taking a list as the first argument, and apply the function on each of the elements. - - Args: - func: the function to extend - - Returns: - The extended function - """ - - def inner(arg: Any, *args: Any, **kwargs: Any) -> Any: - if isinstance(arg, list): - return [func(v, *args, **kwargs) for v in arg] - - return func(arg, *args, **kwargs) - - return inner +from ydata_profiling.utils.formatters import ( + fmt_array, + fmt_number, + fmt_numeric, + fmt_percent, + list_args, +) @list_args @@ -75,25 +64,6 @@ def fmt_bytesize(num: float, suffix: str = "B") -> str: return f"{num:.1f} Yi{suffix}" -@list_args -def fmt_percent(value: float, edge_cases: bool = True) -> str: - """Format a ratio as a percentage. - - Args: - edge_cases: Check for edge cases? - value: The ratio. - - Returns: - The percentage with 1 point precision. - """ - if edge_cases and round(value, 3) == 0 and value > 0: - return "< 0.1%" - if edge_cases and round(value, 3) == 1 and value < 1: - return "> 99.9%" - - return f"{value*100:2.1f}%" - - @list_args def fmt_timespan(num_seconds: Any, detailed: bool = False, max_units: int = 3) -> str: # From the `humanfriendly` module (without additional dependency) @@ -234,61 +204,6 @@ def fmt_timespan_timedelta( return fmt_numeric(delta, precision) -@list_args -def fmt_numeric(value: float, precision: int = 10) -> str: - """Format any numeric value. - - Args: - value: The numeric value to format. - precision: The numeric precision - - Returns: - The numeric value with the given precision. - """ - if value is None: - fmtted = "N/A" - else: - fmtted = f"{{:.{precision}g}}".format(value) - for v in ["e+", "e-"]: - if v in fmtted: - sign = "-" if v in "e-" else "" - fmtted = fmtted.replace(v, " × 10") + "" - fmtted = fmtted.replace("0", "") - fmtted = fmtted.replace("", f"{sign}") - - return fmtted - - -@list_args -def fmt_number(value: int) -> str: - """Format any numeric value. 
def list_args(func: Callable) -> Callable:
    """Extend the function to allow taking a list as the first argument, and apply the function on each of the elements.

    Args:
        func: the function to extend

    Returns:
        The extended function
    """

    def inner(arg: Any, *args: Any, **kwargs: Any) -> Any:
        if isinstance(arg, list):
            return [func(v, *args, **kwargs) for v in arg]

        return func(arg, *args, **kwargs)

    return inner


@list_args
def fmt_percent(value: float, edge_cases: bool = True) -> str:
    """Format a ratio as a percentage.

    Args:
        edge_cases: Check for edge cases?
        value: The ratio.

    Returns:
        The percentage with 1 point precision; "< 0.1%" / "> 99.9%" near the
        boundaries when edge_cases is enabled.
    """
    if edge_cases and round(value, 3) == 0 and value > 0:
        return "< 0.1%"
    if edge_cases and round(value, 3) == 1 and value < 1:
        return "> 99.9%"

    return f"{value*100:2.1f}%"


@list_args
def fmt_numeric(value: float, precision: int = 10) -> str:
    """Format any numeric value.

    Scientific notation is rendered as HTML, e.g. ``1e+20`` becomes
    ``1 × 10<sup>20</sup>``. (The ``<sup>``/``</sup>`` literals restore
    markup that had been stripped, which made the replace chain delete
    every "0" digit and interleave the sign between all characters.)

    Args:
        value: The numeric value to format.
        precision: The numeric precision

    Returns:
        The numeric value with the given precision, or "N/A" for None.
    """
    if value is None:
        fmtted = "N/A"
    else:
        fmtted = f"{{:.{precision}g}}".format(value)
        for v in ["e+", "e-"]:
            if v in fmtted:
                sign = "-" if v in "e-" else ""
                fmtted = fmtted.replace(v, " × 10<sup>") + "</sup>"
                # Drop a leading zero in the exponent, then prepend the sign.
                fmtted = fmtted.replace("<sup>0", "<sup>")
                fmtted = fmtted.replace("<sup>", f"<sup>{sign}")

    return fmtted


@list_args
def fmt_number(value: int) -> str:
    """Format an integer using the current locale's grouping.

    Args:
        value: The numeric value to format.

    Returns:
        The locale-formatted number.
    """
    return f"{value:n}"


@list_args
def fmt_array(value: np.ndarray, threshold: Any = np.nan) -> str:
    """Format numpy arrays.

    Args:
        value: Array to format.
        threshold: Threshold at which to show ellipsis

    Returns:
        The string representation of the numpy array.
    """
    with np.printoptions(threshold=3, edgeitems=threshold):
        return_value = str(value)

    return return_value