diff --git a/src/ydata_profiling/model/alerts.py b/src/ydata_profiling/model/alerts.py index 1b16d27a0..6c0bfa193 100644 --- a/src/ydata_profiling/model/alerts.py +++ b/src/ydata_profiling/model/alerts.py @@ -9,27 +9,10 @@ from ydata_profiling.config import Settings from ydata_profiling.model.correlations import perform_check_correlation +from ydata_profiling.utils.formatters import fmt_percent from ydata_profiling.utils.styles import get_alert_styles -def fmt_percent(value: float, edge_cases: bool = True) -> str: - """Format a ratio as a percentage. - - Args: - edge_cases: Check for edge cases? - value: The ratio. - - Returns: - The percentage with 1 point precision. - """ - if edge_cases and round(value, 3) == 0 and value > 0: - return "< 0.1%" - if edge_cases and round(value, 3) == 1 and value < 1: - return "> 99.9%" - - return f"{value*100:2.1f}%" - - @unique class AlertType(Enum): """Alert types""" diff --git a/src/ydata_profiling/model/handler.py b/src/ydata_profiling/model/handler.py index 992c1840c..d4e07418d 100644 --- a/src/ydata_profiling/model/handler.py +++ b/src/ydata_profiling/model/handler.py @@ -58,24 +58,3 @@ def handle(self, dtype: str, *args, **kwargs) -> dict: op = compose(funcs) summary = op(*args)[-1] return summary - - -def get_render_map() -> Dict[str, Callable]: - import ydata_profiling.report.structure.variables as render_algorithms - - render_map = { - "Boolean": render_algorithms.render_boolean, - "Numeric": render_algorithms.render_real, - "Complex": render_algorithms.render_complex, - "Text": render_algorithms.render_text, - "DateTime": render_algorithms.render_date, - "Categorical": render_algorithms.render_categorical, - "URL": render_algorithms.render_url, - "Path": render_algorithms.render_path, - "File": render_algorithms.render_file, - "Image": render_algorithms.render_image, - "Unsupported": render_algorithms.render_generic, - "TimeSeries": render_algorithms.render_timeseries, - } - - return render_map diff --git a/src/ydata_profiling/model/summarizer.py b/src/ydata_profiling/model/summarizer.py index d733a7d36..67bd3e7bf 100644 --- a/src/ydata_profiling/model/summarizer.py +++ b/src/ydata_profiling/model/summarizer.py @@ -1,207 +1,237 @@ -# mypy: ignore-errors - -from dataclasses import asdict -from typing import Any, Callable, Dict, List, Type, Union - -import numpy as np -import pandas as pd -from visions import VisionsBaseType, VisionsTypeset - -from ydata_profiling.config import Settings -from ydata_profiling.model import BaseDescription -from ydata_profiling.model.handler import Handler -from ydata_profiling.model.pandas import ( - pandas_describe_boolean_1d, - pandas_describe_categorical_1d, - pandas_describe_counts, - pandas_describe_date_1d, - pandas_describe_file_1d, - pandas_describe_generic, - pandas_describe_image_1d, - pandas_describe_numeric_1d, - pandas_describe_path_1d, - pandas_describe_text_1d, - pandas_describe_timeseries_1d, - pandas_describe_url_1d, -) -from ydata_profiling.model.pandas.describe_supported_pandas import ( - pandas_describe_supported, -) -from ydata_profiling.model.summary_algorithms import ( # Check what is this method used for - describe_file_1d, - describe_image_1d, - describe_path_1d, - describe_timeseries_1d, - describe_url_1d, -) -from ydata_profiling.utils.backend import is_pyspark_installed - - -class BaseSummarizer(Handler): - """A base summarizer - - Can be used to define custom summarizations - """ - - def summarize( - self, config: Settings, series: pd.Series, dtype: Type[VisionsBaseType] - ) -> dict: - """Generates the summary for a given series""" - return self.handle(str(dtype), config, series, {"type": str(dtype)}) - - -# Revisit this with the correct support for Spark as well. -class ProfilingSummarizer(BaseSummarizer): - """A summarizer for Pandas DataFrames.""" - - def __init__(self, typeset: VisionsTypeset, use_spark: bool = False): - self.use_spark = use_spark and is_pyspark_installed() - self._summary_map = self._create_summary_map() - super().__init__(self._summary_map, typeset) - - @property - def summary_map(self) -> Dict[str, List[Callable]]: - """Allows users to modify the summary map after initialization.""" - return self._summary_map - - def _create_summary_map(self) -> Dict[str, List[Callable]]: - """Creates the summary map for Pandas summarization.""" - if self.use_spark: - from ydata_profiling.model.spark import ( - describe_boolean_1d_spark, - describe_categorical_1d_spark, - describe_counts_spark, - describe_date_1d_spark, - describe_generic_spark, - describe_numeric_1d_spark, - describe_supported_spark, - describe_text_1d_spark, - ) - - summary_map = { - "Unsupported": [ - describe_counts_spark, - describe_generic_spark, - describe_supported_spark, - ], - "Numeric": [describe_numeric_1d_spark], - "DateTime": [describe_date_1d_spark], - "Text": [describe_text_1d_spark], - "Categorical": [describe_categorical_1d_spark], - "Boolean": [describe_boolean_1d_spark], - "URL": [describe_url_1d], - "Path": [describe_path_1d], - "File": [describe_file_1d], - "Image": [describe_image_1d], - "TimeSeries": [describe_timeseries_1d], - } - else: - summary_map = { - "Unsupported": [ - pandas_describe_counts, - pandas_describe_generic, - pandas_describe_supported, - ], - "Numeric": [pandas_describe_numeric_1d], - "DateTime": [pandas_describe_date_1d], - "Text": [pandas_describe_text_1d], - "Categorical": [pandas_describe_categorical_1d], - "Boolean": [pandas_describe_boolean_1d], - "URL": [pandas_describe_url_1d], - "Path": [pandas_describe_path_1d], - "File": [pandas_describe_file_1d], - "Image": [pandas_describe_image_1d], - "TimeSeries": [pandas_describe_timeseries_1d], - } - return summary_map - - -def format_summary(summary: Union[BaseDescription, dict]) -> dict: - """Prepare summary for export to json file. - - Args: - summary (Union[BaseDescription, dict]): summary to export - - Returns: - dict: summary as dict - """ - - def fmt(v: Any) -> Any: - if isinstance(v, dict): - return {k: fmt(va) for k, va in v.items()} - else: - if isinstance(v, pd.Series): - return fmt(v.to_dict()) - elif ( - isinstance(v, tuple) - and len(v) == 2 - and all(isinstance(x, np.ndarray) for x in v) - ): - return {"counts": v[0].tolist(), "bin_edges": v[1].tolist()} - else: - return v - - if isinstance(summary, BaseDescription): - summary = asdict(summary) - - summary = {k: fmt(v) for k, v in summary.items()} - return summary - - -def _redact_column(column: Dict[str, Any]) -> Dict[str, Any]: - def redact_key(data: Dict[str, Any]) -> Dict[str, Any]: - return {f"REDACTED_{i}": v for i, (_, v) in enumerate(data.items())} - - def redact_value(data: Dict[str, Any]) -> Dict[str, Any]: - return {k: f"REDACTED_{i}" for i, (k, _) in enumerate(data.items())} - - keys_to_redact = [ - "block_alias_char_counts", - "block_alias_values", - "category_alias_char_counts", - "category_alias_values", - "character_counts", - "script_char_counts", - "value_counts_index_sorted", - "value_counts_without_nan", - "word_counts", - ] - - values_to_redact = ["first_rows"] - - for field in keys_to_redact: - if field not in column: - continue - is_dict = (isinstance(v, dict) for v in column[field].values()) - if any(is_dict): - column[field] = {k: redact_key(v) for k, v in column[field].items()} - else: - column[field] = redact_key(column[field]) - - for field in values_to_redact: - if field not in column: - continue - is_dict = (isinstance(v, dict) for v in column[field].values()) - if any(is_dict): - column[field] = {k: redact_value(v) for k, v in column[field].items()} - else: - column[field] = redact_value(column[field]) - - return column - - -def redact_summary(summary: dict, config: Settings) -> dict: - """Redact summary to export to json file. - - Args: - summary (dict): summary to redact - - Returns: - dict: redacted summary - """ - for _, col in summary["variables"].items(): - if (config.vars.cat.redact and col["type"] == "Categorical") or ( - config.vars.text.redact and col["type"] == "Text" - ): - col = _redact_column(col) - return summary +# mypy: ignore-errors + +from dataclasses import asdict +from typing import Any, Callable, Dict, List, Type, Union + +import numpy as np +import pandas as pd +from visions import VisionsBaseType, VisionsTypeset + +from ydata_profiling.config import Settings +from ydata_profiling.model import BaseDescription +from ydata_profiling.model.handler import Handler +from ydata_profiling.utils.backend import is_pyspark_installed + + +class BaseSummarizer(Handler): + """Base class for data summarization. + + Provides a flexible framework to define custom summarization strategies + for different data types and dataframe backends. + """ + + def summarize( + self, config: Settings, series: pd.Series, dtype: Type[VisionsBaseType] + ) -> dict: + """Generates the summary statistics for a given series. + + Args: + config: Report configuration settings + series: Data series to summarize + dtype: Detected data type from visions typeset + + Returns: + Dictionary containing summary statistics + """ + return self.handle(str(dtype), config, series, {"type": str(dtype)}) + + +def _create_pandas_summary_map() -> Dict[str, List[Callable]]: + """Create summary function mapping for Pandas backend.""" + from ydata_profiling.model.pandas import ( + pandas_describe_boolean_1d, + pandas_describe_categorical_1d, + pandas_describe_counts, + pandas_describe_date_1d, + pandas_describe_file_1d, + pandas_describe_generic, + pandas_describe_image_1d, + pandas_describe_numeric_1d, + pandas_describe_path_1d, + pandas_describe_text_1d, + pandas_describe_timeseries_1d, + pandas_describe_url_1d, + ) + from ydata_profiling.model.pandas.describe_supported_pandas import ( + pandas_describe_supported, + ) + + return { + "Unsupported": [ + pandas_describe_counts, + pandas_describe_generic, + pandas_describe_supported, + ], + "Numeric": [pandas_describe_numeric_1d], + "DateTime": [pandas_describe_date_1d], + "Text": [pandas_describe_text_1d], + "Categorical": [pandas_describe_categorical_1d], + "Boolean": [pandas_describe_boolean_1d], + "URL": [pandas_describe_url_1d], + "Path": [pandas_describe_path_1d], + "File": [pandas_describe_file_1d], + "Image": [pandas_describe_image_1d], + "TimeSeries": [pandas_describe_timeseries_1d], + } + + +def _create_spark_summary_map() -> Dict[str, List[Callable]]: + """Create summary function mapping for Spark backend.""" + from ydata_profiling.model.spark import ( + describe_boolean_1d_spark, + describe_categorical_1d_spark, + describe_counts_spark, + describe_date_1d_spark, + describe_generic_spark, + describe_numeric_1d_spark, + describe_supported_spark, + describe_text_1d_spark, + ) + from ydata_profiling.model.summary_algorithms import ( + describe_file_1d, + describe_image_1d, + describe_path_1d, + describe_timeseries_1d, + describe_url_1d, + ) + + return { + "Unsupported": [ + describe_counts_spark, + describe_generic_spark, + describe_supported_spark, + ], + "Numeric": [describe_numeric_1d_spark], + "DateTime": [describe_date_1d_spark], + "Text": [describe_text_1d_spark], + "Categorical": [describe_categorical_1d_spark], + "Boolean": [describe_boolean_1d_spark], + "URL": [describe_url_1d], + "Path": [describe_path_1d], + "File": [describe_file_1d], + "Image": [describe_image_1d], + "TimeSeries": [describe_timeseries_1d], + } + + +def _create_summary_map_factory(use_spark: bool) -> Dict[str, List[Callable]]: + """Factory function to create appropriate summary map based on backend. + + Args: + use_spark: If True, create Spark-compatible summary map + + Returns: + Mapping from data types to summary functions + """ + if use_spark: + return _create_spark_summary_map() + return _create_pandas_summary_map() + + +class ProfilingSummarizer(BaseSummarizer): + """Standard summarizer for data profiling. + + Supports both Pandas and Spark backends, providing comprehensive + statistical summaries for all standard data types. + """ + + def __init__(self, typeset: VisionsTypeset, use_spark: bool = False): + self.use_spark = use_spark and is_pyspark_installed() + self._summary_map = _create_summary_map_factory(self.use_spark) + super().__init__(self._summary_map, typeset) + + @property + def summary_map(self) -> Dict[str, List[Callable]]: + """Allows users to modify the summary map after initialization.""" + return self._summary_map + + +def format_summary(summary: Union[BaseDescription, dict]) -> dict: + """Prepare summary for export to json file. + + Args: + summary (Union[BaseDescription, dict]): summary to export + + Returns: + dict: summary as dict + """ + + def fmt(v: Any) -> Any: + if isinstance(v, dict): + return {k: fmt(va) for k, va in v.items()} + else: + if isinstance(v, pd.Series): + return fmt(v.to_dict()) + elif ( + isinstance(v, tuple) + and len(v) == 2 + and all(isinstance(x, np.ndarray) for x in v) + ): + return {"counts": v[0].tolist(), "bin_edges": v[1].tolist()} + else: + return v + + if isinstance(summary, BaseDescription): + summary = asdict(summary) + + summary = {k: fmt(v) for k, v in summary.items()} + return summary + + +def _redact_column(column: Dict[str, Any]) -> Dict[str, Any]: + def redact_key(data: Dict[str, Any]) -> Dict[str, Any]: + return {f"REDACTED_{i}": v for i, (_, v) in enumerate(data.items())} + + def redact_value(data: Dict[str, Any]) -> Dict[str, Any]: + return {k: f"REDACTED_{i}" for i, (k, _) in enumerate(data.items())} + + keys_to_redact = [ + "block_alias_char_counts", + "block_alias_values", + "category_alias_char_counts", + "category_alias_values", + "character_counts", + "script_char_counts", + "value_counts_index_sorted", + "value_counts_without_nan", + "word_counts", + ] + + values_to_redact = ["first_rows"] + + for field in keys_to_redact: + if field not in column: + continue + is_dict = (isinstance(v, dict) for v in column[field].values()) + if any(is_dict): + column[field] = {k: redact_key(v) for k, v in column[field].items()} + else: + column[field] = redact_key(column[field]) + + for field in values_to_redact: + if field not in column: + continue + is_dict = (isinstance(v, dict) for v in column[field].values()) + if any(is_dict): + column[field] = {k: redact_value(v) for k, v in column[field].items()} + else: + column[field] = redact_value(column[field]) + + return column + + +def redact_summary(summary: dict, config: Settings) -> dict: + """Redact summary to export to json file. + + Args: + summary (dict): summary to redact + + Returns: + dict: redacted summary + """ + for _, col in summary["variables"].items(): + if (config.vars.cat.redact and col["type"] == "Categorical") or ( + config.vars.text.redact and col["type"] == "Text" + ): + col = _redact_column(col) + return summary diff --git a/src/ydata_profiling/model/summary_algorithms.py b/src/ydata_profiling/model/summary_algorithms.py index 9c3e5ef38..09a1fa374 100644 --- a/src/ydata_profiling/model/summary_algorithms.py +++ b/src/ydata_profiling/model/summary_algorithms.py @@ -74,13 +74,11 @@ def histogram_compute( hist_config = config.plot.histogram - # Compute data range finite = finite_values[np.isfinite(finite_values)] vmin = float(np.min(finite)) vmax = float(np.max(finite)) data_range = vmax - vmin - # Choose of Bins based on observed data values if data_range == 0: eps = 0.5 if vmin == 0 else abs(vmin) * 0.1 bins = np.array([vmin - eps, vmin + eps]) @@ -113,16 +111,13 @@ def chi_square( values: Optional[np.ndarray] = None, histogram: Optional[np.ndarray] = None, ) -> dict: - # Case 1: histogram not passed → we compute it if histogram is None: if values is None: return {"statistic": 0, "pvalue": 0} - # Try NumPy "auto" binning (may fail under NumPy 2) try: bins = np.histogram_bin_edges(values, bins="auto") except ValueError: - # Fallback: basic 1-bin histogram covering the min→max range finite = values[np.isfinite(values)] if finite.size == 0: return {"statistic": 0, "pvalue": 0} @@ -136,7 +131,6 @@ def chi_square( histogram, _ = np.histogram(values, bins=bins) - # Case 2: histogram exists but is empty if histogram.size == 0 or histogram.sum() == 0: return {"statistic": 0, "pvalue": 0} diff --git a/src/ydata_profiling/report/formatters.py b/src/ydata_profiling/report/formatters.py index 199ea854d..06fab6ac3 100644 --- a/src/ydata_profiling/report/formatters.py +++ b/src/ydata_profiling/report/formatters.py @@ -9,24 +9,13 @@ import pandas as pd from markupsafe import escape - -def list_args(func: Callable) -> Callable: - """Extend the function to allow taking a list as the first argument, and apply the function on each of the elements. - - Args: - func: the function to extend - - Returns: - The extended function - """ - - def inner(arg: Any, *args: Any, **kwargs: Any) -> Any: - if isinstance(arg, list): - return [func(v, *args, **kwargs) for v in arg] - - return func(arg, *args, **kwargs) - - return inner +from ydata_profiling.utils.formatters import ( + fmt_array, + fmt_number, + fmt_numeric, + fmt_percent, + list_args, +) @list_args @@ -75,25 +64,6 @@ def fmt_bytesize(num: float, suffix: str = "B") -> str: return f"{num:.1f} Yi{suffix}" -@list_args -def fmt_percent(value: float, edge_cases: bool = True) -> str: - """Format a ratio as a percentage. - - Args: - edge_cases: Check for edge cases? - value: The ratio. - - Returns: - The percentage with 1 point precision. - """ - if edge_cases and round(value, 3) == 0 and value > 0: - return "< 0.1%" - if edge_cases and round(value, 3) == 1 and value < 1: - return "> 99.9%" - - return f"{value*100:2.1f}%" - - @list_args def fmt_timespan(num_seconds: Any, detailed: bool = False, max_units: int = 3) -> str: # From the `humanfriendly` module (without additional dependency) @@ -234,61 +204,6 @@ def fmt_timespan_timedelta( return fmt_numeric(delta, precision) -@list_args -def fmt_numeric(value: float, precision: int = 10) -> str: - """Format any numeric value. - - Args: - value: The numeric value to format. - precision: The numeric precision - - Returns: - The numeric value with the given precision. - """ - if value is None: - fmtted = "N/A" - else: - fmtted = f"{{:.{precision}g}}".format(value) - for v in ["e+", "e-"]: - if v in fmtted: - sign = "-" if v in "e-" else "" - fmtted = fmtted.replace(v, " × 10") + "" - fmtted = fmtted.replace("0", "") - fmtted = fmtted.replace("", f"{sign}") - - return fmtted - - -@list_args -def fmt_number(value: int) -> str: - """Format any numeric value. - - Args: - value: The numeric value to format. - - Returns: - The numeric value with the given precision. - """ - return f"{value:n}" - - -@list_args -def fmt_array(value: np.ndarray, threshold: Any = np.nan) -> str: - """Format numpy arrays. - - Args: - value: Array to format. - threshold: Threshold at which to show ellipsis - - Returns: - The string representation of the numpy array. - """ - with np.printoptions(threshold=3, edgeitems=threshold): - return_value = str(value) - - return return_value - - @list_args def fmt(value: Any) -> str: """Format any value. diff --git a/src/ydata_profiling/report/structure/report.py b/src/ydata_profiling/report/structure/report.py index 482b410b2..64bec9fd8 100644 --- a/src/ydata_profiling/report/structure/report.py +++ b/src/ydata_profiling/report/structure/report.py @@ -1,5 +1,5 @@ import os -from typing import List, Sequence +from typing import Callable, Dict, List, Sequence import pandas as pd from tqdm.auto import tqdm @@ -7,7 +7,6 @@ from ydata_profiling.config import Settings from ydata_profiling.model import BaseDescription from ydata_profiling.model.alerts import AlertType -from ydata_profiling.model.handler import get_render_map from ydata_profiling.report.presentation.core import ( HTML, Collapse, @@ -24,6 +23,30 @@ from ydata_profiling.utils.dataframe import slugify +def get_render_map() -> Dict[str, Callable]: + """Create mapping from data types to rendering functions. + + Returns: + Dictionary mapping data type names to their respective render functions + """ + import ydata_profiling.report.structure.variables as render_algorithms + + return { + "Boolean": render_algorithms.render_boolean, + "Numeric": render_algorithms.render_real, + "Complex": render_algorithms.render_complex, + "Text": render_algorithms.render_text, + "DateTime": render_algorithms.render_date, + "Categorical": render_algorithms.render_categorical, + "URL": render_algorithms.render_url, + "Path": render_algorithms.render_path, + "File": render_algorithms.render_file, + "Image": render_algorithms.render_image, + "Unsupported": render_algorithms.render_generic, + "TimeSeries": render_algorithms.render_timeseries, + } + + def get_missing_items(config: Settings, summary: BaseDescription) -> list: """Return the missing diagrams diff --git a/src/ydata_profiling/utils/formatters.py b/src/ydata_profiling/utils/formatters.py new file mode 100644 index 000000000..1349701f4 --- /dev/null +++ b/src/ydata_profiling/utils/formatters.py @@ -0,0 +1,98 @@ +"""Basic formatting utility functions.""" +from typing import Any, Callable + +import numpy as np +import pandas as pd + + +def list_args(func: Callable) -> Callable: + """Extend the function to allow taking a list as the first argument, and apply the function on each of the elements. + + Args: + func: the function to extend + + Returns: + The extended function + """ + + def inner(arg: Any, *args: Any, **kwargs: Any) -> Any: + if isinstance(arg, list): + return [func(v, *args, **kwargs) for v in arg] + + return func(arg, *args, **kwargs) + + return inner + + +@list_args +def fmt_percent(value: float, edge_cases: bool = True) -> str: + """Format a ratio as a percentage. + + Args: + edge_cases: Check for edge cases? + value: The ratio. + + Returns: + The percentage with 1 point precision. + """ + if edge_cases and round(value, 3) == 0 and value > 0: + return "< 0.1%" + if edge_cases and round(value, 3) == 1 and value < 1: + return "> 99.9%" + + return f"{value*100:2.1f}%" + + +@list_args +def fmt_numeric(value: float, precision: int = 10) -> str: + """Format any numeric value. + + Args: + value: The numeric value to format. + precision: The numeric precision + + Returns: + The numeric value with the given precision. + """ + if value is None: + fmtted = "N/A" + else: + fmtted = f"{{:.{precision}g}}".format(value) + for v in ["e+", "e-"]: + if v in fmtted: + sign = "-" if v in "e-" else "" + fmtted = fmtted.replace(v, " × 10") + "" + fmtted = fmtted.replace("0", "") + fmtted = fmtted.replace("", f"{sign}") + + return fmtted + + +@list_args +def fmt_number(value: int) -> str: + """Format any numeric value. + + Args: + value: The numeric value to format. + + Returns: + The numeric value with the given precision. + """ + return f"{value:n}" + + +@list_args +def fmt_array(value: np.ndarray, threshold: Any = np.nan) -> str: + """Format numpy arrays. + + Args: + value: Array to format. + threshold: Threshold at which to show ellipsis + + Returns: + The string representation of the numpy array. + """ + with np.printoptions(threshold=3, edgeitems=threshold): + return_value = str(value) + + return return_value diff --git a/venv/Dockerfile b/venv/Dockerfile new file mode 100644 index 000000000..9e0a68801 --- /dev/null +++ b/venv/Dockerfile @@ -0,0 +1,19 @@ +FROM python:3.10-slim + +WORKDIR /app + +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + && rm -rf /var/lib/apt/lists/* + +COPY . . + +RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir "setuptools>=72.0.0,<80.0.0" wheel && \ + pip install --no-cache-dir . && \ + pip install --no-cache-dir "setuptools>=72.0.0,<80.0.0" && \ + pip install --no-cache-dir jupyter + +EXPOSE 8888 + +CMD ["jupyter", "notebook", "--ip=0.0.0.0", "--port=8888", "--no-browser", "--allow-root"]