FROM python:3.10-slim

WORKDIR /app

# build-essential supplies a C toolchain for any dependencies that compile
# native extensions during `pip install .`.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    && rm -rf /var/lib/apt/lists/*

COPY . .

# Install the project, then pin setuptools to the supported range.
# NOTE: the pin is applied AFTER `pip install .` because that step may pull a
# setuptools outside the range into the runtime environment; pinning it before
# the install has no effect on the build itself (pip uses PEP 517 build
# isolation), so a single post-install pin replaces the former duplicate pins.
RUN pip install --no-cache-dir --upgrade pip wheel && \
    pip install --no-cache-dir . && \
    pip install --no-cache-dir "setuptools>=72.0.0,<80.0.0" && \
    pip install --no-cache-dir jupyter

EXPOSE 8888

CMD ["jupyter", "notebook", "--ip=0.0.0.0", "--port=8888", "--no-browser", "--allow-root"]
class CorrelationBackend(BaseBackend):
    """Correlation backend selector.

    Declares the dotted paths of the Pandas and Spark correlation modules;
    ``BaseBackend`` resolves and caches the right one based on the DataFrame
    type, and exposes its functions through ``get_method``.
    """

    _pandas_module = "ydata_profiling.model.pandas.correlations_pandas"
    _spark_module = "ydata_profiling.model.spark.correlations_spark"
class MissingDataBackend(BaseBackend):
    """Missing-data backend selector.

    Declares the dotted paths of the Pandas and Spark missing-data modules;
    ``BaseBackend`` resolves and caches the right one based on the DataFrame
    type, and exposes its functions through ``get_method``.
    """

    _pandas_module = "ydata_profiling.model.pandas.missing_pandas"
    _spark_module = "ydata_profiling.model.spark.missing_spark"
b/src/ydata_profiling/model/spark/table_spark.py index 33e862e61..2a2985059 100644 --- a/src/ydata_profiling/model/spark/table_spark.py +++ b/src/ydata_profiling/model/spark/table_spark.py @@ -1,9 +1,7 @@ -from collections import Counter - from pyspark.sql import DataFrame from ydata_profiling.config import Settings -from ydata_profiling.model.table import get_table_stats +from ydata_profiling.model.table import compute_common_table_stats, get_table_stats @get_table_stats.register @@ -21,38 +19,9 @@ def get_table_stats_spark( A dictionary that contains the table statistics. """ n = df.count() + n_var = len(df.columns) - result = {"n": n, "n_var": len(df.columns)} - - table_stats = { - "n_cells_missing": 0, - "n_vars_with_missing": 0, - "n_vars_all_missing": 0, - } - - for series_summary in variable_stats.values(): - if "n_missing" in series_summary and series_summary["n_missing"] > 0: - table_stats["n_vars_with_missing"] += 1 - table_stats["n_cells_missing"] += series_summary["n_missing"] - if series_summary["n_missing"] == n: - table_stats["n_vars_all_missing"] += 1 - - # without this check we'll get a div by zero error - if result["n"] * result["n_var"] > 0: - table_stats["p_cells_missing"] = ( - table_stats["n_cells_missing"] / (result["n"] * result["n_var"]) - if result["n"] > 0 - else 0 - ) - else: - table_stats["p_cells_missing"] = 0 - - result["p_cells_missing"] = table_stats["p_cells_missing"] - result["n_cells_missing"] = table_stats["n_cells_missing"] - result["n_vars_all_missing"] = table_stats["n_vars_all_missing"] - result["n_vars_with_missing"] = table_stats["n_vars_with_missing"] - - # Variable type counts - result["types"] = dict(Counter([v["type"] for v in variable_stats.values()])) + result = {"n": n, "n_var": n_var} + result.update(compute_common_table_stats(n, n_var, variable_stats)) return result diff --git a/src/ydata_profiling/model/spark/timeseries_index_spark.py b/src/ydata_profiling/model/spark/timeseries_index_spark.py index 
def compute_common_table_stats(
    n: int, n_var: int, variable_stats: dict
) -> dict:
    """Compute common table statistics shared by the Pandas and Spark backends.

    Args:
        n: Number of rows in the DataFrame.
        n_var: Number of columns (variables).
        variable_stats: Previously calculated per-series statistics.

    Returns:
        A dictionary with common table statistics: missing-value counts,
        the missing-cell percentage, and variable type counts.
    """
    table_stats = {
        "n_cells_missing": 0,
        "n_vars_with_missing": 0,
        "n_vars_all_missing": 0,
    }

    for series_summary in variable_stats.values():
        # Treat an absent "n_missing" entry as zero missing values.
        n_missing = series_summary.get("n_missing", 0)
        if n_missing > 0:
            table_stats["n_vars_with_missing"] += 1
            table_stats["n_cells_missing"] += n_missing
            if n_missing == n:
                table_stats["n_vars_all_missing"] += 1

    # Guard against division by zero on empty frames (no rows or no columns).
    total_cells = n * n_var
    table_stats["p_cells_missing"] = (
        table_stats["n_cells_missing"] / total_cells if total_cells > 0 else 0
    )

    table_stats["types"] = dict(Counter(v["type"] for v in variable_stats.values()))

    return table_stats
class BaseBackend:
    """Select and cache the Pandas or Spark implementation module.

    Subclasses declare ``_pandas_module`` and ``_spark_module`` as dotted
    module paths; the constructor imports the one matching the DataFrame type,
    and ``get_method`` looks functions up on the imported module.
    """

    _pandas_module: Optional[str] = None
    _spark_module: Optional[str] = None

    def __init__(self, df: Union[pd.DataFrame, Sized]):
        """Determine backend once and store it for all computations.

        Raises:
            ValueError: If the subclass did not configure a module path for
                the backend selected by the type of ``df``.
        """
        is_pandas = isinstance(df, pd.DataFrame)
        module_path = self._pandas_module if is_pandas else self._spark_module

        if module_path is None:
            raise ValueError("Backend module path not configured")

        # Import once up front; all later lookups hit the cached module.
        self.module = importlib.import_module(module_path)
        self.module_path = module_path

    def get_method(self, method_name: str) -> Callable:
        """Retrieve the appropriate function from the backend module.

        Raises:
            AttributeError: If the backend module has no attribute named
                ``method_name``.
        """
        try:
            return getattr(self.module, method_name)
        except AttributeError as ex:
            raise AttributeError(
                f"Function '{method_name}' is not available in {self.module_path}."
            ) from ex