Skip to content

Commit 415616c

Browse files
fix: improve profiling code logic (#1728)
* fix: refactor logic for ydata-profiling (pandas, spark summarizer) * fix: update correlations and missing data logic update tests to match the new flows * fix: refactor logic for ydata-profiling (pandas, spark summarizer) * fix: update correlations and missing data logic update tests to match the new flows * chore: update python version for linter Use python 3.12 to validate the linter and docs * fix(linting): code formatting * fix: linter suggested fixes * fix(linting): code formatting * fix: fix linter message to improve if else with exception * fix(linting): code formatting * fix: linter improvements * fix(linting): code formatting * fix: linter fixing tc301 * fix: rm unnecessary unit test * fix(linting): code formatting * fix: spark correlations code * fix(linting): code formatting * fix: spark register for missing_heatmap * fix(linting): code formatting * fix: avoid circular imports * fix(linting): code formatting * fix: correct import * fix: rm import * fix: fix typing and import type * fix: linter fix related with imports * chore: fix typing for certain classes * fix(linting): code formatting * fix(linting): code formatting * fix: fix spark tests to align with the most up-to-date code * fix(linting): code formatting * chore: reverse to python 3.11 * fix: removed wrong import * fix: spark implementation * fix: spark describe implementation to support more recent spark version update spark tests and improve spark implementation * chore: ignore linting errors that are not correct * chore: ignore typing check * chore: ignore unnecessary type checks * chore: linting ignore of imports --------- Co-authored-by: Azory YData Bot <azory@ydata.ai>
1 parent ed566d5 commit 415616c

45 files changed

Lines changed: 706 additions & 596 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

.github/workflows/pull-request.yml

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -37,10 +37,10 @@ jobs:
3737
git config user.name "Azory YData Bot"
3838
git config core.autocrlf false
3939
40-
- name: Set up Python 3.8
40+
- name: Set up Python 3.11
4141
uses: actions/setup-python@v5
4242
with:
43-
python-version: "3.13"
43+
python-version: "3.11"
4444

4545
- uses: actions/cache@v4
4646
name: Cache pip dependencies
@@ -52,9 +52,12 @@ jobs:
5252
5353
- name: Install pip dependencies
5454
run: |
55-
python -m pip install --upgrade pip
55+
python -m pip install --upgrade pip setuptools
5656
python -m pip install ".[dev,test]"
5757
58+
- name: Install pre-commit hooks
59+
run: pre-commit install --install-hooks
60+
5861
- name: Install the package
5962
run: make install
6063

@@ -87,10 +90,10 @@ jobs:
8790
steps:
8891
- uses: actions/checkout@v4
8992

90-
- name: Setup Python
93+
- name: Setup Python 3.11
9194
uses: actions/setup-python@v5
9295
with:
93-
python-version: "3.13"
96+
python-version: "3.11"
9497

9598
- name: Cache pip dependencies
9699
id: cache

pyproject.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,6 @@ test = [
9595
"coverage>=6.5, <8",
9696
"codecov",
9797
"pytest-cov",
98-
"pytest-spark",
9998
"nbval",
10099
"pyarrow",
101100
"twine>=3.1.1",

src/ydata_profiling/model/correlations.py

Lines changed: 50 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44

55
import numpy as np
66
import pandas as pd
7-
from multimethod import multimethod
87

98
from ydata_profiling.config import Settings
109

@@ -14,52 +13,70 @@
1413
from pandas.errors import DataError
1514

1615

16+
class CorrelationBackend:
17+
"""Helper class to select and cache the appropriate correlation backend (Pandas or Spark)."""
18+
19+
def __init__(self, df: Sized):
20+
"""Determine backend once and store it for all correlation computations."""
21+
if isinstance(df, pd.DataFrame):
22+
from ydata_profiling.model.pandas import (
23+
correlations_pandas as correlation_backend, #type: ignore
24+
)
25+
else:
26+
from ydata_profiling.model.spark import (
27+
correlations_spark as correlation_backend, # type: ignore
28+
)
29+
30+
self.backend = correlation_backend
31+
32+
def get_method(self, method_name: str):
33+
"""Retrieve the appropriate correlation method class from the backend."""
34+
if hasattr(self.backend, method_name):
35+
return getattr(self.backend, method_name)
36+
raise AttributeError(
37+
f"Correlation method '{method_name}' is not available in the backend."
38+
)
39+
40+
1741
class Correlation:
18-
@staticmethod
19-
def compute(config: Settings, df: Sized, summary: dict) -> Optional[Sized]:
20-
raise NotImplementedError()
42+
_method_name: str = ""
43+
44+
def compute(
45+
self, config: Settings, df: Sized, summary: dict, backend: CorrelationBackend
46+
) -> Optional[Sized]:
47+
"""Computes correlation using the correct backend (Pandas or Spark)."""
48+
try:
49+
method = backend.get_method(self._method_name)
50+
except AttributeError as ex:
51+
raise NotImplementedError() from ex
52+
else:
53+
return method(config, df, summary)
2154

2255

2356
class Auto(Correlation):
24-
@staticmethod
25-
@multimethod
26-
def compute(config: Settings, df: Sized, summary: dict) -> Optional[Sized]:
27-
raise NotImplementedError()
57+
"""Automatically selects the appropriate correlation method based on the DataFrame type."""
58+
59+
_method_name = "auto_compute"
2860

2961

3062
class Spearman(Correlation):
31-
@staticmethod
32-
@multimethod
33-
def compute(config: Settings, df: Sized, summary: dict) -> Optional[Sized]:
34-
raise NotImplementedError()
63+
_method_name = "spearman_compute"
3564

3665

3766
class Pearson(Correlation):
38-
@staticmethod
39-
@multimethod
40-
def compute(config: Settings, df: Sized, summary: dict) -> Optional[Sized]:
41-
raise NotImplementedError()
67+
_method_name = "pearson_compute"
4268

4369

4470
class Kendall(Correlation):
45-
@staticmethod
46-
@multimethod
47-
def compute(config: Settings, df: Sized, summary: dict) -> Optional[Sized]:
48-
raise NotImplementedError()
71+
_method_name = "kendall_compute"
4972

5073

5174
class Cramers(Correlation):
52-
@staticmethod
53-
@multimethod
54-
def compute(config: Settings, df: Sized, summary: dict) -> Optional[Sized]:
55-
raise NotImplementedError()
75+
_method_name = "cramers_compute"
5676

5777

5878
class PhiK(Correlation):
59-
@staticmethod
60-
@multimethod
61-
def compute(config: Settings, df: Sized, summary: dict) -> Optional[Sized]:
62-
raise NotImplementedError()
79+
_method_name = "phik_compute"
6380

6481

6582
def warn_correlation(correlation_name: str, error: str) -> None:
@@ -88,6 +105,8 @@ def calculate_correlation(
88105
Returns:
89106
The correlation matrices for the given correlation measures. Return None if correlation is empty.
90107
"""
108+
backend = CorrelationBackend(df)
109+
91110
correlation_measures = {
92111
"auto": Auto,
93112
"pearson": Pearson,
@@ -99,16 +118,13 @@ def calculate_correlation(
99118

100119
correlation = None
101120
try:
102-
correlation = correlation_measures[correlation_name].compute(
103-
config, df, summary
121+
correlation = correlation_measures[correlation_name]().compute(
122+
config, df, summary, backend
104123
)
105124
except (ValueError, AssertionError, TypeError, DataError, IndexError) as e:
106125
warn_correlation(correlation_name, str(e))
107126

108-
if correlation is not None and len(correlation) <= 0:
109-
correlation = None
110-
111-
return correlation
127+
return correlation if correlation is not None and len(correlation) > 0 else None
112128

113129

114130
def perform_check_correlation(
Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,34 @@
1+
import importlib
12
from typing import Any
23

3-
from multimethod import multimethod
4+
import pandas as pd
45

5-
from ydata_profiling.config import Settings
6+
spec = importlib.util.find_spec("pyspark")
7+
if spec is None:
8+
from typing import TypeVar
69

10+
sparkDataFrame = TypeVar("sparkDataFrame")
11+
else:
12+
from pyspark.sql import DataFrame as sparkDataFrame # type: ignore
13+
from ydata_profiling.model.spark.dataframe_spark import spark_preprocess
714

8-
@multimethod
9-
def check_dataframe(df: Any) -> None:
10-
raise NotImplementedError()
15+
from ydata_profiling.config import Settings
16+
from ydata_profiling.model.pandas.dataframe_pandas import pandas_preprocess
1117

1218

13-
@multimethod
1419
def preprocess(config: Settings, df: Any) -> Any:
20+
"""
21+
Search for invalid columns datatypes as well as ensures column names follow the expected rules
22+
Args:
23+
config: ydataprofiling Settings class
24+
df: a pandas or spark dataframe
25+
26+
Returns: a pandas or spark dataframe
27+
"""
28+
if isinstance(df, pd.DataFrame):
29+
df = pandas_preprocess(config=config, df=df)
30+
elif isinstance(df, sparkDataFrame): # type: ignore
31+
df = spark_preprocess(config=config, df=df)
32+
else:
33+
return NotImplementedError()
1534
return df

src/ydata_profiling/model/describe.py

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"""Organize the calculation of statistics for each series in this DataFrame."""
22
from datetime import datetime
3-
from typing import Any, Dict, Optional
3+
from typing import Any, Dict, Optional, Union
44

55
import pandas as pd
66
from tqdm.auto import tqdm
@@ -13,7 +13,7 @@
1313
calculate_correlation,
1414
get_active_correlations,
1515
)
16-
from ydata_profiling.model.dataframe import check_dataframe, preprocess
16+
from ydata_profiling.model.dataframe import preprocess
1717
from ydata_profiling.model.description import TimeIndexAnalysis
1818
from ydata_profiling.model.duplicates import get_duplicates
1919
from ydata_profiling.model.missing import get_missing_active, get_missing_diagram
@@ -29,11 +29,11 @@
2929

3030
def describe(
3131
config: Settings,
32-
df: pd.DataFrame,
32+
df: Union[pd.DataFrame, "pyspark.sql.DataFrame"], # type: ignore
3333
summarizer: BaseSummarizer,
3434
typeset: VisionsTypeset,
3535
sample: Optional[dict] = None,
36-
) -> BaseDescription:
36+
) -> BaseDescription: # noqa: TC301
3737
"""Calculate the statistics for each series in this DataFrame.
3838
3939
Args:
@@ -52,11 +52,26 @@ def describe(
5252
- alerts: direct special attention to these patterns in your data.
5353
- package: package details.
5454
"""
55+
# ** Validate Input types **
56+
if not isinstance(config, Settings):
57+
raise TypeError(f"`config` must be of type `Settings`, got {type(config)}")
58+
59+
# Validate df input type
60+
61+
if not isinstance(df, pd.DataFrame):
62+
try:
63+
from pyspark.sql import DataFrame as SparkDataFrame # type: ignore
64+
65+
if not isinstance(df, SparkDataFrame): # noqa: TC301
66+
raise TypeError( # noqa: TC301
67+
f"`df` must be either a `pandas.DataFrame` or a `pyspark.sql.DataFrame`, but got {type(df)}."
68+
)
69+
except ImportError as ex:
70+
raise TypeError(
71+
f"`df must be either a `pandas.DataFrame` or a `pyspark.sql.DataFrame`, but got {type(df)}."
72+
f"If using Spark, make sure PySpark is installed."
73+
) from ex
5574

56-
if df is None:
57-
raise ValueError("Can not describe a `lazy` ProfileReport without a DataFrame.")
58-
59-
check_dataframe(df)
6075
df = preprocess(config, df)
6176

6277
number_of_tasks = 5

src/ydata_profiling/model/handler.py

Lines changed: 15 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1-
from functools import reduce
1+
"""
2+
Auxiliary handler methods for data summary extraction
3+
"""
24
from typing import Any, Callable, Dict, List, Sequence
35

46
import networkx as nx
@@ -7,22 +9,19 @@
79

810
def compose(functions: Sequence[Callable]) -> Callable:
911
"""
10-
Compose a sequence of functions
12+
Compose a sequence of functions.
13+
1114
:param functions: sequence of functions
12-
:return: combined functions, e.g. [f(x), g(x)] -> g(f(x))
15+
:return: combined function applying all functions in order.
1316
"""
1417

15-
def func(f: Callable, g: Callable) -> Callable:
16-
def func2(*x) -> Any:
17-
res = g(*x)
18-
if type(res) == bool:
19-
return f(*x)
20-
else:
21-
return f(*res)
22-
23-
return func2
18+
def composed_function(*args):
19+
result = args # Start with the input arguments
20+
for func in functions:
21+
result = func(*result) if isinstance(result, tuple) else func(result)
22+
return result
2423

25-
return reduce(func, reversed(functions), lambda *x: x)
24+
return composed_function
2625

2726

2827
class Handler:
@@ -40,7 +39,6 @@ def __init__(
4039
):
4140
self.mapping = mapping
4241
self.typeset = typeset
43-
4442
self._complete_dag()
4543

4644
def _complete_dag(self) -> None:
@@ -53,13 +51,13 @@ def _complete_dag(self) -> None:
5351

5452
def handle(self, dtype: str, *args, **kwargs) -> dict:
5553
"""
56-
5754
Returns:
58-
object:
55+
object: a tuple containing the config, the dataset series and the summary extracted
5956
"""
6057
funcs = self.mapping.get(dtype, [])
6158
op = compose(funcs)
62-
return op(*args)
59+
summary = op(*args)[-1]
60+
return summary
6361

6462

6563
def get_render_map() -> Dict[str, Callable]:

0 commit comments

Comments (0)