Skip to content

Commit 415616c

Browse files
fix: improve profiling code logic (#1728)
* fix: refactor logic for ydata-profiling (pandas, spark summarizer) * fix: update correlations and missing data logic update tests to match the new flows * fix: refactor logic for ydata-profiling (pandas, spark summarizer) * fix: update correlations and missing data logic update tests to match the new flows * chore: update python version for linter Use python 3.12 to validate the linter and docs * fix(linting): code formatting * fix: linter suggested fixes * fix(linting): code formatting * fix: fix linter message to improve if else with exception * fix(linting): code formatting * fix: linter improvements * fix(linting): code formatting * fix: linter fixing tc301 * fix: rm unnecessary unit test * fix(linting): code formatting * fix: spark correlations code * fix(linting): code formatting * fix: spark register for missing_heatmap * fix(linting): code formatting * fix: avoid circular imports * fix(linting): code formatting * fix: correct import * fix: rm import * fix: fix typing and import type * fix: linter fix related with imports * chore: fix typing for certain classes * fix(linting): code formatting * fix(linting): code formatting * fix: fix spark tests to align with the most up-to-date code * fix(linting): code formatting * chore: reverse to python 3.11 * fix: removed wrong import * fix: spark implementation * fix: spark describe implementation to support more recent spark version update spark tests and improve spark implementation * chore: ignore linting errors that are not correct * chore: ignore typing check * chore: ignore unnecessary type checks * chore: linting ignore of imports --------- Co-authored-by: Azory YData Bot <azory@ydata.ai>
1 parent ed566d5 commit 415616c

45 files changed

Lines changed: 706 additions & 596 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

.github/workflows/pull-request.yml

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -37,10 +37,10 @@ jobs:
3737
git config user.name "Azory YData Bot"
3838
git config core.autocrlf false
3939
40-
- name: Set up Python 3.8
40+
- name: Set up Python 3.11
4141
uses: actions/setup-python@v5
4242
with:
43-
python-version: "3.13"
43+
python-version: "3.11"
4444

4545
- uses: actions/cache@v4
4646
name: Cache pip dependencies
@@ -52,9 +52,12 @@ jobs:
5252
5353
- name: Install pip dependencies
5454
run: |
55-
python -m pip install --upgrade pip
55+
python -m pip install --upgrade pip setuptools
5656
python -m pip install ".[dev,test]"
5757
58+
- name: Install pre-commit hooks
59+
run: pre-commit install --install-hooks
60+
5861
- name: Install the package
5962
run: make install
6063

@@ -87,10 +90,10 @@ jobs:
8790
steps:
8891
- uses: actions/checkout@v4
8992

90-
- name: Setup Python
93+
- name: Setup Python 3.11
9194
uses: actions/setup-python@v5
9295
with:
93-
python-version: "3.13"
96+
python-version: "3.11"
9497

9598
- name: Cache pip dependencies
9699
id: cache

pyproject.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,6 @@ test = [
9595
"coverage>=6.5, <8",
9696
"codecov",
9797
"pytest-cov",
98-
"pytest-spark",
9998
"nbval",
10099
"pyarrow",
101100
"twine>=3.1.1",

src/ydata_profiling/model/correlations.py

Lines changed: 50 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44

55
import numpy as np
66
import pandas as pd
7-
from multimethod import multimethod
87

98
from ydata_profiling.config import Settings
109

@@ -14,52 +13,70 @@
1413
from pandas.errors import DataError
1514

1615

16+
class CorrelationBackend:
17+
"""Helper class to select and cache the appropriate correlation backend (Pandas or Spark)."""
18+
19+
def __init__(self, df: Sized):
20+
"""Determine backend once and store it for all correlation computations."""
21+
if isinstance(df, pd.DataFrame):
22+
from ydata_profiling.model.pandas import (
23+
correlations_pandas as correlation_backend, #type: ignore
24+
)
25+
else:
26+
from ydata_profiling.model.spark import (
27+
correlations_spark as correlation_backend, # type: ignore
28+
)
29+
30+
self.backend = correlation_backend
31+
32+
def get_method(self, method_name: str):
33+
"""Retrieve the appropriate correlation method class from the backend."""
34+
if hasattr(self.backend, method_name):
35+
return getattr(self.backend, method_name)
36+
raise AttributeError(
37+
f"Correlation method '{method_name}' is not available in the backend."
38+
)
39+
40+
1741
class Correlation:
18-
@staticmethod
19-
def compute(config: Settings, df: Sized, summary: dict) -> Optional[Sized]:
20-
raise NotImplementedError()
42+
_method_name: str = ""
43+
44+
def compute(
45+
self, config: Settings, df: Sized, summary: dict, backend: CorrelationBackend
46+
) -> Optional[Sized]:
47+
"""Computes correlation using the correct backend (Pandas or Spark)."""
48+
try:
49+
method = backend.get_method(self._method_name)
50+
except AttributeError as ex:
51+
raise NotImplementedError() from ex
52+
else:
53+
return method(config, df, summary)
2154

2255

2356
class Auto(Correlation):
24-
@staticmethod
25-
@multimethod
26-
def compute(config: Settings, df: Sized, summary: dict) -> Optional[Sized]:
27-
raise NotImplementedError()
57+
"""Automatically selects the appropriate correlation method based on the DataFrame type."""
58+
59+
_method_name = "auto_compute"
2860

2961

3062
class Spearman(Correlation):
31-
@staticmethod
32-
@multimethod
33-
def compute(config: Settings, df: Sized, summary: dict) -> Optional[Sized]:
34-
raise NotImplementedError()
63+
_method_name = "spearman_compute"
3564

3665

3766
class Pearson(Correlation):
38-
@staticmethod
39-
@multimethod
40-
def compute(config: Settings, df: Sized, summary: dict) -> Optional[Sized]:
41-
raise NotImplementedError()
67+
_method_name = "pearson_compute"
4268

4369

4470
class Kendall(Correlation):
45-
@staticmethod
46-
@multimethod
47-
def compute(config: Settings, df: Sized, summary: dict) -> Optional[Sized]:
48-
raise NotImplementedError()
71+
_method_name = "kendall_compute"
4972

5073

5174
class Cramers(Correlation):
52-
@staticmethod
53-
@multimethod
54-
def compute(config: Settings, df: Sized, summary: dict) -> Optional[Sized]:
55-
raise NotImplementedError()
75+
_method_name = "cramers_compute"
5676

5777

5878
class PhiK(Correlation):
59-
@staticmethod
60-
@multimethod
61-
def compute(config: Settings, df: Sized, summary: dict) -> Optional[Sized]:
62-
raise NotImplementedError()
79+
_method_name = "phik_compute"
6380

6481

6582
def warn_correlation(correlation_name: str, error: str) -> None:
@@ -88,6 +105,8 @@ def calculate_correlation(
88105
Returns:
89106
The correlation matrices for the given correlation measures. Return None if correlation is empty.
90107
"""
108+
backend = CorrelationBackend(df)
109+
91110
correlation_measures = {
92111
"auto": Auto,
93112
"pearson": Pearson,
@@ -99,16 +118,13 @@ def calculate_correlation(
99118

100119
correlation = None
101120
try:
102-
correlation = correlation_measures[correlation_name].compute(
103-
config, df, summary
121+
correlation = correlation_measures[correlation_name]().compute(
122+
config, df, summary, backend
104123
)
105124
except (ValueError, AssertionError, TypeError, DataError, IndexError) as e:
106125
warn_correlation(correlation_name, str(e))
107126

108-
if correlation is not None and len(correlation) <= 0:
109-
correlation = None
110-
111-
return correlation
127+
return correlation if correlation is not None and len(correlation) > 0 else None
112128

113129

114130
def perform_check_correlation(
Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,34 @@
1+
import importlib
12
from typing import Any
23

3-
from multimethod import multimethod
4+
import pandas as pd
45

5-
from ydata_profiling.config import Settings
6+
spec = importlib.util.find_spec("pyspark")
7+
if spec is None:
8+
from typing import TypeVar
69

10+
sparkDataFrame = TypeVar("sparkDataFrame")
11+
else:
12+
from pyspark.sql import DataFrame as sparkDataFrame # type: ignore
13+
from ydata_profiling.model.spark.dataframe_spark import spark_preprocess
714

8-
@multimethod
9-
def check_dataframe(df: Any) -> None:
10-
raise NotImplementedError()
15+
from ydata_profiling.config import Settings
16+
from ydata_profiling.model.pandas.dataframe_pandas import pandas_preprocess
1117

1218

13-
@multimethod
1419
def preprocess(config: Settings, df: Any) -> Any:
20+
"""
21+
Search for invalid columns datatypes as well as ensures column names follow the expected rules
22+
Args:
23+
config: ydataprofiling Settings class
24+
df: a pandas or spark dataframe
25+
26+
Returns: a pandas or spark dataframe
27+
"""
28+
if isinstance(df, pd.DataFrame):
29+
df = pandas_preprocess(config=config, df=df)
30+
elif isinstance(df, sparkDataFrame): # type: ignore
31+
df = spark_preprocess(config=config, df=df)
32+
else:
33+
return NotImplementedError()
1534
return df

src/ydata_profiling/model/describe.py

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"""Organize the calculation of statistics for each series in this DataFrame."""
22
from datetime import datetime
3-
from typing import Any, Dict, Optional
3+
from typing import Any, Dict, Optional, Union
44

55
import pandas as pd
66
from tqdm.auto import tqdm
@@ -13,7 +13,7 @@
1313
calculate_correlation,
1414
get_active_correlations,
1515
)
16-
from ydata_profiling.model.dataframe import check_dataframe, preprocess
16+
from ydata_profiling.model.dataframe import preprocess
1717
from ydata_profiling.model.description import TimeIndexAnalysis
1818
from ydata_profiling.model.duplicates import get_duplicates
1919
from ydata_profiling.model.missing import get_missing_active, get_missing_diagram
@@ -29,11 +29,11 @@
2929

3030
def describe(
3131
config: Settings,
32-
df: pd.DataFrame,
32+
df: Union[pd.DataFrame, "pyspark.sql.DataFrame"], # type: ignore
3333
summarizer: BaseSummarizer,
3434
typeset: VisionsTypeset,
3535
sample: Optional[dict] = None,
36-
) -> BaseDescription:
36+
) -> BaseDescription: # noqa: TC301
3737
"""Calculate the statistics for each series in this DataFrame.
3838
3939
Args:
@@ -52,11 +52,26 @@ def describe(
5252
- alerts: direct special attention to these patterns in your data.
5353
- package: package details.
5454
"""
55+
# ** Validate Input types **
56+
if not isinstance(config, Settings):
57+
raise TypeError(f"`config` must be of type `Settings`, got {type(config)}")
58+
59+
# Validate df input type
60+
61+
if not isinstance(df, pd.DataFrame):
62+
try:
63+
from pyspark.sql import DataFrame as SparkDataFrame # type: ignore
64+
65+
if not isinstance(df, SparkDataFrame): # noqa: TC301
66+
raise TypeError( # noqa: TC301
67+
f"`df` must be either a `pandas.DataFrame` or a `pyspark.sql.DataFrame`, but got {type(df)}."
68+
)
69+
except ImportError as ex:
70+
raise TypeError(
71+
f"`df must be either a `pandas.DataFrame` or a `pyspark.sql.DataFrame`, but got {type(df)}."
72+
f"If using Spark, make sure PySpark is installed."
73+
) from ex
5574

56-
if df is None:
57-
raise ValueError("Can not describe a `lazy` ProfileReport without a DataFrame.")
58-
59-
check_dataframe(df)
6075
df = preprocess(config, df)
6176

6277
number_of_tasks = 5

src/ydata_profiling/model/handler.py

Lines changed: 15 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1-
from functools import reduce
1+
"""
2+
Auxiliary handler methods for data summary extraction
3+
"""
24
from typing import Any, Callable, Dict, List, Sequence
35

46
import networkx as nx
@@ -7,22 +9,19 @@
79

810
def compose(functions: Sequence[Callable]) -> Callable:
911
"""
10-
Compose a sequence of functions
12+
Compose a sequence of functions.
13+
1114
:param functions: sequence of functions
12-
:return: combined functions, e.g. [f(x), g(x)] -> g(f(x))
15+
:return: combined function applying all functions in order.
1316
"""
1417

15-
def func(f: Callable, g: Callable) -> Callable:
16-
def func2(*x) -> Any:
17-
res = g(*x)
18-
if type(res) == bool:
19-
return f(*x)
20-
else:
21-
return f(*res)
22-
23-
return func2
18+
def composed_function(*args):
19+
result = args # Start with the input arguments
20+
for func in functions:
21+
result = func(*result) if isinstance(result, tuple) else func(result)
22+
return result
2423

25-
return reduce(func, reversed(functions), lambda *x: x)
24+
return composed_function
2625

2726

2827
class Handler:
@@ -40,7 +39,6 @@ def __init__(
4039
):
4140
self.mapping = mapping
4241
self.typeset = typeset
43-
4442
self._complete_dag()
4543

4644
def _complete_dag(self) -> None:
@@ -53,13 +51,13 @@ def _complete_dag(self) -> None:
5351

5452
def handle(self, dtype: str, *args, **kwargs) -> dict:
5553
"""
56-
5754
Returns:
58-
object:
55+
object: a tuple containing the config, the dataset series and the summary extracted
5956
"""
6057
funcs = self.mapping.get(dtype, [])
6158
op = compose(funcs)
62-
return op(*args)
59+
summary = op(*args)[-1]
60+
return summary
6361

6462

6563
def get_render_map() -> Dict[str, Callable]:

0 commit comments

Comments (0)