Skip to content

Commit b8069d8

Browse files
committed
feat: add spark support
1 parent d91400f commit b8069d8

19 files changed

Lines changed: 110 additions & 196 deletions

src/ydata_profiling/model/alerts.py

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -170,7 +170,10 @@ def __init__(
170170
):
171171
super().__init__(
172172
alert_type=AlertType.CONSTANT,
173-
values={"n_distinct": values["n_distinct"]},
173+
values={
174+
"n_distinct": values["n_distinct"],
175+
"value_counts_without_nan": values.value_counts_without_nan,
176+
},
174177
column_name=column_name,
175178
is_empty=is_empty,
176179
)

src/ydata_profiling/model/pandas/__init__.py

Lines changed: 0 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -3,10 +3,8 @@
33
dataframe_pandas,
44
describe_boolean_pandas,
55
describe_categorical_pandas,
6-
describe_counts_pandas,
76
describe_date_pandas,
87
describe_file_pandas,
9-
describe_generic_pandas,
108
describe_image_pandas,
119
describe_numeric_pandas,
1210
describe_path_pandas,
@@ -27,10 +25,8 @@
2725
"dataframe_pandas",
2826
"describe_boolean_pandas",
2927
"describe_categorical_pandas",
30-
"describe_counts_pandas",
3128
"describe_date_pandas",
3229
"describe_file_pandas",
33-
"describe_generic_pandas",
3430
"describe_image_pandas",
3531
"describe_numeric_pandas",
3632
"describe_path_pandas",

src/ydata_profiling/model/pandas/describe_counts_pandas.py

Lines changed: 0 additions & 64 deletions
This file was deleted.

src/ydata_profiling/model/pandas/describe_date_pandas.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -30,7 +30,7 @@ def pandas_describe_date_1d(
3030
Returns:
3131
A dict containing calculated series description values.
3232
"""
33-
if summary["value_counts_without_nan"].empty:
33+
if summary.value_counts_without_nan.empty:
3434
values = series.values
3535
summary.update(
3636
{

src/ydata_profiling/model/pandas/describe_generic_pandas.py

Lines changed: 0 additions & 37 deletions
This file was deleted.

src/ydata_profiling/model/spark/__init__.py

Lines changed: 0 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -3,9 +3,7 @@
33
dataframe_spark,
44
describe_boolean_spark,
55
describe_categorical_spark,
6-
describe_counts_spark,
76
describe_date_spark,
8-
describe_generic_spark,
97
describe_numeric_spark,
108
describe_supported_spark,
119
duplicates_spark,
@@ -21,9 +19,7 @@
2119
"dataframe_spark",
2220
"describe_boolean_spark",
2321
"describe_categorical_spark",
24-
"describe_counts_spark",
2522
"describe_date_spark",
26-
"describe_generic_spark",
2723
"describe_numeric_spark",
2824
"describe_supported_spark",
2925
"duplicates_spark",

src/ydata_profiling/model/spark/correlations_spark.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
"""Correlations between variables."""
2+
23
from typing import Optional
34

45
import pandas as pd

src/ydata_profiling/model/spark/describe_boolean_spark.py

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -4,12 +4,13 @@
44

55
from ydata_profiling.config import Settings
66
from ydata_profiling.model.summary_algorithms import describe_boolean_1d
7+
from ydata_profiling.model.var_description.default import VarDescription
78

89

910
@describe_boolean_1d.register
1011
def describe_boolean_1d_spark(
11-
config: Settings, df: DataFrame, summary: dict
12-
) -> Tuple[Settings, DataFrame, dict]:
12+
config: Settings, df: DataFrame, summary: VarDescription
13+
) -> Tuple[Settings, DataFrame, VarDescription]:
1314
"""Describe a boolean series.
1415
1516
Args:
@@ -20,7 +21,7 @@ def describe_boolean_1d_spark(
2021
A dict containing calculated series description values.
2122
"""
2223

23-
value_counts = summary["value_counts"]
24+
value_counts = summary.value_counts
2425

2526
# get the most common boolean value and its frequency
2627
top = value_counts.first()

src/ydata_profiling/model/spark/describe_categorical_spark.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4,12 +4,13 @@
44

55
from ydata_profiling.config import Settings
66
from ydata_profiling.model.summary_algorithms import describe_categorical_1d
7+
from ydata_profiling.model.var_description.default import VarDescription
78

89

910
@describe_categorical_1d.register
1011
def describe_categorical_1d_spark(
11-
config: Settings, df: DataFrame, summary: dict
12-
) -> Tuple[Settings, DataFrame, dict]:
12+
config: Settings, df: DataFrame, summary: VarDescription
13+
) -> Tuple[Settings, DataFrame, VarDescription]:
1314
"""Describe a categorical series.
1415
1516
Args:

src/ydata_profiling/model/spark/describe_date_spark.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,7 @@
66

77
from ydata_profiling.config import Settings
88
from ydata_profiling.model.summary_algorithms import describe_date_1d
9+
from ydata_profiling.model.var_description.default import VarDescription
910

1011

1112
def date_stats_spark(df: DataFrame, summary: dict) -> dict:
@@ -21,8 +22,8 @@ def date_stats_spark(df: DataFrame, summary: dict) -> dict:
2122

2223
@describe_date_1d.register
2324
def describe_date_1d_spark(
24-
config: Settings, df: DataFrame, summary: dict
25-
) -> Tuple[Settings, DataFrame, dict]:
25+
config: Settings, df: DataFrame, summary: VarDescription
26+
) -> Tuple[Settings, DataFrame, VarDescription]:
2627
"""Describe a date series.
2728
2829
Args:

0 commit comments

Comments
 (0)