Skip to content

Commit 3fa69a9

Browse files
committed
add spark description dataclass
1 parent 52b7f2f commit 3fa69a9

4 files changed

Lines changed: 129 additions & 0 deletions

File tree

src/ydata_profiling/model/pandas/var_description/counts_pandas.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,4 +67,5 @@ def __init__(self, config: Settings, series: pd.Series):
6767
p_missing=series.isna().sum() / length if length > 0 else 0,
6868
count=length - series.isna().sum(),
6969
memory_size=series.memory_usage(deep=config.memory_deep),
70+
value_counts=None,
7071
)

src/ydata_profiling/model/pandas/var_description/default_pandas.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ def from_var_counts(
6565
n_unique=unique_count,
6666
p_unique=unique_count / _count if _count > 0 else 0,
6767
var_specific=init_dict,
68+
value_counts=var_counts.value_counts,
6869
)
6970

7071

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
from pyspark.sql import DataFrame
2+
3+
from ydata_profiling.config import Settings
4+
from ydata_profiling.model.var_description.counts import VarCounts
5+
6+
7+
class VarCountsSpark(VarCounts):
    """Value counts for a single-column Spark DataFrame (with and without null)."""

    value_counts_without_nan: DataFrame
    """Counts of values in the series without NaN."""
    value_counts_index_sorted: DataFrame
    """Sorted counts of values in the series without NaN."""
    value_counts: DataFrame

    def __init__(self, config: Settings, series: DataFrame):
        """Counts the values in a series (with and without NaN, distinct).

        Args:
            config: report Settings object
            series: single-column Spark DataFrame for which we want to
                calculate the value counts.
        """
        length = series.count()

        # Group identical values together; nulls form their own group.
        value_counts = series.groupBy(series.columns).count()
        value_counts = value_counts.sort("count", ascending=False).persist()
        value_counts_index_sorted = value_counts.sort(series.columns[0], ascending=True)

        # The null group (if present) carries the number of missing values.
        n_missing = value_counts.where(value_counts[series.columns[0]].isNull()).first()
        if n_missing is None:
            n_missing = 0
        else:
            n_missing = n_missing["count"]

        # FIXME: reduce to top-n and bottom-n
        value_counts_index_sorted = (
            value_counts_index_sorted.limit(200)
            .toPandas()
            .set_index(series.columns[0], drop=True)
            .squeeze(axis="columns")
        )

        # this is necessary as freqtables requires value_counts_without_nan
        # to be a pandas series. However, if we try to get everything into
        # pandas we will definitely crash the server
        value_counts_without_nan = (
            value_counts.dropna()
            .limit(200)
            .toPandas()
            .set_index(series.columns[0], drop=True)
            .squeeze(axis="columns")
        )

        # FIXME: This is not correct, but used to fulfil render expectations
        # @chanedwin
        memory_size = 0

        self.value_counts = value_counts
        super().__init__(
            hashable=False,
            value_counts_without_nan=value_counts_without_nan,
            value_counts_index_sorted=value_counts_index_sorted,
            ordering=False,
            n_missing=n_missing,
            n=length,
            # Guard the empty-column case (division by zero), consistent
            # with the pandas implementation of VarCounts.
            p_missing=n_missing / length if length > 0 else 0,
            count=length - n_missing,
            memory_size=memory_size,
            # value_counts was already persisted above; a second persist()
            # call is redundant.
            value_counts=value_counts,
        )
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
from __future__ import annotations
2+
3+
from dataclasses import dataclass
4+
5+
from pyspark.sql import DataFrame
6+
7+
from ydata_profiling.config import Settings
8+
from ydata_profiling.model.spark.var_description.counts_spark import VarCountsSpark
9+
from ydata_profiling.model.var_description.default import VarDescriptionHashable
10+
11+
12+
@dataclass
class VarDescriptionSparkHashable(VarDescriptionHashable):
    """Default description for Spark columns."""

    @classmethod
    def from_var_counts(
        cls, var_counts: VarCountsSpark, init_dict: dict
    ) -> VarDescriptionSparkHashable:
        """Get a default description from a VarCountsSpark object.

        Args:
            var_counts: pre-computed value counts for the column.
            init_dict: type-specific summary values carried through unchanged.

        Returns:
            A VarDescriptionSparkHashable with distinct/unique statistics filled in.
        """
        count = var_counts.count

        # Distinct values = number of groups in the value-counts frame.
        n_distinct = var_counts.value_counts.count()
        p_distinct = n_distinct / count if count > 0 else 0

        # Unique values are those occurring exactly once.
        n_unique = var_counts.value_counts.where("count == 1").count()
        is_unique = n_unique == count
        # Guard the empty-column case, consistent with p_distinct above
        # and with the pandas implementation.
        p_unique = n_unique / count if count > 0 else 0

        return cls(
            n=var_counts.n,
            count=count,
            n_missing=var_counts.n_missing,
            p_missing=var_counts.p_missing,
            hashable=var_counts.hashable,
            memory_size=var_counts.memory_size,
            ordering=var_counts.ordering,
            value_counts_index_sorted=var_counts.value_counts_index_sorted,
            value_counts_without_nan=var_counts.value_counts_without_nan,
            var_specific=init_dict,
            is_unique=is_unique,
            n_unique=n_unique,
            n_distinct=n_distinct,
            p_distinct=p_distinct,
            p_unique=p_unique,
            value_counts=var_counts.value_counts,
        )
49+
50+
51+
def get_default_spark_description(
    config: Settings, series: DataFrame, init_dict: dict
) -> VarDescriptionSparkHashable:
    """Compute the default hashable description for a Spark column.

    Args:
        config: report Settings object
        series: single-column Spark DataFrame to describe.
        init_dict: type-specific summary values carried through unchanged.

    Returns:
        The description built from the column's value counts.
    """
    counts = VarCountsSpark(config, series)
    return VarDescriptionSparkHashable.from_var_counts(counts, init_dict)

0 commit comments

Comments
 (0)