|
4 | 4 | from ydata_profiling.model.var_description.counts import VarCounts |
5 | 5 |
|
6 | 6 |
|
7 | | -class VarCountsSpark(VarCounts): |
8 | | - value_counts_without_nan: DataFrame |
9 | | - """Counts of values in the series without NaN.""" |
10 | | - value_counts_index_sorted: DataFrame |
11 | | - """Sorted counts of values in the series without NaN.""" |
12 | | - value_counts: DataFrame |
13 | | - |
14 | | - def __init__(self, config: Settings, series: DataFrame): |
15 | | - """Counts the values in a series (with and without NaN, distinct). |
16 | | -
|
17 | | - Args: |
18 | | - config: report Settings object |
19 | | - series: Series for which we want to calculate the values. |
20 | | - summary: series' summary |
21 | | -
|
22 | | - Returns: |
23 | | - A dictionary with the count values (with and without NaN, distinct). |
24 | | - """ |
25 | | - length = series.count() |
26 | | - |
27 | | - value_counts = series.groupBy(series.columns).count() |
28 | | - value_counts = value_counts.sort("count", ascending=False).persist() |
29 | | - value_counts_index_sorted = value_counts.sort(series.columns[0], ascending=True) |
30 | | - |
31 | | - n_missing = value_counts.where(value_counts[series.columns[0]].isNull()).first() |
32 | | - if n_missing is None: |
33 | | - n_missing = 0 |
34 | | - else: |
35 | | - n_missing = n_missing["count"] |
36 | | - |
37 | | - # FIXME: reduce to top-n and bottom-n |
38 | | - value_counts_index_sorted = ( |
39 | | - value_counts_index_sorted.limit(200) |
40 | | - .toPandas() |
41 | | - .set_index(series.columns[0], drop=True) |
42 | | - .squeeze(axis="columns") |
43 | | - ) |
44 | | - |
45 | | - # this is necessary as freqtables requires value_counts_without_nan |
46 | | - # to be a pandas series. However, if we try to get everything into |
47 | | - # pandas we will definitely crash the server |
48 | | - value_counts_without_nan = ( |
49 | | - value_counts.dropna() |
50 | | - .limit(200) |
51 | | - .toPandas() |
52 | | - .set_index(series.columns[0], drop=True) |
53 | | - .squeeze(axis="columns") |
54 | | - ) |
55 | | - |
56 | | - # FIXME: This is not correct, but used to fulfil render expectations |
57 | | - # @chanedwin |
58 | | - memory_size = 0 |
59 | | - |
60 | | - self.value_counts = value_counts |
61 | | - super().__init__( |
62 | | - hashable=False, |
63 | | - value_counts_without_nan=value_counts_without_nan, |
64 | | - value_counts_index_sorted=value_counts_index_sorted, |
65 | | - ordering=False, |
66 | | - n_missing=n_missing, |
67 | | - n=length, |
68 | | - p_missing=n_missing / length, |
69 | | - count=length - n_missing, |
70 | | - memory_size=memory_size, |
71 | | - value_counts=value_counts.persist(), |
72 | | - ) |
def get_counts_spark(config: Settings, series: DataFrame) -> VarCounts:
    """Compute a VarCounts summary for a single-column Spark series.

    Args:
        config: Report Settings object (unused here; kept for interface
            parity with the other backends).
        series: Single-column Spark DataFrame whose values are counted.

    Returns:
        A VarCounts with value frequencies (with and without NaN) and
        missing-value statistics for the series.
    """
    length = series.count()

    # Frequency table over the (single) column. Persist it because it is
    # reused below for the index-sorted view, the missing-count lookup,
    # the NaN-free table, and the returned VarCounts.
    value_counts = series.groupBy(series.columns).count()
    value_counts = value_counts.sort("count", ascending=False).persist()
    value_counts_index_sorted = value_counts.sort(series.columns[0], ascending=True)

    # Nulls are grouped into a single row whose key is null; if no such
    # row exists there are no missing values.
    missing_row = value_counts.where(value_counts[series.columns[0]].isNull()).first()
    n_missing = 0 if missing_row is None else missing_row["count"]

    # FIXME: reduce to top-n and bottom-n
    value_counts_index_sorted = (
        value_counts_index_sorted.limit(200)
        .toPandas()
        .set_index(series.columns[0], drop=True)
        .squeeze(axis="columns")
    )

    # This is necessary as freqtables requires value_counts_without_nan
    # to be a pandas series. However, if we try to get everything into
    # pandas we will definitely crash the server.
    value_counts_without_nan = (
        value_counts.dropna()
        .limit(200)
        .toPandas()
        .set_index(series.columns[0], drop=True)
        .squeeze(axis="columns")
    )

    # FIXME: This is not correct, but used to fulfil render expectations
    # @chanedwin
    memory_size = 0

    return VarCounts(
        hashable=False,
        value_counts_without_nan=value_counts_without_nan,
        value_counts_index_sorted=value_counts_index_sorted,
        ordering=False,
        n_missing=n_missing,
        n=length,
        # Guard the division: an empty series would otherwise raise
        # ZeroDivisionError.
        p_missing=n_missing / length if length > 0 else 0,
        count=length - n_missing,
        memory_size=memory_size,
        # Already persisted above; a second .persist() call is redundant.
        value_counts=value_counts,
    )
0 commit comments