Skip to content

Commit 7fb4fc5

Browse files
alexbarrosaquemy
authored and committed
fix: gap identification and improve gap statistics and visualization (#1423)
* fix: wording on time index tab * fix: improve gap identification * fix: add minutes and hours as intervals * fix: add the frequency to the stats tab * fix: tests * fix: linter issues * docs: added gap analysis info to the ts doc * feat: humanize timespan outputs * fix: parameter order * fix: std failing with only one gap * fix: remove unused import * fix: adjust the doc msg
1 parent 5bab571 commit 7fb4fc5

8 files changed

Lines changed: 64 additions & 71 deletions

File tree

docsrc/source/index.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
pages/use_cases/profiling_databases
2323
pages/use_cases/metadata
2424
pages/use_cases/custom_report_appearance
25+
pages/use_cases/time_series_datasets
2526

2627
.. toctree::
2728
:maxdepth: 3

docsrc/source/pages/use_cases/time_series_datasets.rst

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@
22
Time-Series data
33
==================
44

5-
``pandas-profiling`` can be used for a quick Exploratory Data Analysis on time-series data. This is useful for a quick understading on the behaviour of time dependent variables regarding behaviours such as time plots, seasonality, trends and stationarity.
5+
``ydata-profiling`` can be used for a quick Exploratory Data Analysis on time-series data. This is useful for a quick understanding of the behaviour of time-dependent variables regarding aspects such as time plots, seasonality, trends, stationarity and data gaps.
66

7-
Combined with the profiling reports compare, you're able to compare the evolution and data behaviour through time, in terms of time-series specific statistics such as PACF and ACF plots.
7+
Combined with the profiling reports compare, you're able to compare the evolution and data behaviour through time, in terms of time-series specific statistics such as PACF and ACF plots. It also provides the identification of gaps in the time series, caused either by missing values or by entries missing in the time index.
88

99
The following syntax can be used to generate a profile under the assumption that the dataset includes time dependent features:
1010

@@ -60,8 +60,6 @@ In some cases you might be already aware of what variables are expected to be ti
6060
"NO2 1st Max Value": "timeseries",
6161
"NO2 1st Max Hour": "timeseries",
6262
"NO2 AQI": "timeseries",
63-
"cos": "numeric",
64-
"cat": "numeric",
6563
}
6664
6765
profile = ProfileReport(

src/ydata_profiling/model/pandas/describe_timeseries_pandas.py

Lines changed: 25 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
from statsmodels.tsa.stattools import adfuller
88

99
from ydata_profiling.config import Settings
10-
from ydata_profiling.model.pandas.utils_pandas import get_period_and_frequency
1110
from ydata_profiling.model.summary_algorithms import (
1211
describe_numeric_1d,
1312
describe_timeseries_1d,
@@ -142,6 +141,25 @@ def get_fft_peaks(
142141
return threshold, orig_peaks, peaks
143142

144143

144+
def identify_gaps(
    gap: pd.Series, is_datetime: bool, gap_tolerance: int = 2
) -> Tuple[pd.Series, list]:
    """Find the unusually large jumps between consecutive values of a series.

    A jump counts as a gap when the difference between two consecutive
    values is strictly greater than ``gap_tolerance`` times the mean of all
    strictly positive consecutive differences.

    Args:
        gap: Values to inspect (timestamps or numbers), in the order in
            which consecutive differences should be taken.
        is_datetime: Whether the differences are time deltas; selects the
            zero used to filter out non-positive differences.
        gap_tolerance: Multiplier applied to the mean positive difference
            to obtain the minimum size of a gap.

    Returns:
        A tuple ``(gap_stats, gaps)`` where ``gap_stats`` is the series of
        differences that qualify as gaps, and ``gaps`` is a list with one
        two-element array ``[value_before, value_after]`` per gap.
    """
    zero = pd.Timedelta(0) if is_datetime else 0
    diff = gap.diff()

    # Only strictly positive steps take part in the baseline step size.
    # When there are none, the mean is NaN/NaT and every comparison below is
    # False, so no gap is reported.
    non_zero_diff = diff[diff > zero]
    min_gap_size = gap_tolerance * non_zero_diff.mean()

    gap_stats = non_zero_diff[non_zero_diff > min_gap_size]

    # Work with positions rather than index labels so the lookup of the
    # previous element does not depend on `gap` having a 0..n-1 index.
    # Position 0 can never be flagged because its difference is missing.
    positions = (diff > min_gap_size).to_numpy().nonzero()[0]
    gaps = [gap.iloc[[pos - 1, pos]].values for pos in positions]

    return gap_stats, gaps
161+
162+
145163
def compute_gap_stats(series: pd.Series) -> pd.Series:
146164
"""Computes the intertevals in the series normalized by the period.
147165
@@ -157,31 +175,17 @@ def compute_gap_stats(series: pd.Series) -> pd.Series:
157175
gap = gap.reset_index()[index_name]
158176
gap.index.name = None
159177

160-
if isinstance(series.index, pd.DatetimeIndex):
161-
period, frequency = get_period_and_frequency(series.index)
162-
period = pd.Timedelta(f"{period} {frequency}")
163-
base_frequency = pd.Timedelta(f"1 {frequency}")
164-
else:
165-
period = np.abs(np.diff(series.index)).mean()
166-
base_frequency = 1
167-
168-
diff = gap.diff()
169-
anchors = gap[diff > 2 * period].index
170-
gaps = []
171-
for i in anchors:
172-
gaps.append(gap.loc[gap.index[[i - 1, i]]].values)
178+
is_datetime = isinstance(series.index, pd.DatetimeIndex)
179+
gap_stats, gaps = identify_gaps(gap, is_datetime)
173180

174181
stats = {
175-
"period": period / base_frequency,
176-
"min": diff.min() / base_frequency,
177-
"max": diff.max() / base_frequency,
178-
"mean": diff.mean() / base_frequency,
179-
"std": diff.std() / base_frequency,
182+
"min": gap_stats.min(),
183+
"max": gap_stats.max(),
184+
"mean": gap_stats.mean(),
185+
"std": gap_stats.std() if len(gap_stats) > 1 else 0,
180186
"series": series,
181187
"gaps": gaps,
182188
}
183-
if isinstance(series.index, pd.DatetimeIndex):
184-
stats["frequency"] = frequency
185189
return stats
186190

187191

src/ydata_profiling/model/pandas/timeseries_index_pandas.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
from pandas.api.types import is_numeric_dtype
55

66
from ydata_profiling.config import Settings
7-
from ydata_profiling.model.pandas.utils_pandas import get_period_and_frequency
87
from ydata_profiling.model.timeseries_index import get_time_index_description
98

109

@@ -21,17 +20,14 @@ def pandas_get_time_index_description(
2120
length = table_stats["n"]
2221
start = df.index.min()
2322
end = df.index.max()
23+
period = abs(np.diff(df.index)).mean()
2424
if isinstance(df.index, pd.DatetimeIndex):
25-
period, freq = get_period_and_frequency(df.index)
26-
else:
27-
freq = None
28-
period = abs(np.diff(df.index)).mean()
25+
period = pd.Timedelta(period)
2926

3027
return {
3128
"n_series": n_series,
3229
"length": length,
3330
"start": start,
3431
"end": end,
35-
"frequency": freq,
3632
"period": period,
3733
}
Lines changed: 0 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,4 @@
1-
from typing import Tuple
2-
31
import numpy as np
4-
import pandas as pd
52

63

74
def weighted_median(data: np.ndarray, weights: np.ndarray) -> int:
@@ -28,21 +25,3 @@ def weighted_median(data: np.ndarray, weights: np.ndarray) -> int:
2825
else:
2926
w_median = s_data[idx + 1]
3027
return w_median
31-
32-
33-
def get_period_and_frequency(index: pd.DatetimeIndex) -> Tuple[float, str]:
34-
delta = abs(np.diff(index)).mean()
35-
delta = pd.Timedelta(delta)
36-
if delta.days > 0:
37-
frequency = "Days"
38-
period = delta / pd.Timedelta(days=1)
39-
elif delta.seconds > 0:
40-
frequency = "Seconds"
41-
period = delta / pd.Timedelta(seconds=1)
42-
elif delta.microseconds > 0:
43-
frequency = "Microseconds"
44-
period = delta / pd.Timedelta(microseconds=1)
45-
else:
46-
frequency = "Nanoseconds"
47-
period = delta.nanoseconds / pd.Timedelta(nanoseconds=1)
48-
return period, frequency

src/ydata_profiling/report/formatters.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from typing import Any, Callable, Dict, List, Optional, Union
77

88
import numpy as np
9+
import pandas as pd
910
from markupsafe import escape
1011

1112

@@ -218,6 +219,20 @@ def pluralize(count: Any, singular: str, plural: Optional[str] = None) -> str:
218219
return concatenate(result)
219220

220221

222+
def fmt_timespan_timedelta(
    delta: Any, detailed: bool = False, max_units: int = 3, precision: int = 10
) -> str:
    """Format a time span given either as a ``pd.Timedelta`` or as a number.

    Args:
        delta: The value to format. ``pd.Timedelta`` instances are converted
            to seconds and passed to ``fmt_timespan``; anything else is
            passed to ``fmt_numeric``.
        detailed: Forwarded to ``fmt_timespan`` (timedelta input only).
        max_units: Forwarded to ``fmt_timespan`` (timedelta input only).
        precision: Forwarded to ``fmt_numeric`` (non-timedelta input only).

    Returns:
        The formatted string.
    """
    if isinstance(delta, pd.Timedelta):
        # total_seconds() already carries the microsecond and nanosecond
        # components as a fraction; adding them again would double-count
        # the sub-second part of the duration.
        return fmt_timespan(delta.total_seconds(), detailed, max_units)
    return fmt_numeric(delta, precision)
234+
235+
221236
@list_args
222237
def fmt_numeric(value: float, precision: int = 10) -> str:
223238
"""Format any numeric value.

src/ydata_profiling/report/structure/overview.py

Lines changed: 4 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
fmt_numeric,
1414
fmt_percent,
1515
fmt_timespan,
16+
fmt_timespan_timedelta,
1617
list_args,
1718
)
1819
from ydata_profiling.report.presentation.core import Alerts, Container
@@ -298,18 +299,10 @@ def format_tsindex_limit(limit: Any) -> str:
298299
},
299300
{
300301
"name": "Period",
301-
"value": fmt_number(summary.time_index_analysis.period),
302+
"value": fmt_timespan_timedelta(summary.time_index_analysis.period),
302303
},
303304
]
304305

305-
if summary.time_index_analysis.frequency:
306-
table_stats.append(
307-
{
308-
"name": "Frequency",
309-
"value": summary.time_index_analysis.frequency,
310-
}
311-
)
312-
313306
ts_info = Table(table_stats, name="Timeseries statistics", style=config.html.style)
314307

315308
dpi_bak = config.plot.dpi
@@ -318,14 +311,14 @@ def format_tsindex_limit(limit: Any) -> str:
318311
plot_overview_timeseries(config, summary.variables),
319312
image_format=config.plot.image_format,
320313
alt="ts_plot",
321-
name="original",
314+
name="Original",
322315
anchor_id="ts_plot_overview",
323316
)
324317
timeseries_scaled = ImageWidget(
325318
plot_overview_timeseries(config, summary.variables, scale=True),
326319
image_format=config.plot.image_format,
327320
alt="ts_plot_scaled",
328-
name="scaled",
321+
name="Scaled",
329322
anchor_id="ts_plot_scaled_overview",
330323
)
331324
config.plot.dpi = dpi_bak

src/ydata_profiling/report/structure/variables/render_timeseries.py

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
fmt_monotonic,
66
fmt_numeric,
77
fmt_percent,
8+
fmt_timespan_timedelta,
89
)
910
from ydata_profiling.report.presentation.core import (
1011
Container,
@@ -25,34 +26,40 @@
2526
def _render_gap_tab(config: Settings, summary: dict) -> Container:
2627
gap_stats = [
2728
{
28-
"name": "min inverval",
29+
"name": "number of gaps",
2930
"value": fmt_numeric(
31+
len(summary["gap_stats"]["gaps"]), precision=config.report.precision
32+
),
33+
},
34+
{
35+
"name": "min",
36+
"value": fmt_timespan_timedelta(
3037
summary["gap_stats"]["min"], precision=config.report.precision
3138
),
3239
},
3340
{
34-
"name": "max inverval",
35-
"value": fmt_numeric(
41+
"name": "max",
42+
"value": fmt_timespan_timedelta(
3643
summary["gap_stats"]["max"], precision=config.report.precision
3744
),
3845
},
3946
{
40-
"name": "mean inverval",
41-
"value": fmt_numeric(
47+
"name": "mean",
48+
"value": fmt_timespan_timedelta(
4249
summary["gap_stats"]["mean"], precision=config.report.precision
4350
),
4451
},
4552
{
46-
"name": "interval std",
47-
"value": fmt_numeric(
53+
"name": "std",
54+
"value": fmt_timespan_timedelta(
4855
summary["gap_stats"]["std"], precision=config.report.precision
4956
),
5057
},
5158
]
5259

5360
gap_table = Table(
5461
gap_stats,
55-
name="Intervals statistics",
62+
name="Gap statistics",
5663
style=config.html.style,
5764
)
5865

0 commit comments

Comments
 (0)