Skip to content

Commit 3f126b0

Browse files
Fabiana Clemente (fabclmnt)
authored and committed
chore: update metrics validation
1 parent 5bbd589 commit 3f126b0

3 files changed

Lines changed: 48 additions & 7 deletions

File tree

src/ydata_profiling/profile_report.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,7 @@ def __initialize_dataframe(
199199
) -> Optional[Union[pd.DataFrame, sDataFrame]]:
200200

201201
logger.info_def_report(
202-
dataframe=type(df), timeseries=report_config.vars.timeseries.active
202+
df=df, timeseries=report_config.vars.timeseries.active,
203203
)
204204

205205
if (

src/ydata_profiling/utils/common.py

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from pathlib import Path
1313
from typing import Mapping
1414

15+
import pandas as pd
1516
import requests
1617

1718
from ydata_profiling.version import __version__
@@ -98,7 +99,12 @@ def convert_timestamp_to_datetime(timestamp: int) -> datetime:
9899
return datetime(1970, 1, 1) + timedelta(seconds=int(timestamp))
99100

100101

101-
def analytics_features(dataframe: str, datatype: str, report_type: str) -> None:
102+
def analytics_features(dataframe: str,
103+
datatype: str,
104+
report_type: str,
105+
ncols: int,
106+
nrows:int,
107+
dbx: str) -> None:
102108
endpoint = "https://packages.ydata.ai/ydata-profiling?"
103109
package_version = __version__
104110

@@ -120,9 +126,36 @@ def analytics_features(dataframe: str, datatype: str, report_type: str) -> None:
120126
f"&python_version={python_version}"
121127
f"&report_type={report_type}"
122128
f"&dataframe={dataframe}"
129+
f"&ncols={ncols}"
130+
f"&nrows={nrows}"
123131
f"&datatype={datatype}"
124132
f"&os={platform.system()}"
125133
f"&gpu={str(gpu_present)}"
134+
f"&dbx={dbx}"
126135
)
127136

128137
requests.get(request_message)
138+
139+
def is_running_in_databricks() -> str:
    """Detect whether the code is executing inside a Databricks runtime.

    Databricks clusters set the ``DATABRICKS_RUNTIME_VERSION`` environment
    variable; its value is forwarded to the analytics endpoint.

    Returns:
        str: the Databricks runtime version string when running on
        Databricks, otherwise the string ``"False"``.
    """
    # Single env lookup; the string default keeps the analytics payload
    # uniform (the original returned str(False) == "False" off-Databricks).
    return os.environ.get("DATABRICKS_RUNTIME_VERSION", "False")
145+
146+
def calculate_nrows(df):
    """Approximate the number of rows of a Spark dataframe.

    Counts the rows of the first partition only and extrapolates by the
    number of partitions, which is far cheaper than a full ``count()``.

    Args:
        df: a Spark dataframe (anything exposing an ``rdd`` attribute with
            ``getNumPartitions`` and ``mapPartitionsWithIndex``).

    Returns:
        int: approximate number of rows, or 0 when it could not be computed.
    """
    try:
        n_partitions = df.rdd.getNumPartitions()

        # Count rows in partition 0 only; every other partition contributes 0.
        first_partition_count = df.rdd.mapPartitionsWithIndex(
            lambda idx, partition: [sum(1 for _ in partition)] if idx == 0 else [0]
        ).collect()[0]
        nrows = first_partition_count * n_partitions
    except Exception:
        # Best-effort estimate for analytics only: never let a failure here
        # (missing rdd, empty dataframe, Spark error) break profiling.
        nrows = 0

    return nrows

src/ydata_profiling/utils/logger.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,28 +6,36 @@
66

77
import pandas as pd
88

9-
from ydata_profiling.utils.common import analytics_features
9+
from ydata_profiling.utils.common import (calculate_nrows,
10+
analytics_features,
11+
is_running_in_databricks)
1012

1113

1214
class ProfilingLogger(logging.Logger):
1315
def __init__(self, name: str, level: int = logging.INFO):
    """Create a profiling logger; delegates construction to ``logging.Logger``."""
    super().__init__(name, level)
1517

16-
def info_def_report(self, dataframe, timeseries: bool) -> None: # noqa: ANN001
17-
if isinstance(dataframe, pd.DataFrame):
18+
def info_def_report(self, df, timeseries: bool) -> None: # noqa: ANN001
19+
ncols = len(df.columns)
20+
if isinstance(df, pd.DataFrame):
1821
dataframe = "pandas"
1922
report_type = "regular"
20-
elif dataframe is None:
23+
nrows=len(df)
24+
elif df is None:
2125
dataframe = "pandas"
2226
report_type = "compare"
27+
nrows=len(df)
2328
else:
2429
dataframe = "spark"
2530
report_type = "regular"
31+
nrows=calculate_nrows(df)
2632

33+
dbx=is_running_in_databricks()
2734
datatype = "timeseries" if timeseries else "tabular"
2835

2936
analytics_features(
30-
dataframe=dataframe, datatype=datatype, report_type=report_type
37+
dataframe=dataframe, datatype=datatype, report_type=report_type,
38+
nrows=nrows, ncols=ncols, dbx=dbx
3139
)
3240

3341
super().info(

0 commit comments

Comments
 (0)