Skip to content

Commit 6d6fee4

Browse files
Feature/better metrics (#86)
1 parent 86cfaf4 commit 6d6fee4

26 files changed

Lines changed: 256 additions & 148 deletions

dbt_project.yml

Lines changed: 28 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -29,28 +29,34 @@ vars:
2929

3030
re_data:store_table_samples: false
3131

32-
re_data:metrics_base:
33-
table:
34-
- row_count
35-
- freshness
36-
37-
column:
38-
numeric:
39-
- min
40-
- max
41-
- avg
42-
- stddev
43-
- variance
44-
- nulls_count
45-
- nulls_percent
46-
text:
47-
- min_length
48-
- max_length
49-
- avg_length
50-
- nulls_count
51-
- missing_count
52-
- nulls_percent
53-
- missing_percent
32+
re_data:metrics_groups:
33+
table_metrics:
34+
table:
35+
- row_count
36+
- freshness
37+
38+
column_metrics:
39+
column:
40+
numeric:
41+
- min
42+
- max
43+
- avg
44+
- stddev
45+
- variance
46+
- nulls_count
47+
- nulls_percent
48+
text:
49+
- min_length
50+
- max_length
51+
- avg_length
52+
- nulls_count
53+
- missing_count
54+
- nulls_percent
55+
- missing_percent
56+
57+
re_data:default_metrics:
58+
- table_metrics
59+
- column_metrics
5460

5561
models:
5662
re_data:

integration_tests/dbt_project.yml

Lines changed: 29 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -18,43 +18,48 @@ sources:
1818
+re_data_time_filter: null
1919

2020
vars:
21+
re_data:store_table_samples: true
2122
re_data:anomaly_detector:
2223
name: modified_z_score
2324
threshold: 0.6
2425
re_data:max_columns_in_query: 1
2526

2627
re_data:select:
27-
- buy_events
28+
- tag:testtag
2829
- sample_table
2930
- sample_without_time_filter
3031
- sample_with_anomaly
3132
- re_data_source_test_table
3233

33-
re_data:metrics_base:
34-
table:
35-
- row_count
36-
- freshness
37-
- my_distinct_table_rows
34+
re_data:metrics_groups:
35+
integration_test_group:
36+
table:
37+
- row_count
38+
- freshness
39+
- my_distinct_table_rows
3840

39-
column:
40-
numeric:
41-
- min
42-
- max
43-
- avg
44-
- stddev
45-
- variance
46-
- nulls_count
47-
- nulls_percent
48-
- diff # my own custom metric
41+
column:
42+
numeric:
43+
- min
44+
- max
45+
- avg
46+
- stddev
47+
- variance
48+
- nulls_count
49+
- nulls_percent
50+
- diff # my own custom metric
4951

50-
text:
51-
- min_length
52-
- max_length
53-
- avg_length
54-
- nulls_count
55-
- nulls_percent
56-
- missing_percent
57-
- missing_count
52+
text:
53+
- min_length
54+
- max_length
55+
- avg_length
56+
- nulls_count
57+
- nulls_percent
58+
- missing_percent
59+
- missing_count
60+
61+
re_data:default_metrics:
62+
- integration_test_group
5863

5964
seeds:
6065
+schema: seeds

integration_tests/models/monitoring/expected_test_history.csv

Lines changed: 0 additions & 8 deletions
This file was deleted.

integration_tests/models/monitoring/schema.yml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,4 +19,9 @@ models:
1919
- name: test_re_data_test_history
2020
tests:
2121
- dbt_utils.equality:
22-
compare_model: ref('expected_test_history')
22+
compare_model: ref('expected_test_history')
23+
24+
- name: test_re_data_table_samples
25+
tests:
26+
- dbt_utils.equality:
27+
compare_model: ref('expected_table_samples')
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
2+
-- For each row of re_data_table_samples, outputs the cleaned table name and the length of its sample_data.
select
3+
{{ clean_table_name('table_name') }} as table_name,
4+
length(sample_data) as sample_data_length
5+
from {{ ref('re_data_table_samples') }}
6+
where {{ clean_table_name('table_name') }} != 'SAMPLE_WITHOUT_TIME_FILTER'
7+
8+
-- SAMPLE_WITHOUT_TIME_FILTER is excluded because this table doesn't have a time filter, so it's not possible to say
9+
-- exactly what the sample of it should look like.

integration_tests/models/transformed/buy_events.sql

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
11
{{
2-
config(re_data_monitored=true, re_data_time_filter='creation_time', materialized='table',
3-
re_data_anomaly_detector={'name': 'z_score', 'threshold': 0.5},)
2+
config(
3+
re_data_monitored=true,
4+
re_data_time_filter='creation_time',
5+
re_data_anomaly_detector={'name': 'z_score', 'threshold': 0.5},
6+
materialized='table',
7+
tags=['testtag']
8+
)
49
}}
510
select *
611
from {{ ref('sample_with_anomaly') }}

integration_tests/python_tests/test_monitoring.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,9 +51,9 @@ def test_monitoring(db, source_schema):
5151
'dbt run --select monitoring.*', db, dbt_vars
5252
)
5353

54-
dbt_test('--select test_re_data_anomalies test_re_data_metrics test_re_data_z_score re_data_metrics transformed', db, dbt_vars)
54+
dbt_test('--select test_re_data_anomalies test_re_data_metrics test_re_data_z_score test_re_data_table_samples re_data_metrics transformed', db, dbt_vars)
5555
# dbt build will "duplicate" saved test result history
56-
dbt_build('--select test_re_data_anomalies test_re_data_metrics test_re_data_z_score re_data_metrics transformed', db, dbt_vars)
56+
dbt_build('--select test_re_data_anomalies test_re_data_metrics test_re_data_z_score test_re_data_table_samples re_data_metrics transformed', db, dbt_vars)
5757

5858
# tests test_history separately, because those are actually added to DB after running
5959
# dbt test command
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
table_name,sample_data_length
2+
BUY_EVENTS,506
3+
RE_DATA_SOURCE_TEST_TABLE,361
4+
SAMPLE_TABLE,830
5+
SAMPLE_WITH_ANOMALY,507
Lines changed: 16 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,35 @@
11
table_name,column_name,test_name,status,message,failures_count,severity
2-
BUY_EVENTS,---,ts_row_count__1,Pass,---,0,ERROR
32
TEST_RE_DATA_Z_SCORE,---,pected_z_score_,Pass,---,0,ERROR
4-
TEST_RE_DATA_METRICS,---,pected_metrics_,Pass,---,0,ERROR
53
TEST_RE_DATA_ANOMALIES,---,cted_anomalies_,Pass,---,0,ERROR
6-
BUY_EVENTS,---,null__freshness,Pass,---,0,ERROR
4+
TEST_RE_DATA_TABLE_SAMPLES,---,_table_samples_,Pass,---,0,ERROR
5+
TEST_RE_DATA_METRICS,---,pected_metrics_,Pass,---,0,ERROR
6+
BUY_EVENTS,---,ts_row_count__1,Pass,---,0,ERROR
77
BUY_EVENTS,VALUE2,alue2__min__200,Pass,---,0,ERROR
8-
BUY_EVENTS,---,0__row_count__0,Pass,---,0,ERROR
8+
BUY_EVENTS,---,null__freshness,Pass,---,0,ERROR
99
BUY_EVENTS,---,_table_rows__10,Pass,---,0,ERROR
10-
BUY_EVENTS,VALUE1,ulls_percent__0,Pass,---,0,ERROR
1110
BUY_EVENTS,VALUE1,alue1__min__100,Pass,---,0,ERROR
12-
BUY_EVENTS,VALUE1,__nulls_percent,Pass,---,0,ERROR
11+
BUY_EVENTS,---,0__row_count__0,Pass,---,0,ERROR
1312
BUY_EVENTS,VALUE1,alue1__min__107,Pass,---,0,ERROR
13+
BUY_EVENTS,VALUE1,ulls_percent__0,Pass,---,0,ERROR
14+
BUY_EVENTS,VALUE1,__nulls_percent,Pass,---,0,ERROR
1415
RE_DATA_METRICS,---,_buy_events___3,Pass,---,0,ERROR
1516
RE_DATA_METRICS,---,ref_buy_events_,Pass,---,0,ERROR
1617
RE_DATA_METRICS,---,ref_buy_events_,Pass,---,0,ERROR
1718
RE_DATA_METRICS,---,ref_buy_events_,Pass,---,0,ERROR
18-
BUY_EVENTS,---,ts_row_count__1,Pass,---,0,ERROR
19-
TEST_RE_DATA_Z_SCORE,---,pected_z_score_,Pass,---,0,ERROR
20-
TEST_RE_DATA_METRICS,---,pected_metrics_,Pass,---,0,ERROR
21-
TEST_RE_DATA_ANOMALIES,---,cted_anomalies_,Pass,---,0,ERROR
2219
BUY_EVENTS,---,null__freshness,Pass,---,0,ERROR
20+
BUY_EVENTS,---,ts_row_count__1,Pass,---,0,ERROR
21+
BUY_EVENTS,---,_table_rows__10,Pass,---,0,ERROR
2322
BUY_EVENTS,VALUE2,alue2__min__200,Pass,---,0,ERROR
2423
BUY_EVENTS,---,0__row_count__0,Pass,---,0,ERROR
25-
BUY_EVENTS,---,_table_rows__10,Pass,---,0,ERROR
2624
BUY_EVENTS,VALUE1,ulls_percent__0,Pass,---,0,ERROR
27-
BUY_EVENTS,VALUE1,alue1__min__100,Pass,---,0,ERROR
28-
BUY_EVENTS,VALUE1,__nulls_percent,Pass,---,0,ERROR
2925
BUY_EVENTS,VALUE1,alue1__min__107,Pass,---,0,ERROR
26+
BUY_EVENTS,VALUE1,alue1__min__100,Pass,---,0,ERROR
3027
RE_DATA_METRICS,---,_buy_events___3,Pass,---,0,ERROR
28+
BUY_EVENTS,VALUE1,__nulls_percent,Pass,---,0,ERROR
29+
RE_DATA_METRICS,---,ref_buy_events_,Pass,---,0,ERROR
3130
RE_DATA_METRICS,---,ref_buy_events_,Pass,---,0,ERROR
31+
TEST_RE_DATA_TABLE_SAMPLES,---,_table_samples_,Pass,---,0,ERROR
3232
RE_DATA_METRICS,---,ref_buy_events_,Pass,---,0,ERROR
33-
RE_DATA_METRICS,---,ref_buy_events_,Pass,---,0,ERROR
33+
TEST_RE_DATA_ANOMALIES,---,cted_anomalies_,Pass,---,0,ERROR
34+
TEST_RE_DATA_Z_SCORE,---,pected_z_score_,Pass,---,0,ERROR
35+
TEST_RE_DATA_METRICS,---,pected_metrics_,Pass,---,0,ERROR

macros/config/get_model_config.sql

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
2+
{# Builds and returns a config dict for one monitored row: name/schema/database, time_filter, parsed metrics, model_name, table_name, the selected-columns dict, and column type info queried from re_data_columns. #}
{% macro get_model_config(monitored) %}
3+
{% set model = {} %}
4+
{% do model.update({'name': re_data.row_value(monitored, 'name')}) %}
5+
{% do model.update({'schema': re_data.row_value(monitored, 'schema')}) %}
6+
{% do model.update({'database': re_data.row_value(monitored, 'database')}) %}
7+
{% do model.update({'time_filter': re_data.row_value(monitored, 'time_filter')}) %}
8+
{# 'metrics' is read from the row as a JSON string and parsed here #}
{% do model.update({'metrics': fromjson(re_data.row_value(monitored, 'metrics'))}) %}
9+
{# model_name is the dot-joined database.schema.name built from the values stored above #}
{% do model.update({'model_name': model.get('database') + '.' + model.get('schema') + '.' + model.get('name')}) %}
10+
{% do model.update({'table_name': full_table_name_values(model.get('name'), model.get('schema'), model.get('database'))}) %}
11+
12+
{% set columns_db = re_data.row_value(monitored, 'columns') %}
13+
14+
{# 'columns' may be none in the row; the JSON is parsed only when a value is present #}
{% set column_list = fromjson(columns_db) if columns_db is not none else none %}
15+
{# NOTE(review): presumably dict_from_list returns none for a none list, which is what columns_compute_all below relies on - confirm #}
{% set columns_dict = re_data.dict_from_list(column_list) %}
16+
17+
{% do model.update({'columns_dict': columns_dict}) %}
18+
{# flag read by should_compute_metric: when true, every column qualifies #}
{% do model.update({'columns_compute_all': columns_dict is none}) %}
19+
20+
{# look up this model's columns in re_data_columns by name, schema and database #}
{% set columns_query %}
21+
select * from {{ ref('re_data_columns') }}
22+
where name = '{{ model.name }}' and schema = '{{ model.schema }}' and database = '{{ model.database }}'
23+
{% endset %}
24+
{% set columns = run_query(columns_query) %}
25+
26+
{# columns_info maps each column_name to a dict holding its data_type #}
{% set columns_info = {} %}
27+
{% for col in columns %}
28+
{% set column_name = re_data.row_value(col, 'column_name') %}
29+
{% set data_type = re_data.get_column_type(col) %}
30+
{% do columns_info.update({column_name: { 'data_type': data_type }}) %}
31+
{% endfor %}
32+
33+
{% do model.update({'columns_info': columns_info}) %}
34+
{# the raw query result is kept alongside the derived columns_info #}
{% do model.update({'columns': columns}) %}
35+
36+
{{ return(model) }}
37+
{% endmacro %}
38+
39+
{# Returns a truthy value when metrics should be computed for column_name: either model.columns_compute_all is set, or model.columns_dict has a truthy entry for the column. #}
{% macro should_compute_metric(model, column_name) %}
40+
{{ return(model.columns_compute_all or model.columns_dict.get(column_name)) }}
41+
{% endmacro %}

0 commit comments

Comments
 (0)