This repository was archived by the owner on Aug 8, 2025. It is now read-only.
File tree Expand file tree Collapse file tree
docker/mount/datacontract-cli
src/main/scala/io/github/datacatering/plan Expand file tree Collapse file tree Original file line number Diff line number Diff line change 1+ dataContractSpecification : 0.9.3
# Data contract describing the COVID-19 cases dataset.
id: covid_cases
info:
  title: COVID-19 cases
  description: Johns Hopkins University Consolidated data on COVID-19 cases, sourced from Enigma
  version: "0.0.1"
  links:
    blog: https://aws.amazon.com/blogs/big-data/a-public-data-lake-for-analysis-of-covid-19-data/
    data-explorer: https://dj2taa9i652rf.cloudfront.net/
    data: https://covid19-lake.s3.us-east-2.amazonaws.com/enigma-jhu/json/part-00000-adec1cd2-96df-4c6b-a5f2-780f092951ba-c000.json
# Where the data lives: newline-delimited JSON files in S3.
servers:
  s3-json:
    type: s3
    location: s3://covid19-lake/enigma-jhu/json/*.json
    format: json
    delimiter: new_line
# Schema of the single `covid_cases` model.
models:
  covid_cases:
    description: the number of confirmed covid cases reported for a specified region, with location and county/province/country information.
    fields:
      fips:
        type: string
        description: state and county two digits code
      admin2:
        type: string
        description: county name
      province_state:
        type: string
        description: province name or state name
      country_region:
        type: string
        description: country name or region name
      last_update:
        type: timestamp_ntz
        description: last update timestamp
      latitude:
        type: double
        description: location (latitude)
      longitude:
        type: double
        description: location (longitude)
      confirmed:
        type: int
        description: number of confirmed cases
      combined_key:
        type: string
        description: county name+state name
# Data quality checks expressed in SodaCL.
quality:
  type: SodaCL
  specification:
    checks for covid_cases:
      - freshness(last_update::datetime) < 5000d # dataset is not updated anymore
      - row_count > 1000
Original file line number Diff line number Diff line change 1+ package io .github .datacatering .plan
2+
3+ import io .github .datacatering .datacaterer .api .PlanRun
4+
/**
 * Plan that generates CSV records whose schema is sourced from a Data Contract CLI file.
 *
 * The schema is first loaded from the data contract at
 * `/opt/app/mount/datacontract-cli/datacontract.yaml`, after which explicit rules are
 * declared for the `latitude`, `longitude` and `country_region` fields.
 * NOTE(review): the second `.schema(...)` call is presumably layered on top of the
 * contract-derived schema rather than replacing it; confirm against the PlanRun API.
 */
class AdvancedDataContractCliPlanRun extends PlanRun {

  // CSV output task (with header row) limited to 100 generated records.
  val accountTask = csv(
    "customer_accounts",
    "/opt/app/data/customer/account-datacontract-cli",
    Map("header" -> "true")
  )
    .schema(metadataSource.dataContractCli("/opt/app/mount/datacontract-cli/datacontract.yaml"))
    .schema(
      // Keep generated coordinates within valid geographic bounds.
      field.name("latitude").min(-90).max(90),
      field.name("longitude").min(-180).max(180),
      // Expression-based value for the region field.
      field.name("country_region").expression("#{Address.state}")
    )
    .count(count.records(100))

  // Enable plan/task generation and set the folder where reports are written.
  val conf = configuration
    .enableGeneratePlanAndTasks(true)
    .generatedReportsFolderPath("/opt/app/data/report")

  execute(conf, accountTask)
}
You can’t perform that action at this time.
0 commit comments