Skip to content
This repository was archived by the owner on Aug 8, 2025. It is now read-only.

Commit b0f03fb

Browse files
committed
Add in Data Contract CLI example
1 parent e314a8d commit b0f03fb

2 files changed

Lines changed: 73 additions & 0 deletions

File tree

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
dataContractSpecification: 0.9.3
2+
id: covid_cases
3+
info:
4+
title: COVID-19 cases
5+
description: Johns Hopkins University Consolidated data on COVID-19 cases, sourced from Enigma
6+
version: "0.0.1"
7+
links:
8+
blog: https://aws.amazon.com/blogs/big-data/a-public-data-lake-for-analysis-of-covid-19-data/
9+
data-explorer: https://dj2taa9i652rf.cloudfront.net/
10+
data: https://covid19-lake.s3.us-east-2.amazonaws.com/enigma-jhu/json/part-00000-adec1cd2-96df-4c6b-a5f2-780f092951ba-c000.json
11+
servers:
12+
s3-json:
13+
type: s3
14+
location: s3://covid19-lake/enigma-jhu/json/*.json
15+
format: json
16+
delimiter: new_line
17+
models:
18+
covid_cases:
19+
description: the number of confirmed covid cases reported for a specified region, with location and county/province/country information.
20+
fields:
21+
fips:
22+
type: string
23+
description: state and county two digits code
24+
admin2:
25+
type: string
26+
description: county name
27+
province_state:
28+
type: string
29+
description: province name or state name
30+
country_region:
31+
type: string
32+
description: country name or region name
33+
last_update:
34+
type: timestamp_ntz
35+
description: last update timestamp
36+
latitude:
37+
type: double
38+
description: location (latitude)
39+
longitude:
40+
type: double
41+
description: location (longitude)
42+
confirmed:
43+
type: int
44+
description: number of confirmed cases
45+
combined_key:
46+
type: string
47+
description: county name+state name
48+
quality:
49+
type: SodaCL
50+
specification:
51+
checks for covid_cases:
52+
- freshness(last_update::datetime) < 5000d # dataset is not updated anymore
53+
- row_count > 1000
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
package io.github.datacatering.plan
2+
3+
import io.github.datacatering.datacaterer.api.PlanRun
4+
5+
/**
 * Data generation plan whose base schema is read from a Data Contract CLI file,
 * with a handful of fields overridden explicitly in code.
 *
 * The plan is executed as part of constructing the class (see the `execute` call
 * at the end of the body).
 */
class AdvancedDataContractCliPlanRun extends PlanRun {

  /** CSV task: schema from the data contract, plus manual overrides for three fields. */
  val accountTask = {
    // Data contract file that supplies the initial field definitions.
    val contractMetadata =
      metadataSource.dataContractCli("/opt/app/mount/datacontract-cli/datacontract.yaml")
    // Options handed to the CSV connection unchanged.
    val csvOptions = Map("header" -> "true")

    csv("customer_accounts", "/opt/app/data/customer/account-datacontract-cli", csvOptions)
      .schema(contractMetadata)
      // Second schema call layers explicit field settings on top of the contract:
      // numeric bounds for the coordinates and a generator expression for country_region.
      .schema(
        field.name("latitude").min(-90).max(90),
        field.name("longitude").min(-180).max(180),
        field.name("country_region").expression("#{Address.state}")
      )
      .count(count.records(100))
  }

  /** Run configuration: plan/task generation switched on, reports written to a fixed folder. */
  val conf = {
    val reportFolder = "/opt/app/data/report"
    configuration
      .enableGeneratePlanAndTasks(true)
      .generatedReportsFolderPath(reportFolder)
  }

  execute(conf, accountTask)
}

0 commit comments

Comments
 (0)