Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,8 @@
"onCreateCommand": {
"python": ["uv", "python", "install", "--default"],
"venv": ["uv", "sync", "--all-extras", "--all-groups"],
"npm": ["npm", "install"]
"npm": ["npm", "install"],
"greenmask": "curl -fsSL https://greenmask.io/install.sh | sh -s -- -y -v v0.2.21"
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can this be put into the Dockerfile?

},
// Ensure it is re-synced on restarts.
"updateContentCommand": {
Expand Down
63 changes: 63 additions & 0 deletions .github/workflows/greenmask.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
name: greenmask
# Guards against greenmask config drift: an edit may point a transformer at a
# renamed/dropped column or carry a subset condition that no longer works
# against our schema. The job migrates a throwaway database, then validates the
# config and runs a dump against it. Triggered only when the greenmask config,
# the dump script, or this workflow file changes.
on:
  pull_request:
    paths:
      - '.greenmask/**'
      - 'dev/greenmask-dump.sh'
      - '.github/workflows/greenmask.yml'
permissions:
  contents: read
concurrency:
  group: '${{ github.workflow }}-${{ github.ref_name }}'
  cancel-in-progress: true
jobs:
  validate:
    runs-on: ubuntu-24.04
    services:
      postgres:
        image: 'pgvector/pgvector:pg17'
        env:
          POSTGRES_DB: 'django'
          POSTGRES_PASSWORD: 'postgres'
        options: >-
          --health-cmd "pg_isready --username postgres"
          --health-start-period 30s
          --health-start-interval 2s
        ports:
          - '5432:5432'
    env:
      # Consumed both by the dump dbname in .greenmask/config.yml and by
      # Django's database settings.
      DATABASE_URL: 'postgres://postgres:postgres@localhost:5432/django'
      DJANGO_DATABASE_URL: 'postgres://postgres:postgres@localhost:5432/django'
      DJANGO_SETTINGS_MODULE: 'isic.settings.testing'
      # Read by the settings module at import time; migrate never connects to
      # these services, so present and well-formed values are all that is
      # required.
      DJANGO_ISIC_ELASTICSEARCH_URL: 'http://elastic:elastic@localhost:9200'
      DJANGO_CELERY_BROKER_URL: 'amqp://localhost:5672/'
      DJANGO_MINIO_STORAGE_URL: 'http://minioAccessKey:minioSecretKey@localhost:9000/django-storage'
      DJANGO_CACHE_URL: 'redis://localhost:6379/0'
      # Transformer templates look up env "SALT"; the value is arbitrary in CI.
      SALT: 'ci-test-salt'
    steps:
      - name: Checkout repository
        uses: actions/checkout@v6

      - name: Install uv
        uses: astral-sh/setup-uv@v8.1.0

      - name: Install greenmask
        run: curl -fsSL https://greenmask.io/install.sh | sh -s -- -y -v v0.2.21

      - name: Build the Django schema
        run: ./manage.py migrate

      # --warnings reports config problems found against the live schema.
      # Greenmask exits non-zero only for fatal errors (for example a
      # transformer aimed at a column that no longer exists), not for softer
      # warnings, so this step fails only on config that truly cannot run.
      - name: Validate greenmask config against the schema
        run: greenmask --config .greenmask/config.yml validate --warnings

      # Runs the subset conditions and transformers as real SQL/dump operations
      # against the schema; Heroku and the restore step are not involved.
      - name: Dump through greenmask against the local database
        run: dev/greenmask-dump.sh --dump-only
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ isic/core/static/core/dist/
dev/local-setup.sh
docker-compose.local.yml

.greenmask/dumps/

# Additional Celery Beat files
celerybeat-schedule*

Expand Down
286 changes: 286 additions & 0 deletions .greenmask/config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,286 @@
# Dumps are written to (and restored from) a local directory.
storage:
  type: 'directory'
  directory:
    path: './.greenmask/dumps'

restore:
  pg_restore_options:
    dbname: 'postgresql://postgres:postgres@postgres:5432/django'
    # WARNING - DESTRUCTIVE: objects in the target database are dropped and
    # recreated before restoring, which is what makes repeated restores
    # idempotent. Only ever aim this at a throwaway/scratch database.
    clean: true
    if-exists: true
    no-owner: true
    no-privileges: true
  scripts:
    post-data:
      # A subset restore alters collection membership, so the
      # materialized_collection_counts matview has to be recomputed from the
      # restored base tables. The refresh is deliberately not CONCURRENTLY: a
      # fresh restore may leave the matview unpopulated, and CONCURRENTLY
      # cannot refresh it in that state.
      - name: 'refresh materialized_collection_counts'
        when: 'after'
        query: 'REFRESH MATERIALIZED VIEW materialized_collection_counts;'

dump:
  pg_dump_options:
    # Source database; the connection string is supplied via DATABASE_URL.
    dbname: '${DATABASE_URL}'
    # Row data of these tables is left out of the dump. Judging by their
    # names they hold sessions, OAuth tokens and social-login records.
    exclude-table-data:
      - 'django_session'
      - 'oauth2_provider_accesstoken'
      - 'oauth2_provider_refreshtoken'
      - 'socialaccount_socialtoken'
      - 'socialaccount_socialaccount'


transformation:

# ============ USERS ============
# username and email are replaced by salted SHA-256 pseudonyms, so a given
# input and SALT always yield the same output. Names are random faker values.
- schema: "public"
  name: "auth_user"
  transformers:
    - name: "Template"
      params:
        column: "username"
        template: 'user_{{ substr 0 12 (sha256sum (printf "%s:username:%s" (env "SALT") .GetValue)) }}'
    # 16 hex chars (64 bits) of the digest are kept. A shorter 8-char (32-bit)
    # prefix makes birthday collisions likely at tens of thousands of rows,
    # and a collision would violate any uniqueness constraint on email at
    # restore time. This template must stay identical to the one used for
    # account_emailaddress.email below so both columns map to the same value.
    - name: "Template"
      params:
        column: "email"
        template: 'user{{ substr 0 16 (sha256sum (printf "%s:email:%s" (env "SALT") .GetValue)) }}@example.test'
    - name: "Template"
      params:
        column: "first_name"
        template: '{{ fakerFirstName }}'
    - name: "Template"
      params:
        column: "last_name"
        template: '{{ fakerFirstLastName }}'
    - name: "Replace"
      params:
        column: "password"
        # TODO: the value below is a placeholder, not a usable hash. Replace
        # it with the exact output of make_password("password").
        value: "pbkdf2_sha256$870000$REPLACE_WITH_REAL_HASH..."

# ============ EMAIL ADDRESSES (allauth) ============
# Same salt, label and prefix length as auth_user.email above, so a user's
# address pseudonymizes to the same value in both tables.
- schema: "public"
  name: "account_emailaddress"
  transformers:
    - name: "Template"
      params:
        column: "email"
        template: 'user{{ substr 0 16 (sha256sum (printf "%s:email:%s" (env "SALT") .GetValue)) }}@example.test'

# ============ PROFILES (hash_id, 5-char, UNIQUE) ============
# Encode the row's auto-increment id as 5 base-32 digits over the hash_id
# alphabet (A-Z0-9 minus I,O,1,0 -> exactly 32 chars, 32^5 = 2^25 ids). This
# is a bijection (unique id -> unique hash_id), so it can't collide the way a
# truncated hash does, and stays regex-compliant. Hard ceiling at id < 2^25:
# only five digits are emitted, so a larger id would wrap around and collide
# with a smaller one rather than fail.
- schema: "public"
name: "login_profile"
transformers:
- name: "Template"
params:
column: "hash_id"
# $d0..$d4 are the base-32 digits of id, least significant first. The final
# template line emits them most significant first, using each digit as an
# index into $alpha (substr <start> <start+1> selects that one character).
template: |-
{{- $alpha := "23456789ABCDEFGHJKLMNPQRSTUVWXYZ" -}}
{{- $n := .GetColumnValue "id" | int -}}
{{- $d0 := int (mod $n 32) -}}
{{- $d1 := int (mod (div $n 32) 32) -}}
{{- $d2 := int (mod (div $n 1024) 32) -}}
{{- $d3 := int (mod (div $n 32768) 32) -}}
{{- $d4 := int (mod (div $n 1048576) 32) -}}
{{ substr $d4 (int (add $d4 1)) $alpha }}{{ substr $d3 (int (add $d3 1)) $alpha }}{{ substr $d2 (int (add $d2 1)) $alpha }}{{ substr $d1 (int (add $d1 1)) $alpha }}{{ substr $d0 (int (add $d0 1)) $alpha }}

# ============ CONTRIBUTORS ============
# The columns below are overwritten with generated filler values.
- schema: 'public'
  name: 'ingest_contributor'
  transformers:
    - name: 'Template'
      params:
        column: 'institution_name'
        template: '{{ fakerWord | title }} {{ fakerWord | title }} Institute'
    # Produces a realistic-looking random URL. Caveat: RandomURL overwrites
    # every non-NULL value, so an institution_url that used to be empty ends
    # up holding a generated URL as well.
    - name: 'RandomURL'
      params:
        column: 'institution_url'
    - name: 'Template'
      params:
        column: 'legal_contact_info'
        template: '{{ fakerSentence }}'
    - name: 'Template'
      params:
        column: 'default_attribution'
        template: '{{ fakerWord | title }} {{ fakerWord | title }} Institute'

# ============ PRIVATE IDs (Patient / Lesion / RcmCase) ============
# Each private id becomes "anon_<kind>_" plus the first 12 hex chars of a
# SHA-256 over "<SALT>:<kind>:<original value>".
- schema: 'public'
  name: 'ingest_patient'
  transformers:
    - name: 'Template'
      params:
        column: 'private_patient_id'
        template: 'anon_patient_{{ substr 0 12 (sha256sum (printf "%s:patient:%s" (env "SALT") .GetValue)) }}'

- schema: 'public'
  name: 'ingest_lesion'
  transformers:
    - name: 'Template'
      params:
        column: 'private_lesion_id'
        template: 'anon_lesion_{{ substr 0 12 (sha256sum (printf "%s:lesion:%s" (env "SALT") .GetValue)) }}'

- schema: 'public'
  name: 'ingest_rcmcase'
  transformers:
    - name: 'Template'
      params:
        column: 'private_rcm_case_id'
        template: 'anon_rcm_case_{{ substr 0 12 (sha256sum (printf "%s:rcm_case:%s" (env "SALT") .GetValue)) }}'

# ============ ACCESSIONS (preserve file extension) ============
- schema: 'public'
  name: 'ingest_accession'
  # Percentage sample sized by ACCESSION_SUBSET_PERCENT (defaults to 5).
  # Greenmask propagates the condition through foreign keys so referential
  # integrity holds. The predicate has to be deterministic: random() would be
  # re-evaluated on each scan and parents and children would stop agreeing.
  subset_conds:
    - '(public.ingest_accession.id % 100) < ${ACCESSION_SUBSET_PERCENT:-5}'
  transformers:
    # original_blob_name carries the uploader's source filename and must not
    # survive. The name is hashed while the file extension is kept, so code
    # and UX that depend on the extension keep working. The column is always
    # populated (editable=False, set from the blob name) and is unique per
    # (cohort_id, name).
    - name: 'Template'
      params:
        column: 'original_blob_name'
        template: '{{ $v := .GetValue }}{{ if contains "." $v }}anon_image_{{ substr 0 16 (sha256sum (printf "%s:blob:%s" (env "SALT") $v)) }}.{{ last (splitList "." $v) }}{{ else }}anon_image_{{ substr 0 16 (sha256sum (printf "%s:blob:%s" (env "SALT") $v)) }}{{ end }}'
    # An empty attribution stays empty; anything else becomes filler text.
    - name: 'Template'
      params:
        column: 'attribution'
        template: '{{ if eq .GetValue "" }}{{ else }}{{ fakerWord | title }} {{ fakerWord | title }} Institute{{ end }}'

# ============ IMAGE EMBEDDINGS (3584-dim halfvec; huge) ============
# Embedding -> image -> accession. The accession subset already cascades here
# (only embeddings for subsetted accessions survive); this further keeps just
# ~10% of those. Sampling on a different id segment ((accession_id / 100) % 10)
# so it's an independent 10% slice of the subset regardless of
# ACCESSION_SUBSET_PERCENT.
# The embedding row has no accession_id of its own in this predicate, so the
# subquery goes through core_image to reach the image's accession_id.
- schema: "public"
name: "core_imageembedding"
subset_conds:
- >-
public.core_imageembedding.image_id IN (
SELECT public.core_image.id FROM public.core_image
WHERE ((public.core_image.accession_id / 100) % 10) = 0
)

# ============ PRIVATE COLLECTIONS ============
# Rows whose "public" column is truthy keep their name/description untouched;
# only private collections are rewritten.
- schema: 'public'
  name: 'core_collection'
  transformers:
    - name: 'Template'
      params:
        column: 'name'
        template: '{{ if .GetColumnValue "public" }}{{ .GetValue }}{{ else }}{{ fakerWord | title }} {{ fakerWord | title }} {{ fakerWord | title }} {{ fakerWord | title }} Collection{{ end }}'
    # A private collection with an empty description keeps it empty.
    - name: 'Template'
      params:
        column: 'description'
        template: '{{ if .GetColumnValue "public" }}{{ .GetValue }}{{ else if eq .GetValue "" }}{{ else }}{{ fakerParagraph }}{{ end }}'

# ============ COHORTS ============
# name, description and default_attribution are overwritten on every row.
- schema: 'public'
  name: 'ingest_cohort'
  transformers:
    - name: 'Template'
      params:
        column: 'name'
        template: '{{ fakerWord | title }} {{ fakerWord | title }} Cohort'
    - name: 'Template'
      params:
        column: 'description'
        template: '{{ fakerParagraph }}'
    - name: 'Template'
      params:
        column: 'default_attribution'
        template: '{{ fakerWord | title }} {{ fakerWord | title }} Institute'

# ============ STUDIES ============
# Public studies keep their name/description/attribution verbatim; only
# private studies are rewritten.
- schema: 'public'
  name: 'studies_study'
  transformers:
    - name: 'Template'
      params:
        column: 'name'
        template: '{{ if .GetColumnValue "public" }}{{ .GetValue }}{{ else }}{{ fakerWord | title }} {{ fakerWord | title }} {{ fakerWord | title }} {{ fakerWord | title }} Study{{ end }}'
    # A private study with an empty description keeps it empty.
    - name: 'Template'
      params:
        column: 'description'
        template: '{{ if .GetColumnValue "public" }}{{ .GetValue }}{{ else if eq .GetValue "" }}{{ else }}{{ fakerParagraph }}{{ end }}'
    - name: 'Template'
      params:
        column: 'attribution'
        template: '{{ if .GetColumnValue "public" }}{{ .GetValue }}{{ else }}{{ fakerWord | title }} {{ fakerWord | title }} Institute{{ end }}'

# ============ UPLOAD BLOB NAMES (ZipUpload, MetadataFile) ============
# blob_name is a storage key built from the uploaded filename and may therefore
# expose the original (identifying) name. It gets the same treatment as
# original_blob_name: the name is salt-hashed and the extension is retained.
# The column is always populated (editable=False).
- schema: 'public'
  name: 'ingest_zipupload'
  transformers:
    - name: 'Template'
      params:
        column: 'blob_name'
        template: '{{ $v := .GetValue }}{{ if contains "." $v }}anon_upload_{{ substr 0 16 (sha256sum (printf "%s:blob_name:%s" (env "SALT") $v)) }}.{{ last (splitList "." $v) }}{{ else }}anon_upload_{{ substr 0 16 (sha256sum (printf "%s:blob_name:%s" (env "SALT") $v)) }}{{ end }}'

- schema: 'public'
  name: 'ingest_metadatafile'
  transformers:
    - name: 'Template'
      params:
        column: 'blob_name'
        template: '{{ $v := .GetValue }}{{ if contains "." $v }}anon_upload_{{ substr 0 16 (sha256sum (printf "%s:blob_name:%s" (env "SALT") $v)) }}.{{ last (splitList "." $v) }}{{ else }}anon_upload_{{ substr 0 16 (sha256sum (printf "%s:blob_name:%s" (env "SALT") $v)) }}{{ end }}'

# ============ JSON WIPES ============
# ingest_unstructuredmetadata is OneToOne with accession, and code reads
# accession.unstructured_metadata, so each dumped accession needs its row.
# The rows are therefore kept and only "value" is overwritten with {}. Since
# greenmask transforms while dumping, the real (possibly sensitive) JSON
# never leaves the source DB.
- schema: 'public'
  name: 'ingest_unstructuredmetadata'
  transformers:
    - name: 'Replace'
      params: { column: 'value', value: '{}' }
# ingest_metadataversion is a large table that takes a long time to dump. On
# top of the accession subset, keep it for only ~10% of the selected
# accessions to keep dumps fast.
#
# The sample keys on (accession_id / 100) % 10, not accession_id % 10. The
# accession subset already selects on id % 100, so a predicate on those same
# low digits is correlated with it: with the default 5% subset only ids ending
# 00-04 survive, and "% 10 = 0" would keep 1 in 5 of them (and all of them at
# 1%) instead of 1 in 10. Dividing by 100 first samples an independent id
# segment, the same slice core_imageembedding uses.
- schema: "public"
  name: "ingest_metadataversion"
  subset_conds:
    - "((public.ingest_metadataversion.accession_id / 100) % 10) = 0"
  transformers:
    # Every JSON column is blanked to an empty object.
    - name: "Replace"
      params: { column: "unstructured_metadata", value: "{}" }
    - name: "Replace"
      params: { column: "patient", value: "{}" }
    - name: "Replace"
      params: { column: "lesion", value: "{}" }
    - name: "Replace"
      params: { column: "rcm_case", value: "{}" }

# ============ OAUTH APPLICATIONS ============
# Application names are not sensitive and are left alone. The credentials are
# regenerated so that a dump cannot be used to authenticate against real
# clients.
- schema: 'public'
  name: 'core_isicoauthapplication'
  transformers:
    - name: 'RandomString'
      params:
        column: 'client_id'
        min_length: 40
        max_length: 40
    - name: 'RandomString'
      params:
        column: 'client_secret'
        min_length: 64
        max_length: 64
Loading
Loading