diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
index 913ff900b..7a345d53b 100644
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -84,7 +84,8 @@
   "onCreateCommand": {
     "python": ["uv", "python", "install", "--default"],
     "venv": ["uv", "sync", "--all-extras", "--all-groups"],
-    "npm": ["npm", "install"]
+    "npm": ["npm", "install"],
+    "greenmask": "curl -fsSL https://greenmask.io/install.sh | sh -s -- -y -v v0.2.21"
   },
   // Ensure it is re-synced on restarts.
   "updateContentCommand": {
diff --git a/.github/workflows/greenmask.yml b/.github/workflows/greenmask.yml
new file mode 100644
index 000000000..5efe6b66d
--- /dev/null
+++ b/.github/workflows/greenmask.yml
@@ -0,0 +1,63 @@
+name: greenmask
+# Catch greenmask config changes that can no longer legally run against our
+# schema (a renamed/dropped column, a broken subset condition, etc.) by
+# validating and dumping against a freshly-migrated throwaway database. Only
+# runs when the greenmask config or dump script changes.
+on:
+  pull_request:
+    paths:
+      - .greenmask/**
+      - dev/greenmask-dump.sh
+      - .github/workflows/greenmask.yml
+permissions:
+  contents: read
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref_name }}
+  cancel-in-progress: true
+jobs:
+  validate:
+    runs-on: ubuntu-24.04
+    services:
+      postgres:
+        image: pgvector/pgvector:pg17
+        env:
+          POSTGRES_DB: django
+          POSTGRES_PASSWORD: postgres
+        options: >-
+          --health-cmd "pg_isready --username postgres"
+          --health-start-period 30s
+          --health-start-interval 2s
+        ports:
+          - 5432:5432
+    env:
+      # Used by .greenmask/config.yml's dump dbname and Django's database.
+      DATABASE_URL: postgres://postgres:postgres@localhost:5432/django
+      DJANGO_DATABASE_URL: postgres://postgres:postgres@localhost:5432/django
+      DJANGO_SETTINGS_MODULE: isic.settings.testing
+      # The settings module reads these at import time but migrate doesn't
+      # connect to them, so the values just need to be present and well-formed.
+      DJANGO_ISIC_ELASTICSEARCH_URL: http://elastic:elastic@localhost:9200
+      DJANGO_CELERY_BROKER_URL: amqp://localhost:5672/
+      DJANGO_MINIO_STORAGE_URL: http://minioAccessKey:minioSecretKey@localhost:9000/django-storage
+      DJANGO_CACHE_URL: redis://localhost:6379/0
+      # The transformer templates reference env "SALT"; any value works here.
+      SALT: ci-test-salt
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v6
+      - name: Install uv
+        uses: astral-sh/setup-uv@v8.1.0
+      - name: Install greenmask
+        run: curl -fsSL https://greenmask.io/install.sh | sh -s -- -y -v v0.2.21
+      - name: Build the Django schema
+        run: ./manage.py migrate
+      - name: Validate greenmask config against the schema
+        # --warnings surfaces config issues against the live schema. Greenmask
+        # only exits non-zero on fatal errors (e.g. a transformer pointed at a
+        # column that no longer exists), not on softer warnings, so this catches
+        # config that genuinely can't run.
+        run: greenmask --config .greenmask/config.yml validate --warnings
+      - name: Dump through greenmask against the local database
+        # Exercises the subset conditions and transformers as real SQL/dump
+        # operations against the schema, without Heroku or the restore step.
+        run: dev/greenmask-dump.sh --dump-only
diff --git a/.gitignore b/.gitignore
index e0418a79a..630f4fc81 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,8 @@ isic/core/static/core/dist/
 dev/local-setup.sh
 docker-compose.local.yml
 
+.greenmask/dumps/
+
 # Additional Celery Beat files
 celerybeat-schedule*
 
diff --git a/.greenmask/config.yml b/.greenmask/config.yml
new file mode 100644
index 000000000..db7d32d92
--- /dev/null
+++ b/.greenmask/config.yml
@@ -0,0 +1,286 @@
+storage:
+  type: directory
+  directory:
+    path: "./.greenmask/dumps"
+
+restore:
+  pg_restore_options:
+    dbname: "postgresql://postgres:postgres@postgres:5432/django"
+    # Drops & recreates objects before restoring so it's idempotent. DESTRUCTIVE
+    # to the target DB - only point this at a throwaway/scratch database.
+    clean: true
+    if-exists: true
+    no-owner: true
+    no-privileges: true
+  scripts:
+    post-data:
+      # The subset restore changes collection membership, so the
+      # materialized_collection_counts matview must be recomputed from the
+      # restored base tables. Non-CONCURRENTLY since a fresh restore may leave it
+      # unpopulated, which CONCURRENTLY can't refresh.
+      - name: "refresh materialized_collection_counts"
+        when: "after"
+        query: "REFRESH MATERIALIZED VIEW materialized_collection_counts;"
+
+dump:
+  pg_dump_options:
+    dbname: "${DATABASE_URL}"
+    exclude-table-data:
+      - "django_session"
+      - "oauth2_provider_accesstoken"
+      - "oauth2_provider_refreshtoken"
+      - "socialaccount_socialtoken"
+      - "socialaccount_socialaccount"
+
+
+  transformation:
+
+    # ============ USERS ============
+    - schema: "public"
+      name: "auth_user"
+      transformers:
+        - name: "Template"
+          params:
+            column: "username"
+            template: 'user_{{ substr 0 12 (sha256sum (printf "%s:username:%s" (env "SALT") .GetValue)) }}'
+        - name: "Template"
+          params:
+            column: "email"
+            template: 'user{{ substr 0 8 (sha256sum (printf "%s:email:%s" (env "SALT") .GetValue)) }}@example.test'
+        - name: "Template"
+          params:
+            column: "first_name"
+            template: '{{ fakerFirstName }}'
+        - name: "Template"
+          params:
+            column: "last_name"
+            template: '{{ fakerFirstLastName }}'
+        - name: "Replace"
+          params:
+            column: "password"
+            # Replace with the exact output of make_password("password").
+            value: "pbkdf2_sha256$870000$REPLACE_WITH_REAL_HASH..."
+
+    # ============ EMAIL ADDRESSES (allauth) ============
+    - schema: "public"
+      name: "account_emailaddress"
+      transformers:
+        - name: "Template"
+          params:
+            column: "email"
+            template: 'user{{ substr 0 8 (sha256sum (printf "%s:email:%s" (env "SALT") .GetValue)) }}@example.test'
+
+    # ============ PROFILES (hash_id, 5-char, UNIQUE) ============
+    # Encode the row's auto-increment id as 5 base-32 digits over the hash_id
+    # alphabet (A-Z0-9 minus I,O,1,0 -> exactly 32 chars, 32^5 = 2^25 ids). This
+    # is a bijection (unique id -> unique hash_id), so it can't collide the way a
+    # truncated hash does, and stays regex-compliant. Hard ceiling at id < 2^25.
+    - schema: "public"
+      name: "login_profile"
+      transformers:
+        - name: "Template"
+          params:
+            column: "hash_id"
+            template: |-
+              {{- $alpha := "23456789ABCDEFGHJKLMNPQRSTUVWXYZ" -}}
+              {{- $n := .GetColumnValue "id" | int -}}
+              {{- $d0 := int (mod $n 32) -}}
+              {{- $d1 := int (mod (div $n 32) 32) -}}
+              {{- $d2 := int (mod (div $n 1024) 32) -}}
+              {{- $d3 := int (mod (div $n 32768) 32) -}}
+              {{- $d4 := int (mod (div $n 1048576) 32) -}}
+              {{ substr $d4 (int (add $d4 1)) $alpha }}{{ substr $d3 (int (add $d3 1)) $alpha }}{{ substr $d2 (int (add $d2 1)) $alpha }}{{ substr $d1 (int (add $d1 1)) $alpha }}{{ substr $d0 (int (add $d0 1)) $alpha }}
+
+    # ============ CONTRIBUTORS ============
+    - schema: "public"
+      name: "ingest_contributor"
+      transformers:
+        - name: "Template"
+          params:
+            column: "institution_name"
+            template: '{{ fakerWord | title }} {{ fakerWord | title }} Institute'
+        # Real-looking random URL. Note: RandomURL replaces every non-NULL value,
+        # so previously-empty institution_url values become a generated URL too.
+        - name: "RandomURL"
+          params:
+            column: "institution_url"
+        - name: "Template"
+          params:
+            column: "legal_contact_info"
+            template: '{{ fakerSentence }}'
+        - name: "Template"
+          params:
+            column: "default_attribution"
+            template: '{{ fakerWord | title }} {{ fakerWord | title }} Institute'
+
+    # ============ PRIVATE IDs (Patient / Lesion / RcmCase) ============
+    - schema: "public"
+      name: "ingest_patient"
+      transformers:
+        - name: "Template"
+          params:
+            column: "private_patient_id"
+            template: 'anon_patient_{{ substr 0 12 (sha256sum (printf "%s:patient:%s" (env "SALT") .GetValue)) }}'
+    - schema: "public"
+      name: "ingest_lesion"
+      transformers:
+        - name: "Template"
+          params:
+            column: "private_lesion_id"
+            template: 'anon_lesion_{{ substr 0 12 (sha256sum (printf "%s:lesion:%s" (env "SALT") .GetValue)) }}'
+    - schema: "public"
+      name: "ingest_rcmcase"
+      transformers:
+        - name: "Template"
+          params:
+            column: "private_rcm_case_id"
+            template: 'anon_rcm_case_{{ substr 0 12 (sha256sum (printf "%s:rcm_case:%s" (env "SALT") .GetValue)) }}'
+
+    # ============ ACCESSIONS (preserve file extension) ============
+    - schema: "public"
+      name: "ingest_accession"
+      # Deterministic percentage sample, controlled by ACCESSION_SUBSET_PERCENT.
+      # Greenmask cascades this through FKs to keep referential integrity. Use a
+      # deterministic predicate (NOT random()), which would be re-evaluated per
+      # scan and break parent/child consistency.
+      subset_conds:
+        - "(public.ingest_accession.id % 100) < ${ACCESSION_SUBSET_PERCENT:-5}"
+      transformers:
+        # original_blob_name is the uploader's source filename, which should be
+        # discarded. Hash the name but preserve the file extension so
+        # extension-aware code/UX still behaves. Always populated (editable=False,
+        # set from the blob name); unique per (cohort_id, name).
+        - name: "Template"
+          params:
+            column: "original_blob_name"
+            template: '{{ $v := .GetValue }}{{ if contains "." $v }}anon_image_{{ substr 0 16 (sha256sum (printf "%s:blob:%s" (env "SALT") $v)) }}.{{ last (splitList "." $v) }}{{ else }}anon_image_{{ substr 0 16 (sha256sum (printf "%s:blob:%s" (env "SALT") $v)) }}{{ end }}'
+        - name: "Template"
+          params:
+            column: "attribution"
+            template: '{{ if eq .GetValue "" }}{{ else }}{{ fakerWord | title }} {{ fakerWord | title }} Institute{{ end }}'
+
+    # ============ IMAGE EMBEDDINGS (3584-dim halfvec; huge) ============
+    # Embedding -> image -> accession. The accession subset already cascades here
+    # (only embeddings for subsetted accessions survive); this further keeps just
+    # ~10% of those. Sampling on a different id segment ((accession_id / 100) % 10)
+    # so it's an independent 10% slice of the subset regardless of
+    # ACCESSION_SUBSET_PERCENT.
+    - schema: "public"
+      name: "core_imageembedding"
+      subset_conds:
+        - >-
+          public.core_imageembedding.image_id IN (
+            SELECT public.core_image.id FROM public.core_image
+            WHERE ((public.core_image.accession_id / 100) % 10) = 0
+          )
+
+    # ============ PRIVATE COLLECTIONS ============
+    # Only transform private collections; leave public ones' name/description
+    # intact.
+    - schema: "public"
+      name: "core_collection"
+      transformers:
+        - name: "Template"
+          params:
+            column: "name"
+            template: '{{ if .GetColumnValue "public" }}{{ .GetValue }}{{ else }}{{ fakerWord | title }} {{ fakerWord | title }} {{ fakerWord | title }} {{ fakerWord | title }} Collection{{ end }}'
+        - name: "Template"
+          params:
+            column: "description"
+            template: '{{ if .GetColumnValue "public" }}{{ .GetValue }}{{ else if eq .GetValue "" }}{{ else }}{{ fakerParagraph }}{{ end }}'
+
+    # ============ COHORTS ============
+    - schema: "public"
+      name: "ingest_cohort"
+      transformers:
+        - name: "Template"
+          params:
+            column: "name"
+            template: '{{ fakerWord | title }} {{ fakerWord | title }} Cohort'
+        - name: "Template"
+          params:
+            column: "description"
+            template: '{{ fakerParagraph }}'
+        - name: "Template"
+          params:
+            column: "default_attribution"
+            template: '{{ fakerWord | title }} {{ fakerWord | title }} Institute'
+
+    # ============ STUDIES ============
+    # Only transform private studies; leave public ones'
+    # name/description/attribution intact.
+    - schema: "public"
+      name: "studies_study"
+      transformers:
+        - name: "Template"
+          params:
+            column: "name"
+            template: '{{ if .GetColumnValue "public" }}{{ .GetValue }}{{ else }}{{ fakerWord | title }} {{ fakerWord | title }} {{ fakerWord | title }} {{ fakerWord | title }} Study{{ end }}'
+        - name: "Template"
+          params:
+            column: "description"
+            template: '{{ if .GetColumnValue "public" }}{{ .GetValue }}{{ else if eq .GetValue "" }}{{ else }}{{ fakerParagraph }}{{ end }}'
+        - name: "Template"
+          params:
+            column: "attribution"
+            template: '{{ if .GetColumnValue "public" }}{{ .GetValue }}{{ else }}{{ fakerWord | title }} {{ fakerWord | title }} Institute{{ end }}'
+
+    # ============ UPLOAD BLOB NAMES (ZipUpload, MetadataFile) ============
+    # blob_name is the storage key derived from the uploaded filename, so it can
+    # leak the original (identifying) name. Same scheme as original_blob_name:
+    # salt-hash the name, keep the extension. Always populated (editable=False).
+    - schema: "public"
+      name: "ingest_zipupload"
+      transformers:
+        - name: "Template"
+          params:
+            column: "blob_name"
+            template: '{{ $v := .GetValue }}{{ if contains "." $v }}anon_upload_{{ substr 0 16 (sha256sum (printf "%s:blob_name:%s" (env "SALT") $v)) }}.{{ last (splitList "." $v) }}{{ else }}anon_upload_{{ substr 0 16 (sha256sum (printf "%s:blob_name:%s" (env "SALT") $v)) }}{{ end }}'
+    - schema: "public"
+      name: "ingest_metadatafile"
+      transformers:
+        - name: "Template"
+          params:
+            column: "blob_name"
+            template: '{{ $v := .GetValue }}{{ if contains "." $v }}anon_upload_{{ substr 0 16 (sha256sum (printf "%s:blob_name:%s" (env "SALT") $v)) }}.{{ last (splitList "." $v) }}{{ else }}anon_upload_{{ substr 0 16 (sha256sum (printf "%s:blob_name:%s" (env "SALT") $v)) }}{{ end }}'
+
+    # ============ JSON WIPES ============
+    # ingest_unstructuredmetadata is a OneToOne on accession, so the row must
+    # exist for every dumped accession (code accesses accession.unstructured_
+    # metadata). Keep the rows but Replace value with {} -- greenmask applies the
+    # transform during dump, so the real (possibly sensitive) JSON never leaves
+    # the source DB.
+    - schema: "public"
+      name: "ingest_unstructuredmetadata"
+      transformers:
+        - name: "Replace"
+          params:
+            column: "value"
+            value: "{}"
+    # ingest_metadataversion is a large table that takes a long time to dump. On
+    # top of the accession subset, keep only accession_id % 10 == 0 (~10% of the
+    # selected accessions when ACCESSION_SUBSET_PERCENT=10) to keep dumps fast.
+    - schema: "public"
+      name: "ingest_metadataversion"
+      subset_conds:
+        - "(public.ingest_metadataversion.accession_id % 10) = 0"
+      transformers:
+        - name: "Replace"
+          params: { column: "unstructured_metadata", value: "{}" }
+        - name: "Replace"
+          params: { column: "patient", value: "{}" }
+        - name: "Replace"
+          params: { column: "lesion", value: "{}" }
+        - name: "Replace"
+          params: { column: "rcm_case", value: "{}" }
+
+    # ============ OAUTH APPLICATIONS ============
+    # App names aren't sensitive, so keep them as-is; only the credentials need
+    # rotating so the dump can't be used to authenticate against real clients.
+    - schema: "public"
+      name: "core_isicoauthapplication"
+      transformers:
+        - name: "RandomString"
+          params: { column: "client_id", min_length: 40, max_length: 40 }
+        - name: "RandomString"
+          params: { column: "client_secret", min_length: 64, max_length: 64 }
diff --git a/dev/greenmask-dump.sh b/dev/greenmask-dump.sh
new file mode 100755
index 000000000..3ca6c5750
--- /dev/null
+++ b/dev/greenmask-dump.sh
@@ -0,0 +1,79 @@
+#!/usr/bin/env bash
+#
+# Build a sanitized local database from production.
+#
+# Pulls the read-only credential's URL from Heroku, dumps production through
+# greenmask (which anonymizes info and subsets the data per config.yml), then
+# restores it into the local postgres.
+#
+# If DATABASE_URL is already set in the environment, it's used as-is and the
+# Heroku lookup is skipped. This is what the CI test relies on to dump against a
+# throwaway local database instead of production.
+#
+# Usage:
+#   dev/greenmask-dump.sh                          dump from prod, then restore
+#   dev/greenmask-dump.sh -r|--reuse-existing-dump restore the latest local dump
+#                                                  only (skips Heroku + dump)
+#   dev/greenmask-dump.sh -d|--dump-only           dump only, skip the restore
+set -exuo pipefail
+
+REUSE_EXISTING_DUMP=0
+DUMP_ONLY=0
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+  -r | --reuse-existing-dump)
+    REUSE_EXISTING_DUMP=1
+    shift
+    ;;
+  -d | --dump-only)
+    DUMP_ONLY=1
+    shift
+    ;;
+  *)
+    echo "unknown argument: $1" >&2
+    exit 1
+    ;;
+  esac
+done
+
+if [[ "$REUSE_EXISTING_DUMP" -eq 1 && "$DUMP_ONLY" -eq 1 ]]; then
+  echo "-r/--reuse-existing-dump and -d/--dump-only are mutually exclusive" >&2
+  exit 1
+fi
+
+declare -rx HEROKU_APP=isic
+declare -rx HEROKU_PG_CREDENTIAL=readonly
+# Lowering this barely speeds up the dump: ~75s of the runtime is fixed
+# server-side full scans (ingest_metadataversion, ingest_unstructuredmetadata,
+# ingest_accession) from non-sargable modulo subset predicates, plus ~22s of
+# pg_dump --schema-only startup -- none of which shrink with the percentage.
+# Raising it does cost more, though: the download-bound tables (mainly the
+# core_imageembedding halfvecs) scale ~linearly with the percentage.
+declare -rx ACCESSION_SUBSET_PERCENT="${ACCESSION_SUBSET_PERCENT:-10}" # env value wins; default 10
+
+readonly LOG_LEVEL=info
+
+# greenmask's directory storage won't create the path for you, so make sure it
+# exists before dumping.
+mkdir -p ./.greenmask/dumps
+
+if [[ "$REUSE_EXISTING_DUMP" -eq 0 ]]; then
+  if [[ -z "${DATABASE_URL:-}" ]]; then
+    { set +x; } 2>/dev/null # xtrace would echo the fetched credential URL; pause it
+    DATABASE_URL=$(heroku pg:credentials:url --name "$HEROKU_PG_CREDENTIAL" | grep -Eo 'postgres://[^[:space:]]+')
+    export DATABASE_URL && set -x # resume tracing once the secret is assigned
+  fi
+  greenmask --config .greenmask/config.yml dump --jobs 2 --pgzip --log-level "$LOG_LEVEL"
+fi
+
+if [[ "$DUMP_ONLY" -eq 1 ]]; then
+  exit 0
+fi
+
+# Filter out greenmask's expected-but-noisy phased-restoration warnings without
+# swallowing greenmask's exit code (process substitution, not a pipe, so
+# pipefail can't mask a real greenmask failure).
+readonly NOISE='could not find where to insert IF EXISTS|using neutralized TOC for phased restoration'
+
+greenmask --config .greenmask/config.yml --log-level "$LOG_LEVEL" restore latest --pgzip \
+  2> >(grep -v -E "$NOISE" >&2)
diff --git a/dev/import-db.sh b/dev/import-db.sh
deleted file mode 100755
index 813da6e41..000000000
--- a/dev/import-db.sh
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/usr/bin/env bash
-set -exuo pipefail
-
-export HEROKU_APP=isic
-
-docker-compose start postgres
-
-docker-compose exec postgres bash -c "PGPASSWORD=postgres dropdb --host localhost --username postgres --if-exists django && createdb --host localhost --username postgres django"
-
-# public.stats_imagedownload is large and not usually needed
-docker-compose exec postgres bash -c \
-"pg_dump --format=custom --no-privileges $(heroku config:get DATABASE_URL)\
-  --exclude-table-data public.stats_imagedownload \
-  | PGPASSWORD=postgres pg_restore --host localhost --username postgres --format=custom --no-privileges --no-owner --dbname=django"
-
-docker-compose restart postgres
diff --git a/docker-compose.yml b/docker-compose.yml
index ea09e1fb9..2d3734779 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -2,6 +2,9 @@ services:
   postgres:
     image: pgvector/pgvector:pg18
     shm_size: 128mb
+    # Bumped so the core_imageembedding ivfflat index can build during greenmask
+    # restores.
+    command: ["postgres", "-c", "maintenance_work_mem=512MB"]
     environment:
       POSTGRES_DB: django
       POSTGRES_PASSWORD: postgres