verily-src · vrajat44 · May 8, 2026 · May 15, 2026 · May 15, 2026
@@ -0,0 +1,18 @@
+{
+    "name": "WB Data Catalog v2",
+    "dockerComposeFile": "../docker-compose.yaml",
+    "service": "app",
+    "workspaceFolder": "/workspace",
+    "forwardPorts": [8080],
+    "postCreateCommand": "cd /app && pip install --no-cache-dir -r requirements.txt && pip install -e /workspace/packages/verily-profiler && pip install -e '/workspace/packages/verily-chat[agent]' && cd /workspace/frontend && npm install",
+    "postStartCommand": "cd /app && ./start.sh",
+    "customizations": {
+        "vscode": {
+            "extensions": [
+                "ms-python.python",
+                "ms-python.vscode-pylance",
+                "esbenp.prettier-vscode"
+            ]
+        }
+    }
+}
@@ -0,0 +1,32 @@
+# WB Data Catalog — multi-stage: Vite frontend + FastAPI backend
+
+# ---- Stage 1: build the Vite SPA ----
+FROM node:20-bookworm-slim AS frontend-build
+WORKDIR /build
+# Copy only the manifests first so the dependency layer stays cached until they
+# change. The trailing glob makes the lockfile optional: the COPY still succeeds
+# when frontend/package-lock.json does not exist.
+COPY frontend/package.json frontend/package-lock.json* ./
+# Prefer a reproducible, lockfile-exact install; fall back to `npm install`
+# only when no lockfile is present.
+RUN if [ -f package-lock.json ]; then \
+      npm ci --no-audit --no-fund; \
+    else \
+      npm install --no-audit --no-fund; \
+    fi
+COPY frontend/ ./
+RUN npm run build
+
+# ---- Stage 2: runtime image (FastAPI backend + built SPA) ----
+FROM python:3.11-slim-bookworm
+# curl is only needed by the HEALTHCHECK below.
+RUN apt-get update && apt-get install -y --no-install-recommends curl && rm -rf /var/lib/apt/lists/*
+WORKDIR /app
+ENV PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1
+
+# Install local packages first (they are dependencies for the backend)
+COPY packages/verily-profiler /tmp/verily-profiler
+RUN pip install --no-cache-dir /tmp/verily-profiler && rm -rf /tmp/verily-profiler
+
+COPY packages/verily-chat /tmp/verily-chat
+RUN pip install --no-cache-dir "/tmp/verily-chat[agent]" && rm -rf /tmp/verily-chat
+
+# Third-party backend dependencies; this layer is rebuilt only when
+# requirements.txt (or a layer above it) changes.
+COPY backend/requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY backend/ /app/
+COPY start.sh /app/start.sh
+RUN chmod +x /app/start.sh
+COPY --from=frontend-build /build/dist /app/static
+ENV FRONTEND_DIST=/app/static
+EXPOSE 8080
+# -sS keeps the probe quiet on success but still reports the error on failure.
+HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
+  CMD curl -fsS http://localhost:8080/api/health || exit 1
+# NOTE(review): no USER is set, so the container runs as root. Confirm whether
+# start.sh needs root before switching to an unprivileged user.
+# Absolute path so the entry point does not depend on the working directory.
+CMD ["/app/start.sh"]
@@ -0,0 +1,111 @@
+# WB Data Catalog
+
+React + FastAPI app: browse all BigQuery datasets/tables in a GCP project, preview capped rows, run **technical (C2a)** and **semantic (C2b)** profiling on demand, view profiles from GCS, and get **LLM-suggested charts** (Gemini).
+
+UI uses lightweight **Verily Pre–inspired** tokens and RDS-shaped primitives in `frontend/src/components/rds.tsx` — swap in `@verily-src/rds-*` when your npm registry is configured.
+
+## Environment variables
+
+| Variable | Description |
+|----------|-------------|
+| `GCP_PROJECT_ID` | Billing / ADC project for BigQuery jobs and Vertex AI |
+| `DATA_PROJECT_ID` | Project whose datasets are listed (defaults to `GCP_PROJECT_ID`) |
+| `PROFILE_GCS_BUCKET` | Bucket name (no `gs://`) where `profiling/{project}/{dataset}/{table}/tech_profile.json` and `semantic_profile.json` are stored |
+| `GEMINI_MODEL` | Optional override (e.g. `gemini-2.5-flash`) |
+| `FRONTEND_DIST` | Optional path to the built SPA (the Docker image sets it to `/app/static`; use `./static` when serving from `backend/` locally) |
+
+## Local development
+
+### One-time setup
+
+Create the backend venv and install dependencies (from the catalog root):
+
+```bash
+cd backend
+python3 -m venv .venv
+source .venv/bin/activate
+pip install -r requirements.txt
+pip install -e ../packages/verily-profiler
+pip install -e "../packages/verily-chat[agent]"
+```
+
+Install frontend dependencies (from the catalog root):
+
+```bash
+cd frontend
+npm install
+```
+
+### Running the app
+
+Start **both** servers — the Vite dev server proxies `/api` calls to the backend automatically.
+
+**Terminal 1 — Backend** (from the catalog root):
+
+```bash
+cd backend
+source .venv/bin/activate
+export GCP_PROJECT_ID=your-billing-project
+export DATA_PROJECT_ID=your-data-project   # optional, defaults to GCP_PROJECT_ID
+uvicorn main:app --host 127.0.0.1 --port 8080
+```
+
+**Terminal 2 — Frontend** (from the catalog root):
+
+```bash
+cd frontend
+npm run dev
+```
+
+Open http://localhost:5173/ in your browser. The frontend runs on port 5173 and proxies all `/api/*` requests to the backend on port 8080.
+
+### Production-style single-process serving
+
+Build the SPA into `backend/static` so uvicorn serves everything:
+
+```bash
+cd frontend && npm run build && mkdir -p ../backend/static && rm -rf ../backend/static/* && cp -r dist/* ../backend/static/
+cd ../backend && source .venv/bin/activate && FRONTEND_DIST=./static uvicorn main:app --host 0.0.0.0 --port 8080
+```
+
+## Docker / Compute Engine
+
+**Workbench / local compose:** `docker-compose.yaml` follows [workbench-app-devcontainers](https://github.com/vrajat44/workbench-app-devcontainers/blob/master/README.md) (same pattern as `src/example/docker-compose.yaml`): `container_name: application-server`, external **`app-network`**, and FUSE flags for gcsfuse. Before `docker compose up` locally, create the network once:
+
+```bash
+docker network create app-network
+```
+
+Then:
+
+```bash
+docker compose build
+export GCP_PROJECT_ID=...
+export DATA_PROJECT_ID=...   # optional; defaults to billing project
+docker compose up
+```
+
+Workbench creates `app-network` in its environment; you do not manage that in the cloud UI.
+
+On **Compute Engine**, use a service account with:
+
+- BigQuery: `bigquery.jobs.create`, read metadata and table data for preview/profiling
+- Storage: read/write objects on `PROFILE_GCS_BUCKET`
+- Vertex AI: Gemini access in your region
+
+Reserve a static external IP, allow TCP **8080** in firewall rules, then open `http://<EXTERNAL_IP>:8080`.
+
+## API summary
+
+- `GET /api/catalog` — all datasets + tables + profiling flags from GCS index
+- `GET /api/projects/{p}/datasets/{d}/tables/{t}/preview` — capped preview
+- `POST .../profile/technical` / `POST .../profile/semantic` — start profiling (async)
+- `GET .../profile/status` — `{ technical, semantic }` states
+- `GET .../profile/technical` / `.../semantic` — JSON profiles
+- `POST /api/charts/suggest` — body `{ technical, semantic? }` → suggested charts
+
+## Repo layout
+
+- `backend/` — FastAPI, BQ preview/discovery, profiling runner, chart advisor, vendored `profiler/` package from WB Data Profiler
+- `frontend/` — Vite + React + Recharts
+- `Dockerfile` / `docker-compose.yaml` — production-style container
@@ -0,0 +1,58 @@
+"""Pydantic models for API responses."""
+
+from __future__ import annotations
+
+from typing import Any, Optional
+
+from pydantic import BaseModel, ConfigDict, Field
+
+
+class DatasetSummary(BaseModel):
+    """Summary of a single dataset: its id plus a table count."""
+
+    dataset_id: str
+    # Defaults to 0 when no count is supplied.
+    table_count: int = 0
+
+
+class TableSummary(BaseModel):
+    """Per-table summary: identifiers, optional statistics, and profiling state."""
+
+    # NOTE(review): presumably the fully-qualified table name; the exact format
+    # is set by the producer and is not visible in this module.
+    fq_table: str
+    project_id: str
+    dataset_id: str
+    table_id: str
+    # Optional statistics; None when not supplied.
+    row_count: Optional[int] = None
+    size_bytes: Optional[int] = None
+    table_type: str = "BASE TABLE"
+    column_count: int = 0
+    creation_time: Optional[str] = None
+    # default_factory builds a fresh dict per instance, so the default is never
+    # shared between models. Both keys start at "none".
+    profiling: dict[str, str] = Field(
+        default_factory=lambda: {"technical": "none", "semantic": "none"}
+    )
+    business_name: Optional[str] = None
+    table_definition: Optional[str] = None
+
+
+class CatalogResponse(BaseModel):
+    """Catalog payload: a project id, a profile bucket name, and dataset entries."""
+
+    project_id: str
+    profile_bucket: str
+    # Entries are untyped dicts; their schema is not defined in this module.
+    datasets: list[dict[str, Any]]
+
+
+class JobStartResponse(BaseModel):
+    """Response describing a started job: its id and a status string."""
+
+    job_id: str
+    # Defaults to "running" when no status is supplied.
+    status: str = "running"
+
+
+class ProfileStatusResponse(BaseModel):
+    """Status strings for the technical and semantic profiles (both required)."""
+
+    technical: str
+    semantic: str
+
+
+class ChartSuggestion(BaseModel):
+    """One suggested chart: type, columns, title, and rationale (all defaulted)."""
+
+    # extra="ignore": input fields not declared below are dropped during
+    # validation instead of raising an error.
+    model_config = ConfigDict(extra="ignore")
+
+    chart_type: str = "bar"
+    # default_factory gives every instance its own empty list.
+    columns: list[str] = Field(default_factory=list)
+    title: str = ""
+    rationale: str = ""
+
+
+class ChartsSuggestResponse(BaseModel):
+    """Response wrapper holding a list of ChartSuggestion items."""
+
+    charts: list[ChartSuggestion]
@@ -0,0 +1,86 @@
+"""
+BigQuery table preview: capped sample rows + schema (read-only).
+"""
+
+from __future__ import annotations
+
+import json
+from decimal import Decimal
+from typing import Any, Optional
+
+from google.cloud import bigquery
+
+from verily_profiler.models import BQTableInfo
+
+MAX_PREVIEW_ROWS = 500
+MAX_EXPLORE_ROWS = 5000
+
+
+def _serialize_cell(val: Any) -> Any:
+    if val is None:
+        return None
+    if isinstance(val, (bytes,)):
+        return val.decode("utf-8", errors="replace")
+    if isinstance(val, Decimal):
+        return float(val)
+    if hasattr(val, "isoformat"):
+        return val.isoformat()
+    if isinstance(val, (dict, list)):
+        return json.loads(json.dumps(val, default=str))
+    return val
+
+
+def preview_table(
+    table_info: BQTableInfo,
+    billing_project_id: Optional[str] = None,
+    limit: int = MAX_PREVIEW_ROWS,
+) -> dict[str, Any]:
+    """
+    Run SELECT * FROM table LIMIT N. Returns columns + rows (JSON-serializable).
+    """
+    limit = max(1, min(int(limit), MAX_EXPLORE_ROWS))
+    client = bigquery.Client(project=billing_project_id or table_info.project_id)
+    fq = f"`{table_info.project_id}.{table_info.dataset_id}.{table_info.table_id}`"
+    sql = f"SELECT * FROM {fq} LIMIT {limit}"
+
+    query_job = client.query(sql)
+    rows = list(query_job.result())
+    schema_fields = list(query_job.schema or [])
+
+    if schema_fields:
+        columns_meta = [
+            {
+                "name": f.name,
+                "type": f.field_type,
+                "mode": getattr(f, "mode", "NULLABLE"),
+            }
+            for f in schema_fields
+        ]
+    elif table_info.columns:
+        columns_meta = [
+            {
+                "name": c.column_name,
+                "type": c.data_type,
+                "mode": "NULLABLE" if c.is_nullable == "YES" else "REQUIRED",
+                "description": c.description,
+            }
+            for c in table_info.columns
+        ]
+    elif rows:
+        columns_meta = [{"name": k, "type": "UNKNOWN", "mode": "NULLABLE"} for k in rows[0].keys()]
+    else:
+        columns_meta = []
+
+    out_rows: list[list[Any]] = []
+    for r in rows:
+        out_rows.append([_serialize_cell(r[k]) for k in r.keys()])
+
+    total_rows = table_info.row_count
+    return {
+        "fq_table": table_info.fq_name,
+        "columns": columns_meta,
+        "rows": out_rows,
+        "preview_row_count": len(out_rows),
+        "total_rows": total_rows,
+        "size_bytes": table_info.size_bytes,
+    }