diff --git a/.github/workflows/test-pr.yaml b/.github/workflows/test-pr.yaml index 7790bcc65..72a9a7a89 100644 --- a/.github/workflows/test-pr.yaml +++ b/.github/workflows/test-pr.yaml @@ -75,6 +75,15 @@ jobs: - 'features/src/postgres-client/**' - 'src/aou-common/**' - 'src/nemo_jupyter/**' + workbench-jupyter-with-llm: + maximize_build_space: true + filters: + - 'features/src/workbench-tools/**' + - 'features/src/postgres-client/**' + - 'features/src/llm-context/**' + - 'features/src/wb-mcp-server/**' + - 'features/src/gemini-cli/**' + - 'src/jupyter-common/**' workbench-jupyter-parabricks: maximize_build_space: true filters: diff --git a/features/src/llm-context/README.md b/features/src/llm-context/README.md new file mode 100644 index 000000000..84d4bac9c --- /dev/null +++ b/features/src/llm-context/README.md @@ -0,0 +1,131 @@ +# LLM Context Generator (llm-context) + +Generates `~/CLAUDE.md` context file for LLMs (Claude Code, Gemini CLI, etc.) with Workbench workspace information. Claude Code auto-discovers this file on startup. + +## Example Usage + +```json +"features": { + "ghcr.io/verily-src/workbench-app-devcontainers/llm-context:1": { + "username": "jupyter", + "userHomeDir": "/home/jupyter" + } +} +``` + +Or for local development: + +```json +"features": { + "./.devcontainer/features/llm-context": { + "username": "jupyter", + "userHomeDir": "/home/jupyter" + } +} +``` + +## Options + +| Options Id | Description | Type | Default Value | +|-----|-----|-----|-----| +| username | Username of the container user | string | root | +| userHomeDir | Home directory of the container user | string | /root | + +## What It Does + +When installed, this feature: + +1. **Generates `~/CLAUDE.md`** - Claude Code auto-discovers this file on startup +2. **Provides workspace context** - Name, ID, role, resources, cloud paths +3. **Includes skill files** - Detailed guides (e.g., custom app creation) in `~/.claude/skills/` +4. 
**Sets up aliases** - `generate-llm-context`, `refresh-context` + +## What's in `~/CLAUDE.md` + +- **Quick Rules** - When to use this file vs. MCP/CLI +- **Current Workspace** - Name, ID, description, role, cloud platform +- **Resource Paths** - JSON lookup for all resources (GCS, BigQuery, etc.) +- **Data Persistence** - Warning + save commands +- **Data Exploration** - Common BigQuery/GCS commands +- **MCP Tools** - Available tools and CLI equivalents +- **Skills** - Links to detailed guides + +## When Context Gets Generated + +1. **Automatically on app start** - Via `postStartCommand` (after bucket mounting completes) +2. **Manually** - Run `generate-llm-context` or `refresh-context` + +**Important**: Add the context generation to your `postStartCommand` in `.devcontainer.json`: + +```json +"postStartCommand": [ + "bash", + "-c", + "./startupscript/remount-on-restart.sh jupyter /home/jupyter \"${templateOption:cloud}\" \"${templateOption:login}\" && /opt/llm-context/generate-context.sh /home/jupyter" +] +``` + +**Note**: Pass the user home directory (e.g., `/home/jupyter`) as an argument because `postStartCommand` runs as root, not as the container user. + +This ensures context is generated AFTER authentication and workspace setup complete. + +## MCP Integration + +This feature works well alongside the `wb-mcp-server` feature: +- **`llm-context`** provides static context (workspace info, resource paths) +- **`wb-mcp-server`** provides dynamic tools (search, create, modify) + +For optimal LLM experience, use both: + +```json +"features": { + "./.devcontainer/features/llm-context": {}, + "./.devcontainer/features/wb-mcp-server": {} +} +``` + +## Troubleshooting + +### Context not generating? + +```bash +# Check if workspace is set +wb workspace describe + +# If not authenticated: +wb auth login --mode=APP_DEFAULT_CREDENTIALS +wb workspace set + +# Then generate manually: +generate-llm-context +``` + +### Claude Code not seeing context? 
+ +```bash +# Check file exists +ls -la ~/CLAUDE.md + +# Check it's not empty +head ~/CLAUDE.md +``` + +## File Locations + +| File | Purpose | +|------|---------| +| `/opt/llm-context/generate-context.sh` | Main generation script | +| `/opt/llm-context/run-context-generator.sh` | Auto-run wrapper | +| `~/.claude/CLAUDE.md` | Generated context (primary) | +| `~/CLAUDE.md` | Symlink for auto-discovery | +| `~/.claude/skills/` | Skill files (e.g., CUSTOM_APP.md) | + +## Notes + +- This feature requires the Workbench CLI (`wb`) to be installed +- `jq` is automatically installed if not present +- Context is only generated if a workspace is set (`wb workspace describe` succeeds) + +--- + +_Note: This feature is automatically configured to work with the `wb-mcp-server` feature if both are installed._ diff --git a/features/src/llm-context/devcontainer-feature.json b/features/src/llm-context/devcontainer-feature.json new file mode 100644 index 000000000..e052c3936 --- /dev/null +++ b/features/src/llm-context/devcontainer-feature.json @@ -0,0 +1,22 @@ +{ + "id": "llm-context", + "version": "1.2.0", + "name": "LLM Context Generator", + "description": "Generates ~/CLAUDE.md context file for LLMs (Claude Code, Gemini, etc.) with Workbench workspace information. Claude Code auto-discovers this file on startup.", + "options": { + "username": { + "type": "string", + "default": "root", + "description": "Username of the container user." + }, + "userHomeDir": { + "type": "string", + "default": "/root", + "description": "Home directory of the container user." 
+ } + }, + "installsAfter": [ + "ghcr.io/devcontainers/features/common-utils", + "./.devcontainer/features/workbench-tools" + ] +} diff --git a/features/src/llm-context/generate-context.sh b/features/src/llm-context/generate-context.sh new file mode 100755 index 000000000..99534c57a --- /dev/null +++ b/features/src/llm-context/generate-context.sh @@ -0,0 +1,866 @@ +#!/bin/bash +# shellcheck disable=SC2016 # Single-quoted strings with $ and backticks are intentional template text +# +# Workbench LLM Context Generator +# +# This script generates a single CLAUDE.md file that provides LLMs (like +# Claude Code) with full context about the current Workbench workspace, +# resources, workflows, and available tools. The file includes embedded +# JSON for machine-readable data. +# +# Usage: ./generate-context.sh +# +# Prerequisites: +# - Workbench CLI (wb) installed and authenticated +# - jq installed for JSON processing +# - Active workspace set (wb workspace set ) +# +# CLI JSON Field Reference: +# Workspace (UFWorkspaceLight.java): +# - id: user-facing ID (e.g., "my-workspace") +# - uuid: UUID +# - name: display name +# - description +# - cloudPlatform: GCP or AWS +# - googleProjectId, awsAccountId +# - highestRole: OWNER, WRITER, READER +# - orgId, podId +# - userEmail +# - createdDate, lastUpdatedDate +# - properties: Map +# +# Resource (UFResource.java): +# - id: resource name +# - uuid +# - description +# - resourceType: GCS_BUCKET, BQ_DATASET, GIT_REPO, GCS_OBJECT, BQ_TABLE (GCP) +# AWS_S3_STORAGE_FOLDER, AWS_AURORA_DATABASE, AWS_AURORA_DATABASE_REFERENCE (AWS) +# - stewardshipType: CONTROLLED, REFERENCED +# - region +# - For GCS: bucketName, location +# - For BQ: projectId, datasetId +# +# Workflow (UFWorkflow.java): +# - id: name +# - workflowId: UUID +# - displayName +# - description +# - bucketSource or gitSource +# + +set -e + +# Configuration — accept an optional home directory argument (e.g., /config, /home/jupyter) +USER_HOME="${1:-${HOME}}" 
+CONTEXT_DIR="${USER_HOME}/.claude" +SKILLS_DIR="${CONTEXT_DIR}/skills" +CLAUDE_FILE="${CONTEXT_DIR}/CLAUDE.md" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +log_info() { + echo -e "${GREEN}[INFO]${NC} $1" >&2 +} + +log_warn() { + echo -e "${YELLOW}[WARN]${NC} $1" >&2 +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" >&2 +} + +# Check prerequisites +check_prerequisites() { + log_info "Checking prerequisites..." + + if ! command -v wb &> /dev/null; then + log_error "Workbench CLI (wb) not found. Please install it first." + exit 1 + fi + + if ! command -v jq &> /dev/null; then + log_error "jq is required but not found. Please install jq." + exit 1 + fi + + # Check if workspace is set + if ! wb workspace describe --format=json &> /dev/null; then + log_error "No workspace set or not authenticated. Please run:" + log_error " wb auth login (GCP: add --mode=APP_DEFAULT_CREDENTIALS inside Workbench apps)" + log_error " wb workspace set " + exit 1 + fi + + log_info "Prerequisites OK" +} + +# Create output directory +setup_directories() { + log_info "Setting up directories..." + mkdir -p "${CONTEXT_DIR}" + mkdir -p "${SKILLS_DIR}" +} + +# Install skill files from /opt/llm-context/skills/ (copied at install time) +# $1: cloud_platform — "GCP" (default) or "AWS" +install_skills() { + local cloud_platform="${1:-GCP}" + local source_skills="/opt/llm-context/skills" + log_info "Installing skill files..." + + if [[ ! 
-d "${source_skills}" ]]; then + log_warn "Skill source directory not found at ${source_skills}, skipping skill installation" + return + fi + + # Copy all base skill files + for skill_file in "${source_skills}"/*.md; do + [[ -f "${skill_file}" ]] && cp "${skill_file}" "${SKILLS_DIR}/" + done + + # Copy scientific skills + if [[ -d "${source_skills}/scientific" ]]; then + mkdir -p "${SKILLS_DIR}/scientific" + for skill_file in "${source_skills}/scientific"/*.md; do + [[ -f "${skill_file}" ]] && cp "${skill_file}" "${SKILLS_DIR}/scientific/" + done + fi + + # AWS-specific skill overrides — overwrite only the platform-sensitive skills. + if [ "$cloud_platform" = "AWS" ] && [[ -d "${source_skills}/aws" ]]; then + log_info "Applying AWS skill variants for WORKFLOW_TROUBLESHOOT and DASHBOARD_BUILDER..." + for skill_file in "${source_skills}/aws"/*.md; do + [[ -f "${skill_file}" ]] && cp "${skill_file}" "${SKILLS_DIR}/" + done + log_info "AWS skill variants applied." + fi + + log_info "Skill files installed." +} + +# Fetch workspace information +fetch_workspace() { + log_info "Fetching workspace information..." + wb workspace describe --format=json 2>/dev/null || echo "{}" +} + +# Fetch resources +fetch_resources() { + log_info "Fetching resources..." + wb resource list --format=json 2>/dev/null || echo "[]" +} + +# Fetch workflows (may not exist in all workspaces) +fetch_workflows() { + log_info "Fetching workflows..." + wb workflow list --format=json 2>/dev/null || echo "[]" +} + +# Fetch apps +fetch_apps() { + log_info "Fetching apps..." + wb app list --format=json 2>/dev/null || echo "[]" +} + +# Generate embedded JSON (returns JSON to stdout, doesn't write to file) +generate_embedded_json() { + local resources="$1" + + # Build both maps in a single jq invocation so no intermediate bash variables + # are passed via --argjson (which is sensitive to embedded newlines and encoding + # edge cases on some jq versions). A jq `def` avoids repeating the path expression. 
+ # `(if type == "array" then . else [] end)` guards against non-array input. + local result + result=$(printf '%s' "${resources:-[]}" | jq -c ' + def cloud_path: + if .resourceType == "GCS_BUCKET" then "gs://\(.bucketName)" + elif .resourceType == "AWS_S3_STORAGE_FOLDER" then "s3://\(.bucketName // "unknown")/\(.prefix // "")" + elif .resourceType == "AWS_AURORA_DATABASE" then "\(.rwEndpoint // "unknown"):\(.port // "5432")/\(.databaseName // "")" + elif .resourceType == "BQ_DATASET" then "\(.projectId).\(.datasetId)" + elif .resourceType == "BQ_TABLE" then "\(.projectId).\(.datasetId).\(.tableId // "")" + elif .resourceType == "GIT_REPO" then .gitRepoUrl + elif .resourceType == "GCS_OBJECT" then "gs://\(.bucketName)/\(.objectName // "")" + else null end; + (if type == "array" then . else [] end) | + { + "resourcePaths": (map({key: .id, value: cloud_path}) | map(select(.value != null)) | from_entries), + "envVars": (map({key: ("WORKBENCH_" + (.id | gsub("-";"_"))), value: cloud_path}) | map(select(.value != null)) | from_entries) + } + ' 2>/dev/null | head -1) + [[ -n "${result}" ]] || result='{"resourcePaths":{},"envVars":{}}' # fallback assigned separately: inside ${result:-{...}} the first } would end the expansion and append stray text to valid output + printf '%s\n' "${result}" +} + +# Generate bucket list for data persistence section +generate_bucket_list() { + local resources="$1" + local cloud_platform="${2:-GCP}" + + if [ "$cloud_platform" = "AWS" ]; then + local buckets + buckets=$(echo "$resources" | jq '[.[] | select(.resourceType == "AWS_S3_STORAGE_FOLDER")]' 2>/dev/null || echo "[]") + local count + count=$(echo "$buckets" | jq 'length' 2>/dev/null || echo "0") + + if [ "$count" -eq 0 ] || [ "$count" = "0" ]; then + echo "*No S3 buckets in this workspace.* Create one with:" + echo '```bash' + echo 'wb resource create s3-storage-folder --name my-storage --description "Storage for results"' + echo '```' + return + fi + + echo "| Bucket Name | Resource ID | Description |" + echo "|-------------|-------------|-------------|" + echo "$buckets" | jq -r '.[] | "| `s3://\(.bucketName // "unknown")/\(.prefix // 
"")` | `\(.id // "—")` | \(.description // "—" | if . == "" then "—" else . end) |"' 2>/dev/null || true + else + # GCP + local buckets + buckets=$(echo "$resources" | jq '[.[] | select(.resourceType == "GCS_BUCKET")]' 2>/dev/null || echo "[]") + local count + count=$(echo "$buckets" | jq 'length' 2>/dev/null || echo "0") + + if [ "$count" -eq 0 ] || [ "$count" = "0" ]; then + echo "*No GCS buckets in this workspace.* Create one with:" + echo '```bash' + echo 'wb resource create gcs-bucket --name my-storage --description "Storage for results"' + echo '```' + return + fi + + echo "| Bucket Name | Resource ID | Description |" + echo "|-------------|-------------|-------------|" + echo "$buckets" | jq -r '.[] | "| `gs://\(.bucketName // "unknown")/` | `\(.id // "—")` | \(.description // "—" | if . == "" then "—" else . end) |"' 2>/dev/null || true + fi +} + +# Generate CLAUDE.md +generate_claude_md() { + log_info "Generating CLAUDE.md..." + + local workspace="$1" + local resources="$2" + # $3 (workflows) and $4 (apps) reserved for future use + + # Extract workspace values - field names match UFWorkspaceLight.java + local ws_name ws_id ws_desc ws_cloud ws_gcp_project ws_aws_account ws_role ws_user ws_org ws_server + ws_name=$(echo "$workspace" | jq -r '.name // "Unnamed Workspace"') + ws_id=$(echo "$workspace" | jq -r '.id // "unknown"') + ws_desc=$(echo "$workspace" | jq -r '.description // "No description"') + ws_cloud=$(echo "$workspace" | jq -r '.cloudPlatform // "GCP"') + ws_gcp_project=$(echo "$workspace" | jq -r '.googleProjectId // ""') + ws_aws_account=$(echo "$workspace" | jq -r '.awsAccountId // ""') + ws_role=$(echo "$workspace" | jq -r '.highestRole // "READER"') + ws_user=$(echo "$workspace" | jq -r '.userEmail // "unknown"') + ws_org=$(echo "$workspace" | jq -r '.orgId // ""') + ws_server=$(echo "$workspace" | jq -r '.serverName // ""') + + # Determine project display + local project_display="$ws_gcp_project" + if [ -n "$ws_aws_account" ] && [ 
"$ws_aws_account" != "null" ] && [ "$ws_aws_account" != "" ]; then + project_display="$ws_aws_account" + fi + + # Set platform-specific template content (generator branches; output file is clean, no conditionals) + local storage_bucket_type storage_save_cmd resource_table_rows + local mcp_data_resources_rows cloud_cli_section cloud_path_hint env_var_example + local data_preview_query_section create_resources_section + if [ "$ws_cloud" = "AWS" ]; then + storage_bucket_type="S3 bucket" + storage_save_cmd='aws s3 cp s3:///' + resource_table_rows='| `AWS_S3_STORAGE_FOLDER` | AWS S3 storage folder | `wb resource create s3-storage-folder` | +| `AWS_AURORA_DATABASE` | Aurora PostgreSQL database | `wb resource create aurora-database` | +| `AWS_AURORA_DATABASE_REFERENCE` | Aurora DB reference (external) | `wb resource add-ref aurora-database` | +| `GIT_REPO` | Git repository reference | `wb resource add-ref git-repo` |' + + mcp_data_resources_rows='| `workspace_list_data_collections` | N/A | **List data collections and their resources** | +| `workspace_list_resources` | `wb resource list` | List all resources in the workspace | +| `resource_list_tree` | `wb resource list-tree` | List resources organized by folder | +| `list_files` | `aws s3 ls` | List files in an S3 storage folder | +| `read_file` | `aws s3 cp -` | Read contents of a file from S3 | +| `resource_create_bucket` | `wb resource create s3-storage-folder` | Create a new S3 storage folder | +| `resource_delete` | `wb resource delete` | Delete a resource | +| `resource_check_access` | — | Check if IAM role has access to a resource |' + + cloud_cli_section='### Cloud CLIs + +No direct AWS CLI MCP wrapper — use `aws` CLI commands in the terminal: +- **S3**: `aws s3 ls s3:///`, `aws s3 cp ` +- **Batch**: `aws batch list-jobs --job-queue --job-status FAILED` +- **Aurora**: requires IAM auth token — see Aurora connection instructions in DASHBOARD_BUILDER skill' + + cloud_path_hint='# Look for: bucketName+prefix (S3), 
rwEndpoint+port+databaseName (Aurora), gitRepoUrl' + + env_var_example='echo $WORKBENCH_my_bucket # → s3://bucket/prefix +env | grep WORKBENCH_ # List all' + + data_preview_query_section='**S3:** +```bash +aws s3 ls s3://// +aws s3 cp s3:////file.csv - | head -20 +``` + +**Aurora PostgreSQL** (requires IAM auth + SSL — plain passwords are rejected): +```bash +# Step 1: get temporary credentials from Workbench +wb resource credentials --id= --scope=WRITE_READ --format=json +# Returns: {"AccessKeyId":"...","SecretAccessKey":"...","SessionToken":"..."} + +# Step 2: export credentials, generate auth token, connect +export AWS_ACCESS_KEY_ID="..." AWS_SECRET_ACCESS_KEY="..." AWS_SESSION_TOKEN="..." +TOKEN=$(aws rds generate-db-auth-token --hostname --port 5432 --region us-west-2 --username ) +PGSSLMODE=require psql "host= port=5432 dbname= user= password=$TOKEN" +# \dt → list tables; SELECT * FROM table_name LIMIT 10; +``` + +### Query Data + +**Python (S3):** +```python +import boto3, pandas as pd + +s3 = boto3.client("s3") +obj = s3.get_object(Bucket="", Key="/file.csv") +df = pd.read_csv(obj["Body"]) + +# Read Parquet directly (requires s3fs) +df = pd.read_parquet("s3:////file.parquet") +``` + +**Python (Aurora — IAM auth required):** +```python +import json, subprocess, boto3, psycopg2 + +# Get temporary credentials from Workbench +creds = json.loads(subprocess.run( + ["wb", "resource", "credentials", "--id=", "--scope=WRITE_READ", "--format=json"], + capture_output=True, text=True, check=True +).stdout) + +# Generate IAM auth token +session = boto3.Session( + aws_access_key_id=creds["AccessKeyId"], + aws_secret_access_key=creds["SecretAccessKey"], + aws_session_token=creds["SessionToken"], + region_name="us-west-2" +) +auth_token = session.client("rds").generate_db_auth_token( + DBHostname="", Port=5432, DBUsername="", Region="us-west-2" +) + +# Connect — sslmode="require" is mandatory +conn = psycopg2.connect( + host="", port=5432, database="", + user="", 
password=auth_token, sslmode="require" +) +df = pd.read_sql("SELECT * FROM table_name LIMIT 100", conn) +conn.close() +```' + + create_resources_section='```bash +# S3 storage folder +wb resource create s3-storage-folder --name my-storage --description "My storage folder" + +# Aurora PostgreSQL database +wb resource create aurora-database --name my-db --description "My database" + +# Reference an external Aurora database +wb resource add-ref aurora-database --name external-db +```' + + else + storage_bucket_type="GCS bucket" + storage_save_cmd='gsutil cp gs:///' + resource_table_rows='| `GCS_BUCKET` | Google Cloud Storage bucket | `wb resource create gcs-bucket` | +| `BQ_DATASET` | BigQuery dataset | `wb resource create bq-dataset` | +| `GIT_REPO` | Git repository reference | `wb resource add-ref git-repo` | +| `GCS_OBJECT` | Individual GCS file reference | `wb resource add-ref gcs-object` | +| `BQ_TABLE` | BigQuery table reference | `wb resource add-ref bq-table` |' + + mcp_data_resources_rows='| `workspace_list_data_collections` | N/A | **List data collections and their resources** | +| `workspace_list_resources` | `wb resource list` | List all resources in the workspace | +| `resource_list_tree` | `wb resource list-tree` | List resources organized by folder | +| `bq_execute` | `bq query` | Run SQL queries against BigQuery | +| `list_files` | `gsutil ls` | List files in a GCS bucket | +| `read_file` | `gsutil cat` | Read contents of a file | +| `resource_create_bucket` | `wb resource create gcs-bucket` | Create a new GCS bucket | +| `resource_delete` | `wb resource delete` | Delete a resource | +| `resource_check_access` | — | Check if service account has access to a resource | +| `resource_mount` / `resource_unmount` | — | Mount/unmount a GCS bucket |' + + cloud_cli_section='### Cloud CLIs (via MCP) + +| MCP Tool | Description | +|----------|-------------| +| `gcloud_execute` | Run any `gcloud` command | +| `gsutil_execute` | Run any `gsutil` command | +| 
`bq_execute` | Run any `bq` SQL query |' + + cloud_path_hint='# Look for: bucketName, projectId+datasetId, gitRepoUrl' + + env_var_example='echo $WORKBENCH_my_bucket # → gs://actual-bucket-name +env | grep WORKBENCH_ # List all' + + data_preview_query_section='**BigQuery:** +```bash +bq head -n 10 :. +bq show --schema :.
+bq query --use_legacy_sql=false '"'"'SELECT * FROM `project.dataset.table` LIMIT 10'"'"' +``` + +**GCS:** +```bash +gsutil ls gs:/// +gsutil cat -r 0-1024 gs:///path/file.csv +``` + +### Query Data + +**CLI:** +```bash +bq query --use_legacy_sql=false '"'"'SELECT col1, col2 FROM `project.dataset.table` LIMIT 100'"'"' +``` + +**Python:** +```python +from google.cloud import bigquery +client = bigquery.Client() +df = client.query("SELECT * FROM `project.dataset.table` LIMIT 100").to_dataframe() + +import pandas as pd +df = pd.read_parquet("gs://bucket-name/path/file.parquet") +```' + + create_resources_section='```bash +# GCS bucket +wb resource create gcs-bucket --name my-bucket --description "My bucket" + +# BigQuery dataset +wb resource create bq-dataset --name my-dataset --description "My dataset" + +# Reference external GCS bucket +wb resource add-ref gcs-bucket --name external-data --bucket-name existing-bucket +```' + fi + + # Generate dynamic sections + local embedded_json bucket_list + embedded_json=$(generate_embedded_json "$resources") + bucket_list=$(generate_bucket_list "$resources" "$ws_cloud") + + # Write the file + cat > "${CLAUDE_FILE}" << EOF +# Workbench Context + +You are working inside **Verily Workbench**, a secure cloud-based research environment for biomedical data analysis. + +--- + +## Current Workspace + +| Property | Value | +|----------|-------| +| **Name** | ${ws_name} | +| **ID** | \`${ws_id}\` | +| **Cloud Platform** | ${ws_cloud} | +| **Project/Account** | \`${project_display}\` | +| **Your Role** | ${ws_role} | +| **User** | ${ws_user} | +| **Organization** | ${ws_org:-"—"} | +| **Server** | ${ws_server:-"—"} | + +### Description +${ws_desc} + +--- + +## Key Concepts + +### Workspaces +A **workspace** is a secure container for your research project. It contains: +- **Resources**: Cloud assets like buckets, datasets, repos +- **Workflows**: Reproducible analysis pipelines +- **Apps**: Interactive compute environments (this app!) 
+- **Policies**: Access controls and constraints + +### Resources +Resources are cloud assets managed by Workbench: + +| Type | Description | CLI Create Command | +|------|-------------|-------------------| +${resource_table_rows} + +**Environment Variables**: Each resource is available as \`\$WORKBENCH_\` (e.g., \`\$WORKBENCH_my_bucket\`). + +### Data Collections +Curated datasets published to the Workbench catalog. When added to a workspace, their resources are cloned as **folders** — they may look like user-created resources but originated externally. Common types include clinical data (OMOP, FHIR), genomics (VCF, BAM), and wearables. + +Data collections can carry **policies** that restrict how their data is used (region, export controls, access groups). + +**To identify resources from data collections:** +1. Use \`workspace_list_data_collections\` — groups resources by source collection (preferred) +2. Or use \`workspace_list_resources\` with \`workspaceId\` — returns full resource metadata including \`resourceLineage\`, which contains the source collection ID and original resource ID + +### Workflows +Workflows are reproducible pipelines in WDL or Nextflow format, registered in the workspace. + +### Policies & Constraints +Workspaces may have policies that restrict: +- **Region**: Where data and compute must reside +- **Groups**: Who can access the workspace +- **Export**: Whether data can leave the workspace + +Check with: \`wb workspace describe\` + +--- + +## ⚠️ Important: Data Persistence + +Local app storage is ephemeral — files saved to the app's local disk are **lost when the app stops or restarts**. Always encourage users to save important work to a ${storage_bucket_type} in their workspace. 
+ +- **When users create files locally**, suggest saving to a bucket: \`${storage_save_cmd}\` +- **When users finish analysis**, remind: *"Save important outputs to cloud storage before stopping the app."* +- **Available buckets in this workspace:** + +${bucket_list} + +--- + +## Most Commonly Used MCP Tools + +> **Always use MCP tools before falling back to CLI. MCP tools return structured JSON and are faster.** + +| Interface | Best For | +|-----------|----------| +| **MCP Tools** | List/query operations — structured responses, no shell needed | +| **CLI (\`wb\`)** | Complex operations or anything not covered by MCP | + +### Data & Resources + +| MCP Tool | CLI Equivalent | Description | +|----------|----------------|-------------| +${mcp_data_resources_rows} + +### Apps & Workflows + +| MCP Tool | CLI Equivalent | Description | +|----------|----------------|-------------| +| \`app_list\` | \`wb app list\` | List running apps | +| \`app_create\` | \`wb app create\` | Create a new custom app | +| \`app_get_url\` | — | Get the proxy URL for a running app | +| \`app_start\` / \`app_stop\` | \`wb app start/stop\` | Start or stop an app | +| \`workflow_list\` | \`wb workflow list\` | List available workflows | +| \`workflow_job_run\` | \`wb workflow run\` | Submit a WDL/Nextflow workflow | +| \`workflow_job_list\` | \`wb workflow job list\` | List workflow job runs | +| \`workflow_job_describe\` | \`wb workflow job describe\` | Get details of a specific job run | +| \`workflow_job_cancel\` | \`wb workflow job cancel\` | Cancel a running job | +| \`get_workflow_status\` | \`wb workflow describe\` | Check status of a workflow run | + +### Data Explorer + +| MCP Tool | Description | +|----------|-------------| +| \`underlay_list\` | List available data underlays (datasets in the Data Explorer catalog) | +| \`underlay_get_schema\` | Get the schema for a specific underlay | +| \`underlay_list_entities\` | List entity types in an underlay (e.g. 
person, condition) | +| \`data_sample_instances\` | Sample rows from an entity within a cohort | +| \`data_query_hints\` | Get value hints for filtering an entity attribute | +| \`study_list\` | List studies available in Data Explorer | +| \`study_list_cohorts\` | List cohorts within a study | +| \`cohort_create_in_workspace\` | Create a cohort in the workspace | +| \`cohort_count_instances\` | Count members in a cohort | +| \`export_cohort\` | Export cohort data to a bucket | + +${cloud_cli_section} + +**Not available via MCP (use CLI):** \`wb workspace set\`, \`wb auth login\`, \`wb workflow logs\` + +## CLI Quick Reference + +\`\`\`bash +# Workspace +wb workspace describe # Current workspace details +wb workspace list # All your workspaces +wb workspace set # Switch workspace + +# Resources +wb resource list # List resources +wb resource describe # Resource details +wb resource delete # Delete resource + +# Workflows +wb workflow list # List workflows +wb workflow run # Run workflow +wb workflow describe # Run status +wb workflow logs # Run logs + +# Apps +wb app list # List running apps +wb app describe # App details + +# Auth +wb auth status # Check authentication +wb auth login # Re-authenticate +\`\`\` + +--- + +## Data Discovery & Querying + +> **⚡ MCP FIRST:** Always check if an MCP tool exists before using CLI commands. 
+ +### Find Your Resources + +**Use MCP tools (preferred):** +| What You Need | MCP Tool | +|---------------|----------| +| Data collections + their resources | \`workspace_list_data_collections\` | +| All resources (flat list) | \`workspace_list_resources\` | +| Resources organized by folder | \`resource_list_tree\` | + +**CLI fallback:** +\`\`\`bash +wb resource list --format=json | jq '.[] | {name: .id, type: .resourceType}' +\`\`\` + +### Get the Cloud Path for a Resource + +\`\`\`bash +wb resource describe --format=json +${cloud_path_hint} +\`\`\` + +### Use Environment Variables (Easiest) + +\`\`\`bash +${env_var_example} +\`\`\` + +### Preview Data + +${data_preview_query_section} + +--- + +## How to Run Workflows + +\`\`\`bash +# List workflows +wb workflow list + +# Run a workflow +wb workflow run --input param=value + +# Check status +wb workflow describe + +# View logs +wb workflow logs +\`\`\` + +--- + +## How to Create Resources + +${create_resources_section} + +--- + +## ⚠️ Workbench Web Apps & Proxy URLs + +> **🚨 If the user wants a dashboard, chart, Flask app, HTML page, or ANY web UI — read \`~/.claude/skills/DASHBOARD_BUILDER.md\` first.** + +### Proxy URL Format + +The proxy URL is the **only valid way** to access web apps in Workbench: +\`\`\` +https://workbench.verily.com/app/[APP_UUID]/proxy/[PORT]/[PATH] +\`\`\` + +Retrieve the App UUID automatically: +\`\`\`bash +wb app list --format=json | jq -r '.[] | select(.status == "RUNNING") | .id' | head -1 +\`\`\` + +### Common Ports + +| Content Type | Port | +|--------------|------| +| Flask/FastAPI | 8080 | +| Streamlit | 8501 | +| Static HTML | 8000 | +| R Shiny | 3838 | + +### ⚠️ JavaScript: Always Use Relative Paths + +All \`fetch()\` calls in JavaScript **must** use relative paths (no leading \`/\`): + +\`\`\`javascript +fetch('api/data') // ✅ resolves to workbench.verily.com/app/UUID/proxy/8080/api/data +fetch('/api/data') // ❌ resolves to workbench.verily.com/api/data — 404! 
+\`\`\` + +### ❌ Wrong URL Formats + +\`\`\` +https://UUID.workbench-app.verily.com/ ← Bad Request error +http://localhost:8080/ ← Not accessible externally +file:///home/jupyter/dashboard.html ← JavaScript blocked +\`\`\` + +--- + +## Available Skills + +### Workbench Skills + +Read these directly — no index needed: + +| Topic | Skill File | When to Use | +|-------|------------|-------------| +| **🔍 Data discovery** | \`DATA_DISCOVERY.md\` | Find data collections inside or across all of Workbench | +| **🚨 Dashboards, Web UIs** | \`DASHBOARD_BUILDER.md\` | Dashboard, Flask, Streamlit, web UI, plots on a port | +| Building custom apps | \`CUSTOM_APP.md\` | Deployable Workbench apps | +| App templates | \`APP_TEMPLATES.md\` | Pre-built templates for dashboards, APIs, file processors | +| **Workflow debugging** | \`WORKFLOW_TROUBLESHOOT.md\` | Failed WDL/Nextflow, logs, memory/disk issues | + +### Scientific Skills + +> **📚 Read \`~/.claude/skills/SCIENTIFIC_SKILLS_INDEX.md\` first** to navigate scientific domain skills. + +| Domain | Skill File | Covers | +|--------|------------|--------| +| 🧬 Bioinformatics | \`scientific/BIOINFORMATICS.md\` | scanpy, anndata, pydeseq2, biopython, scvelo | +| 💊 Drug Discovery | \`scientific/DRUG_DISCOVERY.md\` | rdkit, deepchem, chembl, drugbank, opentargets | +| 🔬 Genomics DBs | \`scientific/GENOMICS_DATABASES.md\` | ensembl, uniprot, clinvar, pdb | +| 📊 Data Analysis | \`scientific/DATA_ANALYSIS.md\` | sklearn, statsmodels, plotly, seaborn | +| 🏥 Clinical | \`scientific/CLINICAL.md\` | clinicaltrials.gov, pubmed, lifelines | + +### ⚡ Skill Trigger Guide + +**ALWAYS read \`DATA_DISCOVERY.md\` BEFORE calling \`platform_list_data_collections\`.** The skill controls the full discovery flow including scope clarification, result presentation, and how to add a collection to the workspace. 
+ +Trigger \`DATA_DISCOVERY.md\` whenever the user is searching for data collections platform-wide: +- "find data collections" / "search for data collections" / "find data collections with [keyword]" +- "find data collections across Workbench" / "search all data collections I have access to" +- "what data collections can I add?" / "data collections I haven't added yet" +- "find a data collection related to [topic / disease / gene / modality]" +- "are there data collections about [topic]?" / "find data collections that have [keyword]" +- Do NOT use this skill for workspace-scoped questions — call \`workspace_list_data_collections\` directly instead + +**ALWAYS read \`DASHBOARD_BUILDER.md\` FIRST when user says ANY of these:** +- "create a dashboard" +- "visualize data" / "show me a chart" / "display data" +- "build a Flask app" / "run Flask" / "Flask server" +- "Streamlit" / "Plotly" / "interactive chart" +- "run on port" / "serve HTML" / "web page" +- "show in browser" / "open in new tab" +- Any request to display data interactively + +**Read \`CUSTOM_APP.md\` when:** +- "build a deployable app" / "create a custom app" +- "API service" / "backend" / "from scratch" + +**Read \`APP_TEMPLATES.md\` when:** +- "dashboard template" / "starter template" / "pre-built app" +- "what templates are available" / "which template should I use" + +**Read \`WORKFLOW_TROUBLESHOOT.md\` when:** +- "troubleshoot my workflow" / "fix my workflow" +- "my workflow failed" / "workflow error" / "debug workflow" +- "troubleshoot my job" / "my job failed" / "workflow job failed" +- "job failed" / "task failed" / "out of memory" +- "check logs" / "why did it fail" / "troubleshoot" + +**Read \`SCIENTIFIC_SKILLS_INDEX.md\` then the relevant domain file when user mentions:** +- "single-cell" / "RNA-seq" / "scanpy" / "differential expression" +- "molecule" / "SMILES" / "drug" / "RDKit" / "ChEMBL" +- "gene" / "protein" / "variant" / "UniProt" / "Ensembl" / "PDB" +- "machine learning" / "sklearn" / 
"statistics" +- "clinical trial" / "PubMed" / "survival analysis" + +--- + +## Quick Reference (Machine-Readable) + +Use this JSON for exact resource paths and environment variables: + +\`\`\`json +${embedded_json} +\`\`\` + +**Usage:** +- \`resourcePaths["my-bucket"]\` → exact cloud storage/database path +- \`envVars["WORKBENCH_my_bucket"]\` → environment variable value + +To refresh after workspace changes: +\`\`\`bash +~/.claude/generate-context.sh +\`\`\` + +--- + +## Getting Help + +- **Docs**: https://support.workbench.verily.com +- **Custom Apps Guide**: https://support.workbench.verily.com/docs/guides/cloud_apps/create_custom_apps/ +- **Devcontainers Repo**: https://github.com/verily-src/workbench-app-devcontainers +- **Devcontainer Reference**: https://containers.dev/implementors/json_reference/ +- **CLI Help**: \`wb --help\` or \`wb --help\` +- **Support**: support@workbench.verily.com + +--- + +*Generated: $(date -u +"%Y-%m-%d %H:%M:%S UTC")* +EOF + + log_info "Created ${CLAUDE_FILE}" +} + +# Main function +main() { + echo "" + echo "==========================================" + echo " Workbench LLM Context Generator" + echo "==========================================" + echo "" + + check_prerequisites + setup_directories + + # Fetch all data first so we can detect cloud platform before generating skills + WORKSPACE=$(fetch_workspace) + RESOURCES=$(fetch_resources) + WORKFLOWS=$(fetch_workflows) + APPS=$(fetch_apps) + + # Detect cloud platform for platform-specific skill and context generation + local cloud_platform + cloud_platform=$(echo "$WORKSPACE" | jq -r '.cloudPlatform // "GCP"') + log_info "Detected cloud platform: ${cloud_platform}" + + install_skills "$cloud_platform" + + # Generate single CLAUDE.md file with embedded JSON + generate_claude_md "$WORKSPACE" "$RESOURCES" "$WORKFLOWS" "$APPS" + + echo "" >&2 + log_info "Context generation complete!" 
+ echo "" >&2 + echo "Generated file:" >&2 + echo " - ${CLAUDE_FILE}" >&2 + echo "" >&2 + echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" >&2 + echo "✅ Claude Code will automatically discover ~/.claude/CLAUDE.md" >&2 + echo "" >&2 + echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" >&2 + echo "" >&2 +} + +# Run main +main "$@" diff --git a/features/src/llm-context/install.sh b/features/src/llm-context/install.sh new file mode 100644 index 000000000..f702067bb --- /dev/null +++ b/features/src/llm-context/install.sh @@ -0,0 +1,161 @@ +#!/usr/bin/env bash + +# install.sh installs the LLM Context Generator in the devcontainer. +# This feature generates a CLAUDE.md file that provides LLMs (like Claude Code) +# with context about the current Workbench workspace, resources, and tools. +# Claude Code auto-discovers ~/CLAUDE.md on startup. + +set -o errexit +set -o nounset +set -o pipefail +set -o xtrace + +# Options from devcontainer-feature.json (converted to uppercase) +readonly USERNAME="${USERNAME:-"root"}" +USER_HOME_DIR="${USERHOMEDIR:-"/home/${USERNAME}"}" +if [[ "${USER_HOME_DIR}" == "/home/root" ]]; then + USER_HOME_DIR="/root" +fi +readonly USER_HOME_DIR + +export DEBIAN_FRONTEND=noninteractive +export TZ=Etc/UTC + +WORKDIR="$(mktemp -d)" +readonly WORKDIR + +readonly LLM_CONTEXT_DIR="/opt/llm-context" +readonly GENERATE_SCRIPT="${LLM_CONTEXT_DIR}/generate-context.sh" + +function cleanup() { + rm -rf "${WORKDIR:?}" + rm -rf /var/lib/apt/lists/* +} + +trap 'cleanup' EXIT + +function apt_get_update() { + if [ "$(find /var/lib/apt/lists/* | wc -l)" = "0" ]; then + echo "Running apt-get update..." + apt-get update -y + fi +} + +# Checks if packages are installed and installs them if not +function check_packages() { + if ! dpkg -s "$@" > /dev/null 2>&1; then + apt_get_update + apt-get -y install --no-install-recommends "$@" + fi +} + +echo "Starting LLM Context Generator installation..." 
+echo "User: ${USERNAME}, Home: ${USER_HOME_DIR}" + +# Save the directory where the feature files are located +FEATURE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +readonly FEATURE_DIR + +# Check for supported package manager +if type apt-get &>/dev/null; then + # Install jq if not present (required for JSON processing) + check_packages jq +elif type apk &>/dev/null; then + # Alpine Linux + apk add --no-cache jq +else + echo "Warning: Could not install jq. Please install it manually." +fi + +# Create installation directory +mkdir -p "${LLM_CONTEXT_DIR}" + +# Copy the generate-context.sh script +if [[ -f "${FEATURE_DIR}/generate-context.sh" ]]; then + cp "${FEATURE_DIR}/generate-context.sh" "${GENERATE_SCRIPT}" + chmod +x "${GENERATE_SCRIPT}" + echo "Copied generate-context.sh to ${GENERATE_SCRIPT}" +else + echo "ERROR: generate-context.sh not found in ${FEATURE_DIR}" + ls -la "${FEATURE_DIR}/" + exit 1 +fi + +# Copy skill files to installation directory +if [[ -d "${FEATURE_DIR}/skills" ]]; then + mkdir -p "${LLM_CONTEXT_DIR}/skills" + cp -r "${FEATURE_DIR}/skills/." "${LLM_CONTEXT_DIR}/skills/" + echo "Copied skill files to ${LLM_CONTEXT_DIR}/skills" +else + echo "Warning: skills directory not found in ${FEATURE_DIR}" +fi + +# Copy app templates to installation directory +if [[ -d "${FEATURE_DIR}/templates" ]]; then + mkdir -p "${LLM_CONTEXT_DIR}/templates" + cp -r "${FEATURE_DIR}/templates/." "${LLM_CONTEXT_DIR}/templates/" + echo "Copied app templates to ${LLM_CONTEXT_DIR}/templates" +else + echo "Warning: templates directory not found in ${FEATURE_DIR}" +fi + +# Create a wrapper script that runs with proper user context +cat > "${LLM_CONTEXT_DIR}/run-context-generator.sh" << WRAPPER_EOF +#!/bin/bash +# Wrapper to run generate-context.sh with proper environment +# This script is called on container start + +# Wait for wb to be authenticated and workspace to be ready. 
+# AWS apps take longer to initialise IAM credentials than GCP apps, so we +# retry with backoff before giving up. +MAX_RETRIES=8 +RETRY_DELAY=10 +for i in \$(seq 1 \${MAX_RETRIES}); do + if command -v wb &> /dev/null && wb workspace describe &> /dev/null; then + echo "Workspace ready (attempt \${i}). Generating LLM context..." + ${GENERATE_SCRIPT} "${USER_HOME_DIR}" || echo "LLM context generation failed (non-fatal)" + exit 0 + fi + echo "Waiting for workspace to be ready... (\${i}/\${MAX_RETRIES})" + sleep \${RETRY_DELAY} +done + +echo "Skipping LLM context generation: workspace not available after \${MAX_RETRIES} attempts." +echo "Run 'generate-llm-context' manually once the workspace is ready." +WRAPPER_EOF +chmod +x "${LLM_CONTEXT_DIR}/run-context-generator.sh" + +# Set ownership +chown -R "${USERNAME}:" "${LLM_CONTEXT_DIR}" 2>/dev/null || true + +# Add aliases and environment to bashrc (idempotent) +if ! grep -q "# LLM Context Generator" "${USER_HOME_DIR}/.bashrc" 2>/dev/null; then + { + echo "" + echo "# LLM Context Generator" + echo "export LLM_CONTEXT_ENABLED=true" + echo "export LLM_CONTEXT_HOME=\"${USER_HOME_DIR}\"" + echo "alias generate-llm-context='${GENERATE_SCRIPT} ${USER_HOME_DIR}'" + echo "alias refresh-context='${GENERATE_SCRIPT} ${USER_HOME_DIR}'" + } >> "${USER_HOME_DIR}/.bashrc" +fi + +# Make sure the login user is the owner of their .bashrc +chown "${USERNAME}:" "${USER_HOME_DIR}/.bashrc" 2>/dev/null || true + +echo "" +echo "==========================================" +echo "LLM Context Generator installation complete!" +echo "==========================================" +echo "" +echo "Installed to: ${LLM_CONTEXT_DIR}" +echo "User home: ${USER_HOME_DIR}" +echo "" +echo "Context will be generated via postStartCommand after startup completes." 
+echo "Manual refresh: run 'generate-llm-context' or 'refresh-context'" +echo "" +echo "Claude Code will auto-discover ~/CLAUDE.md" +echo "==========================================" +echo "" + +echo "Done!" diff --git a/features/src/llm-context/skills/APP_TEMPLATES.md b/features/src/llm-context/skills/APP_TEMPLATES.md new file mode 100644 index 000000000..8e3cb7e66 --- /dev/null +++ b/features/src/llm-context/skills/APP_TEMPLATES.md @@ -0,0 +1,283 @@ +# App Templates for Workbench + +**Pre-built, ready-to-deploy application templates with workspace resource integration.** + +> **When to use this:** User wants an app that visualizes data, serves an API, processes files, or creates dashboards using their workspace resources. + +--- + +## Available Templates + +| Template | Best For | Port | Key Features | +|----------|----------|------|--------------| +| **flask-api** | REST APIs, backend services, data processing | 8080 | JSON endpoints, file upload, BQ queries | +| **streamlit-dashboard** | Data visualization, interactive exploration | 8501 | Charts, file browser, BigQuery explorer | +| **rshiny-dashboard** | R statistical analysis, R-based visualizations | 3838 | Shiny UI, plotly, ggplot2, tidyverse | +| **file-processor** | File upload, validation, transformation | 8080 | Drag-drop UI, auto-save to GCS, schema validation | + +--- + +## Template Selection Guide + +### Ask the user these questions: + +1. **What language/framework preference?** + - Python → `flask-api`, `streamlit-dashboard`, `file-processor` + - R → `rshiny-dashboard` + +2. **What's the primary purpose?** + - API/Backend service → `flask-api` + - Interactive dashboard → `streamlit-dashboard` or `rshiny-dashboard` + - Process/upload files → `file-processor` + +3. **What workspace resources do they need?** + - All templates support GCS buckets and BigQuery + +### Quick Decision Matrix + +| User Says... 
| Recommend | +|--------------|-----------| +| "dashboard", "visualize", "charts", "explore data" | `streamlit-dashboard` | +| "API", "endpoint", "backend", "REST", "service" | `flask-api` | +| "R", "statistical", "ggplot", "tidyverse" | `rshiny-dashboard` | +| "upload", "process files", "validate", "CSV" | `file-processor` | +| "something custom", "from scratch" | → Use `CUSTOM_APP.md` skill | + +--- + +## Template Locations + +All templates are bundled locally at `/opt/llm-context/templates/`: +``` +/opt/llm-context/templates/ +├── flask-api/ +├── streamlit-dashboard/ +├── rshiny-dashboard/ +├── file-processor/ +└── README.md +``` + +Each template contains: +- `manifest.yaml` - Capabilities and inputs +- `.devcontainer.json` - Devcontainer config +- `docker-compose.yaml` - Container setup +- `Dockerfile` - Build instructions +- `app/` - Application code +- `README.md` - Documentation + +--- + +## How to Use a Template + +### Option 1: Deploy Directly + +Read the template files from `/opt/llm-context/templates//` and copy them into the user's repository to deploy. + +### Option 2: Copy and Customize +1. Copy the template folder to user's repo +2. Modify application code in `app/` +3. Update `devcontainer-template.json` with new name/description +4. Push to GitHub +5. Deploy from user's repo + +> ⚠️ Volume mounts (`volumes: .:/workspace`) are for local dev only. In production, Workbench builds the image — code must be baked in via `COPY` in the Dockerfile. Do not rely on volume mounts for deployed apps. + +--- + +## Template Details + +### 1. 
Flask API (`flask-api`) + +**Capabilities:** REST API, JSON, file upload, BigQuery, GCS + +**Pre-built endpoints:** +- `GET /health` - Health check +- `GET /resources` - List workspace resources +- `GET /buckets//files` - List bucket files +- `POST /buckets//upload` - Upload to bucket +- `POST /bigquery/query` - Run BQ query +- `GET /bigquery/tables/` - List tables +- `POST /process` - Custom processing (user extends this) + +**Customization points:** +- Add endpoints in `app/main.py` +- Add dependencies in `app/requirements.txt` + +--- + +### 2. Streamlit Dashboard (`streamlit-dashboard`) + +**Capabilities:** Interactive UI, charts, data exploration, BigQuery, GCS + +**Pre-built features:** +- GCS file browser with CSV preview +- BigQuery query interface +- Data visualization (line, bar, scatter) +- Workspace resource sidebar + +**Customization points:** +- Add tabs/pages in `app/main.py` +- Add visualizations with plotly/altair +- Add additional data sources + +--- + +### 3. RShiny Dashboard (`rshiny-dashboard`) + +**Capabilities:** R analysis, Shiny UI, plotly, statistical visualization + +**Pre-built features:** +- Dashboard layout with shinydashboard +- Data upload and exploration +- Interactive charts with plotly +- Workspace resource viewer + +**R packages included:** +- shiny, shinydashboard, DT +- plotly, ggplot2 +- dplyr, tidyr +- bigrquery, googleCloudStorageR + +**Customization points:** +- Modify UI in `app/app.R` +- Add R packages in Dockerfile +- Add statistical analysis functions + +--- + +### 4. 
File Processor (`file-processor`) + +**Capabilities:** File upload, validation, transformation, GCS storage + +**Pre-built features:** +- Drag-and-drop upload UI +- CSV, JSON, Excel processing +- Auto-save to GCS bucket +- Schema validation endpoint + +**Supported formats:** +- CSV → Row/column analysis, schema detection +- JSON → Structure analysis, schema validation +- Excel → Sheet parsing, data extraction + +**Customization points:** +- Add processing logic in `app/main.py` +- Add validation schemas +- Add transformation pipelines + +--- + +## Workspace Resource Integration + +All templates automatically detect workspace resources: + +### Python Templates +```python +import os + +# All resources as dict +resources = { + k.replace("WORKBENCH_", ""): v + for k, v in os.environ.items() + if k.startswith("WORKBENCH_") +} + +# Specific resource +bucket = os.environ.get("WORKBENCH_my_bucket") +``` + +### R Template +```r +# All resources +resources <- Sys.getenv() +wb_vars <- resources[grepl("^WORKBENCH_", names(resources))] + +# Specific resource +bucket <- Sys.getenv("WORKBENCH_my_bucket") +``` + +--- + +## When Templates Don't Fit + +If the user's requirements don't match any template: + +1. **Check if a template can be extended** + - Most templates are customizable + - Adding endpoints to flask-api is easy + - Adding tabs to streamlit is easy + +2. 
**If truly custom, use CUSTOM_APP.md skill** + - Minimal from-scratch pattern + - Avoid common pitfalls + - Full control over everything + +--- + +## Common Customizations + +### Add a new endpoint (Flask) +```python +# app.config['STRICT_SLASHES'] = False should already be set in the template — do not remove it +@app.route("/my-endpoint", methods=["POST"]) +def my_endpoint(): + data = request.get_json() + # Your logic here + return jsonify({"result": "success"}) +``` + +### Add a new tab (Streamlit) +```python +tab1, tab2, tab3, tab4 = st.tabs(["Existing", "Tabs", "Here", "New Tab"]) + +with tab4: + st.header("My New Feature") + # Your code here +``` + +### Add R packages (RShiny) +```dockerfile +# In Dockerfile, add to install.packages(): +RUN R -e "install.packages(c('existingpkgs', 'newpackage'))" +``` + +--- + +## Deployment Checklist + +Before deploying any template: + +- [ ] `.devcontainer.json` at repo ROOT (not in a subfolder) +- [ ] Container name is `application-server` +- [ ] Network is `app-network` with `external: true` +- [ ] Port is exposed and mapped correctly +- [ ] `devcontainer-template.json` has unique `id` +- [ ] Application binds to `0.0.0.0` (not `localhost`) +- [ ] All `fetch()` calls use relative paths — `fetch('api/data')` ✅ not `fetch('/api/data')` ❌ +- [ ] All `` and `` use relative paths — leading `/` routes to `workbench.verily.com`, causing 404s +- [ ] Do not use `url_for()` for frontend-facing links — generates wrong paths behind the proxy + +--- + +## Common Errors + +| Error | Cause | Fix | +|-------|-------|-----| +| App fails to create | `.devcontainer.json` not at repo root | Move to repo root | +| 308 redirect loop | Flask missing `STRICT_SLASHES` setting | Add `app.config['STRICT_SLASHES'] = False` | +| 404 on API calls | Leading `/` in `fetch()` path | Use `fetch('api/data')` not `fetch('/api/data')` | +| Build fails on pip install | Unpinned dependencies | Pin versions in `requirements.txt` | +| App works locally but not 
deployed | Volume mount used instead of `COPY` | Bake code into image via Dockerfile `COPY` | +| Container restart loop | App crashes on startup | Check `docker logs application-server` | + +--- + +## Summary + +| Need | Template | Customization Effort | +|------|----------|---------------------| +| Quick API | flask-api | Low - add endpoints | +| Data dashboard | streamlit-dashboard | Low - add tabs | +| R analysis | rshiny-dashboard | Low - modify app.R | +| File processing | file-processor | Low - add processors | +| Something else | CUSTOM_APP.md | Medium - from scratch | diff --git a/features/src/llm-context/skills/CUSTOM_APP.md b/features/src/llm-context/skills/CUSTOM_APP.md new file mode 100644 index 000000000..7c16e6367 --- /dev/null +++ b/features/src/llm-context/skills/CUSTOM_APP.md @@ -0,0 +1,381 @@ +# Creating Custom Workbench Apps + +> **Official Reference:** https://github.com/verily-src/workbench-app-devcontainers +> **Quick Start:** Use `./scripts/create-custom-app.sh` for auto-generated app structure + +--- + +## Quick Start (Recommended) + +The official repo has a script that generates a complete app structure: + +```bash +# Clone the official repo +git clone https://github.com/verily-src/workbench-app-devcontainers.git +cd workbench-app-devcontainers + +# Run the quick start script +./scripts/create-custom-app.sh my-app quay.io/jupyter/base-notebook 8888 jovyan /home/jovyan +``` + +This generates all required files in `src/my-app/` with correct structure. + +--- + +## ⚠️ Choose Your Pattern + +| Pattern | Use When | Example | +|---------|----------|---------| +| **Minimal (Standalone)** | Simple apps, no cloud resources | `example` app in official repo | +| **Full-Featured (Monorepo)** | Need `wb` CLI, bucket mounting | Fork official repo | + +--- + +## Pattern 1: Minimal Standalone App + +Based on the `example` app in the [official repo](https://github.com/verily-src/workbench-app-devcontainers/tree/master/src/example). 
+ +### File Structure +``` +your-repo/ +├── .devcontainer.json ← At repo ROOT +├── docker-compose.yaml +├── Dockerfile +├── devcontainer-template.json +└── app.py (or app/) +``` + +### File 1: `.devcontainer.json` + +```json +{ + "name": "My App", + "dockerComposeFile": "docker-compose.yaml", + "service": "app", + "shutdownAction": "none", + "workspaceFolder": "/workspace", + "remoteUser": "root" +} +``` + +### File 2: `docker-compose.yaml` + +**Minimal pattern:** +```yaml +services: + app: + container_name: "application-server" + build: + context: . + dockerfile: Dockerfile + restart: always + ports: + - "8080:8080" + networks: + - app-network + +networks: + app-network: + external: true +``` + +**Alternative: Use image directly (no Dockerfile):** +> ⚠️ The `volumes` mount below is for local dev only. In production, Workbench builds the image — code must be baked in via `COPY` in the Dockerfile. Do not rely on volume mounts for deployed apps. +```yaml +services: + app: + container_name: "application-server" + image: "python:3.11-slim" + restart: always + working_dir: /workspace + command: > + bash -c "pip install -r requirements.txt && + python app.py" + volumes: + - .:/workspace:cached + ports: + - 8080:8080 + networks: + - app-network + cap_add: + - SYS_ADMIN + devices: + - /dev/fuse + security_opt: + - apparmor:unconfined + +networks: + app-network: + external: true +``` + +### File 3: `Dockerfile` + +```dockerfile +FROM python:3.11-slim + +WORKDIR /app + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY . . + +EXPOSE 8080 + +CMD ["python", "app.py"] +``` + +### File 4: `devcontainer-template.json` + +```json +{ + "id": "my-app", + "version": "1.0.0", + "name": "My App", + "description": "Description", + "options": {}, + "platforms": ["Any"] +} +``` + +--- + +## Pattern 2: Multi-Container with Caddy Proxy + +Useful when your app needs a reverse proxy. 
See the [r-analysis](https://github.com/verily-src/workbench-app-devcontainers/tree/master/src/r-analysis) app for an RStudio example with startup scripts. + +```yaml +services: + application-server: + image: caddy:2.11-alpine + container_name: application-server + ports: + - "8080:8080" + volumes: + - ./Caddyfile:/etc/caddy/Caddyfile + networks: + - app-network + - internal-network + + my-app: + build: + context: . + dockerfile: Dockerfile + container_name: my-app + ports: + - "3000:3000" + networks: + - internal-network + +networks: + app-network: + external: true + internal-network: + driver: bridge +``` + +--- + +## Pattern 3: Full-Featured (Monorepo) + +For apps needing `wb` CLI, bucket mounting, gcloud auth. + +1. **Fork** https://github.com/verily-src/workbench-app-devcontainers +2. Run: `./scripts/create-custom-app.sh my-app python:3.11-slim 8080` +3. App created at `src/my-app/` +4. In Workbench, set **Folder** to `src/my-app` + +--- + +## ⚠️ Critical Requirements + +- [ ] `.devcontainer.json` at repo ROOT +- [ ] `container_name: "application-server"` +- [ ] `networks: app-network` with `external: true` +- [ ] Server binds to `0.0.0.0` (not `localhost`) +- [ ] All `fetch()` calls use relative paths — `fetch('api/data')` ✅ not `fetch('/api/data')` ❌ +- [ ] All `<a href>` and `<form action>` use relative paths — leading `/` routes to `workbench.verily.com`, causing 404s +- [ ] Do not use `url_for()` for frontend-facing links — generates wrong paths behind the proxy + +--- + +## ⚠️ Workbench App URLs + +**Format:** `https://workbench.verily.com/app/[APP_UUID]/proxy/[PORT]/` + +```bash +# Get App UUID +wb app list --format=json | jq -r '.[] | select(.status == "RUNNING") | .id' | head -1 +``` + +**❌ Wrong:** `https://abc123.workbench-app.verily.com/` + +--- + +## Flask App Example + +```python +from flask import Flask +from flask_cors import CORS + +app = Flask(__name__) +app.url_map.strict_slashes = False # Prevents 308 redirects behind the proxy; set before registering routes +CORS(app) + +@app.route('/')
+def index(): + return '

Hello Workbench!

' + +if __name__ == '__main__': + app.run(host='0.0.0.0', port=8080, debug=False, threaded=True) +``` + +--- + +## Streamlit Example + +```yaml +# docker-compose.yaml +services: + app: + container_name: "application-server" + image: "python:3.11-slim" + command: > + bash -c "pip install streamlit && + streamlit run app.py --server.port=8501 --server.address=0.0.0.0" + ports: + - 8501:8501 + networks: + - app-network + +networks: + app-network: + external: true +``` + +--- + +## Deployment + +In Workbench UI: +- **Repository:** `https://github.com/YOUR-ORG/YOUR-REPO.git` +- **Branch:** `main` +- **Folder:** `.` (standalone) or `src/my-app` (monorepo) + +--- + +## Local Testing + +```bash +docker network create app-network +docker compose up --build +# Access at http://localhost:PORT +``` + +--- + +## Reference Implementations + +All examples are from the official repo: [verily-src/workbench-app-devcontainers](https://github.com/verily-src/workbench-app-devcontainers) + +| App | Pattern | Description | +|-----|---------|-------------| +| [example](https://github.com/verily-src/workbench-app-devcontainers/tree/master/src/example) | Minimal | Reference implementation using ttyd terminal | +| [workbench-jupyter](https://github.com/verily-src/workbench-app-devcontainers/tree/master/src/workbench-jupyter) | Full-featured | JupyterLab with Workbench integration | +| [r-analysis](https://github.com/verily-src/workbench-app-devcontainers/tree/master/src/r-analysis) | Full-featured | RStudio with startup scripts | +| [workbench-vscode](https://github.com/verily-src/workbench-app-devcontainers/tree/master/src/workbench-vscode) | Full-featured | VS Code Server in browser | +| [playground](https://github.com/verily-src/workbench-app-devcontainers/tree/master/src/playground) | Minimal | Simple base environment | + +--- + +## Common Errors + +| Error | Possible Cause | +|-------|---------------| +| App fails to create | `.devcontainer.json` in wrong location | +| No container 
created | Check Workbench logs, GitHub access | +| Container restart loop | App crashes on startup (check `docker logs`) | +| "Bad Request" | Wrong URL format | +| 308 redirect loop | Missing `app.url_map.strict_slashes = False` on Flask app | +| 404 on API calls | Leading `/` in `fetch()` path — use `fetch('api/data')` not `fetch('/api/data')` | +| Build fails on pip install | Unpinned dependencies — pin versions in `requirements.txt` | + +--- + +## 🔧 Troubleshooting (SSH into VM) + +When an app fails to start, SSH into the VM and run these commands: + +### 1. Check Startup Scripts & Logs +```bash +# View devcontainer service logs (MOST IMPORTANT) +sudo journalctl -u devcontainer.service --no-pager | tail -100 + +# Check failure count +cat /tmp/devcontainer-failure-count 2>/dev/null + +# Check error message set by Workbench +curl -s -H "Metadata-Flavor: Google" \ + http://metadata.google.internal/computeMetadata/v1/instance/guest-attributes/startup_script/message +``` + +### 2. Check Startup Script Directory +```bash +# Workbench startup scripts live here +ls -la /home/core/ + +# Key scripts to check: +# - git-clone-devcontainer.sh (clones your repo) +# - docker-auth.sh (sets up Docker registry auth) +# - parse-devcontainer.sh (parses .devcontainer.json) +# - devcontainer.sh (builds and runs container) +``` + +### 3. Check Systemd Services +```bash +# View the devcontainer service definition +systemctl cat devcontainer.service + +# Check service status +systemctl status devcontainer.service +systemctl status proxy-readiness.service + +# List all relevant services +systemctl list-units --type=service | grep -i "devcontainer\|docker" +``` + +### 4. Check Container Status +```bash +# List all containers (including stopped) +docker ps -a + +# Check container logs +docker logs application-server 2>&1 | tail -50 + +# Check if repo was cloned +ls -la /home/core/devcontainer/ +``` + +### 5.
Common Issues Found in Logs + +| Log Message | Cause | Fix | +|-------------|-------|-----| +| `docker-auth.sh: path parameter is required` | Workbench startup bug | Wait for fix or manual startup | +| `Failed to clone devcontainer GitHub repo` | GitHub access issue | Check repo permissions | +| `Container exited with code 1` | App crash | Check `docker logs application-server` | +| `proxy-agent or application-server is not started` | Container never started | Check earlier logs | + +--- + +## When to Use Features + +Sometimes you need the full-featured approach: + +| Need | Solution | +|------|----------| +| Workbench CLI (`wb`) | Use `workbench-tools` feature | +| LLM/MCP integration | Use `wb-mcp-server` feature | +| Pre-authenticated gcloud | Use `workbench-tools` feature | + +**If you need these, use the full `workbench-app-devcontainers` repo as your base.** diff --git a/features/src/llm-context/skills/DASHBOARD_BUILDER.md b/features/src/llm-context/skills/DASHBOARD_BUILDER.md new file mode 100644 index 000000000..9c5f19f13 --- /dev/null +++ b/features/src/llm-context/skills/DASHBOARD_BUILDER.md @@ -0,0 +1,678 @@ +# Web Apps & Dashboards Skill + +**Build interactive web apps, dashboards, and visualizations that run on a port in Workbench.** + +> **Triggers:** +> - "Create a dashboard", "visualize data", "build charts" +> - "Run a Flask/Streamlit/FastAPI app" +> - "Display data in the browser", "interactive UI" +> - Any web app that serves content on a port + +--- + +## 🌐 Workbench Proxy & Web Apps Best Practices + +### Proxy URL Format + +All web apps in Workbench are accessed via: +``` +https://workbench.verily.com/app/[APP_UUID]/proxy/[PORT]/[PATH] +``` + +### ⚠️ How to Get the App UUID (CRITICAL) + +**You MUST automatically get the app UUID - NEVER ask the user for it.** + +```bash +# Run this command and use the output: +wb app list --format=json | jq -r '.[] | select(.status == "RUNNING") | .id' | head -1 +``` + +**⚡ LLM INSTRUCTION:** When constructing 
dashboard/proxy URLs: +1. First run the command above to get the running app UUID +2. Use that actual UUID in the URL you provide +3. Do NOT use placeholders like `[APP_UUID]` in your final response +4. Do NOT ask the user to find/replace the UUID themselves + +### ✅ Correct URL Examples +``` +https://workbench.verily.com/app/abc123-def456-789/proxy/8080/ +https://workbench.verily.com/app/abc123-def456-789/proxy/8501/index.html +https://workbench.verily.com/app/abc123-def456-789/proxy/8000/dashboard.html +``` + +### ❌ WRONG URL Formats (These WILL fail) +``` +https://abc123-def456.workbench-app.verily.com/ ← WRONG: "Bad Request" error +https://workbench-app.verily.com/abc123-def456/ ← WRONG: Invalid domain +http://localhost:8080/ ← WRONG: Not accessible externally +https://abc123-def456/workbench.verily.com/ ← WRONG: Reversed format +file:///home/jupyter/dashboard.html ← WRONG: JavaScript blocked +``` + +### ⚠️ Common Issue: JavaScript API Calls Failing + +**Problem:** JavaScript using absolute paths fails through Workbench proxy + +**Symptoms:** +- Dashboard loads but shows no data +- Charts remain empty with "-" placeholders +- Browser console shows 404 errors for API calls +- Flask/server logs show requests for `/` but NOT `/api/*` endpoints + +### ✅ Solution: Use Relative Paths (TESTED & CONFIRMED) + +**Always use relative paths (no leading `/`) for fetch/AJAX calls:** + +```javascript +// ✅ CORRECT - relative paths work through proxy +fetch('api/metadata') +fetch('api/data?filter=value') + +// ❌ WRONG - absolute paths fail +fetch('/api/metadata') +fetch('/api/data?filter=value') +``` + +### Why Absolute Paths Fail + +``` +User visits: https://workbench.verily.com/app/UUID/proxy/8080/ + +Absolute path: fetch('/api/data') + → Browser resolves to: https://workbench.verily.com/api/data ❌ (404!) 
+ +Relative path: fetch('api/data') + → Browser resolves to: https://workbench.verily.com/app/UUID/proxy/8080/api/data ✅ +``` + +### Alternative: Embed Data in HTML (For Static Dashboards) + +If you don't need dynamic filtering, embed data directly in the template: + +**Python (Flask):** +```python +@app.route('/') +def index(): + data = get_data_from_bigquery() + return render_template('dashboard.html', data_json=json.dumps(data)) +``` + +**HTML Template:** +```html + +``` + +**When to use:** Static dashboards, large datasets that don't change, or when filters can be client-side only. + +### Testing Checklist + +Before deploying any web app: + +- [ ] **Relative paths** - All `fetch()` calls use `'api/...'` not `'/api/...'` +- [ ] **Test locally** - `curl http://localhost:PORT/api/endpoint` returns data +- [ ] **Server logs** - Verify API requests arrive: `tail -f server.log` +- [ ] **Browser DevTools** - Network tab shows 200 status for API calls +- [ ] **App UUID obtained** - Not using placeholder `[APP_UUID]` + +--- + +## Workflow + +### Step 1: Understand Requirements + +Ask the user: +1. **Data source?** BigQuery table, CSV in bucket, or local file? +2. **Visualizations?** Charts (bar, line, scatter), tables, filters? +3. **Interactivity?** Static display or dynamic filtering? + +### Step 2: Auto-Detect Environment + +**Always run these commands first:** + +```bash +# Get app UUID (REQUIRED for final URL) +APP_UUID=$(wb app list --format=json | jq -r '.[] | select(.status == "RUNNING") | .id' | head -1) +echo "App UUID: $APP_UUID" + +# Verify Python +python3 --version + +# Check working directory +pwd +``` + +### Step 3: Install Dependencies + +```bash +pip install flask flask-cors pandas plotly google-cloud-bigquery db-dtypes +``` + +> **Note:** `db-dtypes` is required for BigQuery to properly convert data types for pandas. 
+ +### Step 4: Create Dashboard Structure + +``` +dashboard/ +├── app.py # Flask server +├── templates/ +│ └── index.html # Dashboard HTML +└── static/ + └── style.css # Optional styling +``` + +--- + +## Working Templates + +### Template 1: Simple BigQuery Dashboard + +**app.py:** +```python +from flask import Flask, render_template, jsonify +from flask_cors import CORS +from google.cloud import bigquery +import os + +app = Flask(__name__) +CORS(app) + +# Cache for data +_data_cache = None + +def get_bigquery_data(): + global _data_cache + if _data_cache is not None: + return _data_cache + + client = bigquery.Client() + query = """ + SELECT * + FROM `YOUR_PROJECT.YOUR_DATASET.YOUR_TABLE` + LIMIT 1000 + """ + df = client.query(query).to_dataframe() + _data_cache = df.to_dict(orient='records') + return _data_cache + +@app.route('/') +def index(): + return render_template('index.html') + +@app.route('/api/data') # Flask route rules MUST start with '/'; only client-side fetch() paths are relative +def get_data(): + try: + data = get_bigquery_data() + return jsonify(data) + except Exception as e: + return jsonify({"error": str(e)}), 500 + +@app.route('/api/metadata') +def get_metadata(): + try: + data = get_bigquery_data() + if data: + return jsonify({ + "columns": list(data[0].keys()), + "row_count": len(data) + }) + return jsonify({"columns": [], "row_count": 0}) + except Exception as e: + return jsonify({"error": str(e)}), 500 + +if __name__ == '__main__': + # CRITICAL: host='0.0.0.0' required for Workbench proxy access + app.run(host='0.0.0.0', port=8080, debug=False, threaded=True) +``` + +**templates/index.html:** +```html + + + + Data Dashboard + + + + +
+

📊 Data Dashboard

+
+

Dataset Info

+
Loading metadata...
+
+
+

Data Visualization

+
Loading chart...
+
+
+

Data Table

+
Loading data...
+
+
+ + + + +``` + +--- + +### Template 2: Multi-Chart Dashboard with Filters + +**app.py additions:** +```python +from flask import request # required for request.args below + +@app.route('/api/data') +def get_data(): + # Get filter parameters + column = request.args.get('filter_column') + value = request.args.get('filter_value') + + data = get_bigquery_data() + + if column and value: + data = [row for row in data if str(row.get(column, '')) == value] + + return jsonify(data) + +@app.route('/api/filters') +def get_filters(): + data = get_bigquery_data() + if not data: + return jsonify({}) + + # Get unique values for categorical columns + filters = {} + for col in data[0].keys(): + unique_values = list(set(str(row[col]) for row in data)) + if len(unique_values) < 50: # Only include if reasonable number + filters[col] = sorted(unique_values) + + return jsonify(filters) +``` + +**JavaScript filter implementation:** +```javascript +async function loadFilters() { + const response = await fetch('api/filters'); + const filters = await response.json(); + + const filterContainer = document.getElementById('filters'); + for (const [column, values] of Object.entries(filters)) { + const select = document.createElement('select'); + select.id = `filter-${column}`; + select.innerHTML = `` + + values.map(v => ``).join(''); + select.onchange = () => refreshData(); + + filterContainer.appendChild(document.createTextNode(column + ': ')); + filterContainer.appendChild(select); + } +} + +async function refreshData() { + const params = new URLSearchParams(); + document.querySelectorAll('select[id^="filter-"]').forEach(select => { + if (select.value) { + params.set('filter_column', select.id.replace('filter-', '')); + params.set('filter_value', select.value); + } + }); + + const response = await fetch(`api/data?${params}`); // Still relative! 
+ const data = await response.json(); + updateCharts(data); +} +``` + +--- + +## Step 5: Test Locally + +**Before sharing the URL, start the server and verify it locally:** + +```bash +# Start server in background +cd dashboard +python3 app.py & +sleep 2 + +# Test endpoints locally +echo "Testing root..." +curl -s http://localhost:8080/ | head -5 + +echo "Testing API..." +curl -s http://localhost:8080/api/metadata | jq . + +echo "Testing data..." +curl -s http://localhost:8080/api/data | jq '.[0]' +``` + +--- + +## Step 6: Start Server & Provide URL + +```bash +# Get the app UUID +APP_UUID=$(wb app list --format=json | jq -r '.[] | select(.status == "RUNNING") | .id' | head -1) + +# Stop the test server from Step 5 so port 8080 is free +pkill -f "python3 app.py" + +# Start server +cd dashboard +nohup python3 app.py > server.log 2>&1 & + +echo "Dashboard running at:" +echo "https://workbench.verily.com/app/${APP_UUID}/proxy/8080/" +``` + +**Always provide the complete, working URL to the user - never placeholders!** + +--- + +## ⚠️ Critical Flask Server Configuration + +These settings are **REQUIRED** for Workbench dashboards to work: + +### 1. Server MUST bind to 0.0.0.0 (NOT localhost) + +```python +# ❌ WRONG - proxy cannot reach your app +app.run(host='localhost', port=8080) +app.run(host='127.0.0.1', port=8080) + +# ✅ CORRECT - accessible through Workbench proxy +app.run(host='0.0.0.0', port=8080, debug=False, threaded=True) +``` + +**Why:** The Workbench proxy routes external requests to your app. If bound to localhost, the proxy cannot reach it. + +### 2. Enable Threading for Concurrent Users + +```python +app.run(host='0.0.0.0', port=8080, debug=False, threaded=True) +``` + +**Why:** Multiple users may access simultaneously. `threaded=True` allows concurrent request handling. + +### 3. Disable Debug Mode + +```python +# ❌ WRONG - security risk, auto-reload issues +app.run(debug=True) + +# ✅ CORRECT +app.run(debug=False) +``` + +**Why:** Debug mode shouldn't be used in shared/production environments. + +### 4. 
Restarting Server After Code Changes + +Flask doesn't auto-reload when `debug=False`. After editing Python code: + +```bash +# Find and kill existing server +pkill -f "python3 app.py" +# Or: kill $(lsof -t -i :8080) + +# Restart +python3 app.py & +``` + +### 5. Browser Cache Issues + +If changes don't appear after restarting server: +- **Hard refresh:** `Ctrl+Shift+R` (Windows/Linux) or `Cmd+Shift+R` (Mac) +- Flask caches templates - server restart clears this + +--- + +## Troubleshooting + +### Data doesn't load in browser + +**1. Check paths in JavaScript:** +```javascript +// ❌ WRONG +fetch('/api/data') + +// ✅ CORRECT +fetch('api/data') +``` + +**2. Check server logs:** +```bash +tail -f server.log +# Or if running in foreground, check terminal output +``` + +**3. Test API directly:** +```bash +curl http://localhost:8080/api/data | jq '.[0]' +``` + +**4. Check browser DevTools:** +- Open Network tab +- Look for failed requests (red) +- Check the URL being requested + +### Server won't start + +```bash +# Check if port is in use +lsof -i :8080 + +# Kill existing process +kill $(lsof -t -i :8080) + +# Check Python errors +python3 app.py # Run in foreground to see errors +``` + +### BigQuery errors + +```bash +# Check authentication +gcloud auth list + +# Test BQ access +bq query --use_legacy_sql=false 'SELECT 1' + +# Check project +gcloud config get-value project +``` + +### Server not accessible through proxy (works locally, fails via URL) + +**Symptom:** `curl http://localhost:8080/` works, but Workbench URL fails + +**Cause:** Flask bound to `localhost` instead of `0.0.0.0` + +**Fix:** +```python +# Change this: +app.run(host='localhost', port=8080) +# To this: +app.run(host='0.0.0.0', port=8080) +``` + +### Changes not reflected after editing code + +**Cause 1:** Server not restarted +```bash +pkill -f "python3 app.py" +python3 app.py & +``` + +**Cause 2:** Browser cache +- Hard refresh: `Ctrl+Shift+R` or `Cmd+Shift+R` + +### Gateway timeout + +**Causes:** 
+1. Server not running: `ps aux | grep app.py` +2. Wrong UUID in URL: `wb app list --format=json` +3. Server bound to localhost (see above) + +--- + +## Development Workflow (Recommended) + +1. **Build and test locally first** + ```bash + curl http://localhost:8080/ + curl http://localhost:8080/api/metadata + ``` + +2. **Check server logs for errors** + ```bash + tail -f server.log + ``` + +3. **Only then test through Workbench proxy URL** + +4. **Use browser DevTools (F12) → Network tab** to debug client-side issues + +--- + +## Common Pitfalls Checklist + +Before declaring the dashboard complete: + +- [ ] **Relative paths** - All `fetch()` calls use `'api/...'` not `'/api/...'` +- [ ] **Host is 0.0.0.0** - Not `localhost` or `127.0.0.1` +- [ ] **threaded=True** - For concurrent users +- [ ] **debug=False** - For security +- [ ] **App UUID obtained** - Not using placeholder `[APP_UUID]` +- [ ] **Server running** - Process is active (`ps aux | grep python`) +- [ ] **Port correct** - URL uses same port as `app.run(port=...)` +- [ ] **CORS enabled** - `CORS(app)` added for cross-origin requests +- [ ] **Data cached** - Avoid repeated BigQuery calls +- [ ] **Error handling** - API returns errors as JSON, not crashes +- [ ] **Tested locally** - `curl` tests pass before giving URL +- [ ] **Server logs checked** - API requests appear in logs + +--- + +## Quick Reference + +| Issue | Check | Fix | +|-------|-------|-----| +| 404 on API | Path format | Remove leading `/` from fetch | +| CORS error | CORS setup | Add `CORS(app)` | +| Blank page | Server running? 
| `ps aux \| grep python` | +| Data error | BigQuery auth | `gcloud auth list` | +| Wrong port | URL vs code | Match port in URL to `app.run()` | +| Works locally, fails via URL | Host binding | Change `localhost` to `0.0.0.0` | +| Gateway timeout | Server/UUID | Check server running + correct UUID | +| Address in use | Port conflict | `kill $(lsof -t -i :8080)` | +| Changes not showing | Cache/restart | Hard refresh + restart server | + +--- + +## Example Prompts This Skill Handles + +- "Create a dashboard showing data from my BigQuery table" +- "Build an interactive chart for analyzing patient demographics" +- "Visualize the CSV files in my bucket" +- "Make a web dashboard with filters for exploring data" +- "Display query results in a browser with charts" diff --git a/features/src/llm-context/skills/DATA_DISCOVERY.md b/features/src/llm-context/skills/DATA_DISCOVERY.md new file mode 100644 index 000000000..aae6f1cbe --- /dev/null +++ b/features/src/llm-context/skills/DATA_DISCOVERY.md @@ -0,0 +1,168 @@ +# Data Collection Discovery + +--- + +## When to Use This Skill + +**Always read this skill before calling `platform_list_data_collections`.** This skill controls the full discovery flow — do not call the MCP tool directly without following these steps first. + +Do NOT read this skill if the user is asking about data already in their workspace. In that case, call `workspace_list_data_collections` directly. + +**Read this skill ONLY when the user says something like:** +- "Search all data collections I have access to" +- "Find data collections across Workbench" +- "What data collections can I add to my workspace?" +- "Are there any data collections I haven't added yet?" +- "Find a data collection related to [topic / disease / modality]" +- "Search across all Workbench data collections for [keyword]" +- "What data collections are available on the platform?" 
+- "Browse all accessible data collections" + +**Listing data collections in my workspace** — do NOT read this skill, call `workspace_list_data_collections` directly: +- "What data collections are in my workspace?" +- "What data is attached to my workspace?" +- "List the data collections I have" +- "What datasets do I have in this workspace?" +- "Show me the data collections in my workspace" + +--- + +## Step 0 — Clarify the Search Scope + +**If the user's intent is ambiguous** (e.g., they said "find me data" without specifying where), ask: + +> "Would you like me to search only within your active workspace, or search across all data collections you have access to in Workbench (platform-wide)?" + +- **Workspace-only**: Call `workspace_list_data_collections` directly — no need to continue with this skill +- **Platform-wide**: Continue with Steps 1–4 below + +If the user clearly said "in my workspace" or asked about attached resources, skip this skill entirely and call `workspace_list_data_collections` directly. + +--- + +## Step 1 — Clarify Search Criteria + +Before searching, confirm what the user is looking for: + +- **Topic / disease area** (e.g., oncology, cardiovascular, diabetes, general health) +- **Data modality** (e.g., genomics, imaging, lab results, patient-reported outcomes, EHR/EHR-derived) +- **Population** (e.g., age range, geography, study size) +- **Access type** (free vs. controlled access, instantly accessible vs. requires approval) +- **Data model** (e.g., standard underlay like AoU, custom schema) + +If the user has already provided enough context, proceed directly to Step 2. 
+ +--- + +## Step 2 — Search + +### Platform-wide search (primary) + +Use the MCP tool first: + +``` +mcp__wb__platform_list_data_collections(query="") +``` + +- Pass the user's topic, modality, or disease area as `query` +- The tool searches across: name, description, modality tags, therapeutic tags, data model +- If no `query` is provided, it returns all accessible data collections + +If the MCP tool is unavailable, fall back to: +```bash +wb workspace list --format=json | jq '[.[] | select(.properties[]? | select(.key=="terra-type" and .value=="data-collection"))]' +``` + +### Workspace-scoped search + +``` +mcp__wb__workspace_list_data_collections() +``` + +### Search across all returned metadata + +For each result, the tool returns the following fields — use ALL of them when evaluating relevance: + +| Field | What it tells you | +|---|---| +| `name` | Collection name | +| `shortDescription` | One-line summary | +| `description` | Full overview including provenance and methodology | +| `organization` | Who owns the data | +| `availability` | Public open access / Public controlled access / Private | +| `isFree` | Whether access is free | +| `isInstantlyAccessible` | Whether access is immediate or requires approval | +| `patientCount` | Study size | +| `timeFrame` | Date range of data collection | +| `geographicCoverage` | Countries / regions | +| `dataModel` | Schema type (e.g., standard underlay, Non-standard custom) | +| `dataModalityTags` | Types of data (imaging, lab-results, ecrf, genomics, etc.) | +| `therapeuticTags` | Disease/health areas (oncology, general-health, etc.) 
| +| `underlayName` | Data model identifier — use with `underlay_list_entities` for schema exploration | +| `dataDictionary` | Links to schema documentation | +| `usageExamples` | Sample use cases and SQL queries | +| `accessGroupName` | Access group required | +| `supportEmail` | Who to contact | +| `workbenchUrl` | Direct link to the collection in the Workbench UI | + +--- + +## Step 3 — Rank, Present Results, and Offer to Refine + +For every result returned, assign a **relevance score from 1–5** based on how well the collection's metadata matches the user's query. Use ALL available metadata fields when scoring — name, description, shortDescription, dataModalityTags, therapeuticTags, dataModel, usageExamples, dataDictionary, patientCount, geographicCoverage. + +**Scoring guide:** +| Score | Meaning | +|---|---| +| ⭐⭐⭐⭐⭐ 5 | Exact match — directly contains the data type, gene, disease, or topic the user asked about | +| ⭐⭐⭐⭐ 4 | Strong match — highly relevant to the query and covers the right domain or modality | +| ⭐⭐⭐ 3 | Good match — related to the query's domain; may not be specific to the exact topic but offers valuable context | +| ⭐⭐ 2 | Potential match — shares topical overlap with the query and is worth exploring further | +| ⭐ 1 | Broad match — loosely connected to the query; included for completeness and may surface unexpected value | + +Present results **sorted by score (highest first)**. For each result, include a one-sentence justification for the score that explains concretely why it ranked that way. Example format: + +--- +**[Collection Name]** — ⭐⭐⭐⭐⭐ 5/5 +- **Why**: [One concrete sentence explaining what in the metadata drove this score — e.g. 
"Contains whole-genome sequencing data with BRCA1/BRCA2 variant calls across 10,000 patients."] +- **Summary**: [shortDescription] +- **Data types**: [dataModalityTags] +- **Patients**: [patientCount] | **Time frame**: [timeFrame] | **Geography**: [geographicCoverage] +- **Access**: [availability] | Free: [isFree] | Instant: [isInstantlyAccessible] +- **View in Workbench**: [workbenchUrl] +--- + +After presenting results, ask: + +> "Do any of these look useful? Would you like to refine the search or explore a specific collection in more detail?" + +If the user wants deeper detail on a specific collection: +- Use `underlayName` with `mcp__wb__underlay_list_entities` to explore the data schema +- Reference `usageExamples` for sample queries +- Reference `dataDictionary` for table/field documentation + +--- + +## Step 4 — Add to Workspace + +If the user wants to use a data collection: + +1. Provide the direct link to the collection: + > "You can view and request access to **[Collection Name]** here: [workbenchUrl]" + +2. Instruct them to click **"Add to Workspace"** or **"Get Access"** in the Workbench UI. The button label depends on whether the collection is instantly accessible or requires approval. + +3. If the collection is instantly accessible (`isInstantlyAccessible: true`), tell them: + > "This collection is instantly accessible — once you click 'Add to Workspace', the resources will be available in your workspace immediately." + +4. If it requires approval (`isInstantlyAccessible: false`): + > "This collection requires access approval. After you submit the request at [workbenchUrl], access is typically granted after review." + +5. After the user confirms they've added the collection, use `workspace_list_data_collections` to confirm the resources are now visible in their workspace. 
+ +--- + +## Notes + +- `workspace_list_data_collections` only shows collections already attached to the active workspace +- `platform_list_data_collections` searches platform-wide but requires the user to have at least READ access to the collection workspace diff --git a/features/src/llm-context/skills/SCIENTIFIC_SKILLS_INDEX.md b/features/src/llm-context/skills/SCIENTIFIC_SKILLS_INDEX.md new file mode 100644 index 000000000..a4225c20a --- /dev/null +++ b/features/src/llm-context/skills/SCIENTIFIC_SKILLS_INDEX.md @@ -0,0 +1,50 @@ +# Scientific Skills Index + +**This file routes Claude to domain-specific scientific skills.** +Workbench skills (workflows, dashboards, custom apps) are handled directly by `CLAUDE.md`. + +--- + +## ⚡ Quick Navigation + +| User Says... | Read This Skill | +|--------------|-----------------| +| "single-cell" / "RNA-seq" / "scanpy" / "differential expression" | `scientific/BIOINFORMATICS.md` | +| "molecule" / "SMILES" / "drug" / "RDKit" / "ChEMBL" / "target" | `scientific/DRUG_DISCOVERY.md` | +| "gene" / "protein" / "variant" / "UniProt" / "Ensembl" / "PDB" | `scientific/GENOMICS_DATABASES.md` | +| "machine learning" / "sklearn" / "statistics" / "plot" | `scientific/DATA_ANALYSIS.md` | +| "clinical trial" / "PubMed" / "survival analysis" | `scientific/CLINICAL.md` | + +--- + +## Domain Skills + +### 🧬 Bioinformatics (`scientific/BIOINFORMATICS.md`) +Single-cell analysis, differential expression, sequence analysis, RNA velocity. +**Packages:** scanpy, anndata, biopython, pydeseq2, scvelo + +### 💊 Drug Discovery (`scientific/DRUG_DISCOVERY.md`) +Cheminformatics, molecular ML, bioactivity databases, target identification. +**Packages/APIs:** rdkit, deepchem, chembl, drugbank, opentargets + +### 🔬 Genomics Databases (`scientific/GENOMICS_DATABASES.md`) +Gene annotations, protein data, variant interpretation, 3D structures. 
+**APIs:** ensembl, uniprot, clinvar, pdb + +### 📊 Data Analysis (`scientific/DATA_ANALYSIS.md`) +Machine learning, statistics, visualization. +**Packages:** scikit-learn, statsmodels, plotly, seaborn + +### 🏥 Clinical (`scientific/CLINICAL.md`) +Clinical trials, literature search, survival analysis. +**APIs:** clinicaltrials.gov, pubmed + +--- + +## Adding New Skills + +To add skills from [claude-scientific-skills](https://github.com/K-Dense-AI/claude-scientific-skills): + +1. Copy the `SKILL.md` file to `scientific/.md` +2. Add a row to the Quick Navigation table above +3. Add a domain section below diff --git a/features/src/llm-context/skills/WORKFLOW_TROUBLESHOOT.md b/features/src/llm-context/skills/WORKFLOW_TROUBLESHOOT.md new file mode 100644 index 000000000..93672acc8 --- /dev/null +++ b/features/src/llm-context/skills/WORKFLOW_TROUBLESHOOT.md @@ -0,0 +1,323 @@ +# WDL Workflow Troubleshooting Skill + +**Trigger:** User asks to troubleshoot, debug, or fix a failed workflow. + +## Behavior + +**Once the user confirms which job to investigate, DO NOT ask which diagnostic steps to run.** Instead: +1. **Run all diagnostic commands automatically** (Steps 2–4 at minimum) +2. **Analyze the results** and identify the root cause +3. **Report your diagnosis** with evidence (error messages, exit codes, log snippets) +4. **Propose a fix** with specific changes +5. **THEN ask** if they want you to apply the fix or investigate further + +Don't say: "Would you like me to check the logs?" +Do say: "I checked the logs and found an OOM error. The task requested 8GB but needed more. I recommend increasing memory to 16GB in the runtime block." + +--- + +## Quick Diagnosis (Start Here) + +```bash +# 1. Find failed jobs +wb workflow job list --format=json | jq -r '.[] | select(.status=="FAILED") | "\(.id)\t\(.workflowName)\t\(.startTime)"' + +# 2. Get error message (replace JOB_ID) +wb workflow job describe --job= --format=json | jq -r '.failureMessage // "No message"' + +# 3. 
Find failed task +wb workflow job task list --job= --format=json | jq -r '.[] | select(.status=="FAILED") | .name' + +# 4. Get task error + logs +wb workflow job task describe --job= --task= --format=json | jq '{stderr, stdout, exitCode, failureMessage}' +``` + +**After running these 4 commands, you'll know:** which job failed, why, which task, and where logs are. + +--- + +## Step-by-Step Guide + +### Step 1: Identify Failed Job + +```bash +# List all failed jobs +wb workflow job list --format=json | jq '.[] | select(.status == "FAILED") | {id, workflowName, status, startTime, endTime}' +``` + +**For batch jobs:** +```bash +# List failed sub-jobs within a batch +wb workflow job batch list --job= --format=json | jq '.[] | select(.status == "FAILED") | {id, status}' +``` + +**Ask user:** Confirm which job ID to investigate (if multiple failed jobs). + +--- + +### Step 2: Get Job Details & Inputs + +```bash +# Full job metadata +wb workflow job describe --job= --format=json +``` + +**Key fields to extract:** +```bash +# Error message +wb workflow job describe --job= --format=json | jq -r '.failureMessage' + +# Inputs used +wb workflow job describe --job= --format=json | jq '.inputs' + +# Outputs (if any) +wb workflow job describe --job= --format=json | jq '.outputs' +``` + +--- + +### Step 3: Find Failed Task & Get Logs + +```bash +# List all tasks with status +wb workflow job task list --job= --format=json | jq '.[] | {name, status, exitCode}' + +# Get failed task details +wb workflow job task describe --job= --task= --format=json +``` + +**Extract log URLs:** +```bash +# Get stderr and stdout URLs +TASK_INFO=$(wb workflow job task describe --job= --task= --format=json) +STDERR_URL=$(echo $TASK_INFO | jq -r '.stderr') +STDOUT_URL=$(echo $TASK_INFO | jq -r '.stdout') + +echo "stderr: $STDERR_URL" +echo "stdout: $STDOUT_URL" +``` + +--- + +### Step 4: Pull and Analyze Task Logs + +#### Read Log Contents + +```bash +# Read stderr (usually contains errors) +gsutil cat 
"$STDERR_URL" 2>/dev/null | tail -100 + +# Read stdout +gsutil cat "$STDOUT_URL" 2>/dev/null | tail -100 + +# Search for common error patterns +gsutil cat "$STDERR_URL" 2>/dev/null | grep -i -E "error|exception|failed|denied|killed|oom|memory|disk|timeout" | head -30 +``` + +#### Common Log File Patterns + +Cromwell execution logs are typically at: +``` +gs://///execution/ +├── stdout # Task standard output +├── stderr # Task standard error +├── script # The actual command that ran +├── rc # Return code (exit code) +└── script.submit # Submission script +``` + +**One-liner to read all execution files:** +```bash +# Find execution directory from task describe, then: +EXEC_DIR=$(echo $TASK_INFO | jq -r '.executionDirectory // empty') +if [ -n "$EXEC_DIR" ]; then + echo "=== script ===" && gsutil cat "$EXEC_DIR/script" 2>/dev/null + echo "=== rc ===" && gsutil cat "$EXEC_DIR/rc" 2>/dev/null + echo "=== stderr (last 50 lines) ===" && gsutil cat "$EXEC_DIR/stderr" 2>/dev/null | tail -50 +fi +``` + +--- + +### Step 5: Check Resource Allocation & Usage + +#### What Was Requested (from WDL runtime) + +```bash +# Get workflow definition to see runtime requirements +wb workflow describe --workflow= --format=json | jq '.sourceUrl' + +# Read WDL file +gsutil cat gs:////workflow.wdl | grep -A10 "runtime {" +``` + +#### Check Actual Resource Usage (GCP Batch) + +```bash +# For GCP Cromwell jobs, get batch job details +gcloud batch jobs list --filter="status.state=FAILED" --format="table(name,status.state,createTime)" + +# Describe specific batch job +gcloud batch jobs describe --format=json | jq '{ + status: .status.state, + statusEvents: .status.statusEvents, + taskGroups: .taskGroups[0].taskSpec.computeResource +}' +``` + +#### Memory-Specific Checks + +```bash +# Check if OOM (Out of Memory) killed the task +gsutil cat "$STDERR_URL" 2>/dev/null | grep -i -E "oom|out of memory|killed|cannot allocate|memory" + +# Check what memory was requested in batch job +gcloud batch jobs 
describe --format=json | jq '.taskGroups[0].taskSpec.computeResource.memoryMib' + +# Check dmesg/syslog for OOM events (if available in logs) +gsutil cat "$STDERR_URL" 2>/dev/null | grep -i "killed process" +``` + +--- + +### Step 6: Diagnose by Error Type + +#### Memory Issues (OOM) + +**Symptoms:** +- Exit code 137 (SIGKILL) or 143 +- "Killed" in stderr +- "Cannot allocate memory" +- Task succeeded locally but fails at scale + +**Diagnosis:** +```bash +# Check requested memory +gcloud batch jobs describe --format=json | jq '.taskGroups[0].taskSpec.computeResource' + +# Look for memory errors in logs +gsutil cat "$STDERR_URL" 2>/dev/null | grep -i -E "memory|oom|killed|malloc" +``` + +**Fix:** Increase `memory` in WDL runtime block: +```wdl +runtime { + memory: "32G" # Increase from previous value +} +``` + +#### Disk Issues + +**Symptoms:** +- "No space left on device" +- "Disk quota exceeded" + +**Diagnosis:** +```bash +gsutil cat "$STDERR_URL" 2>/dev/null | grep -i -E "space|disk|quota" +``` + +**Fix:** Increase disk in WDL runtime: +```wdl +runtime { + disks: "local-disk 200 SSD" # Increase size +} +``` + +#### Input File Issues + +**Symptoms:** +- "FileNotFoundException" +- "Localization failed" +- File not found errors + +**Diagnosis:** +```bash +# Check if input files exist +wb workflow job describe --job= --format=json | jq -r '.inputs | to_entries[] | .value' | while read path; do + if [[ $path == gs://* ]]; then + echo -n "$path: " && gsutil ls "$path" 2>&1 | head -1 + fi +done +``` + +#### Permission Issues + +**Symptoms:** +- "Permission denied" +- "Access denied" +- 403 errors + +**Diagnosis:** +```bash +# Check service account permissions +gcloud batch jobs describe --format=json | jq '.taskGroups[0].taskSpec.serviceAccount' + +# Test bucket access +gsutil ls gs:/// 2>&1 | head -5 +``` + +--- + +### Step 7: Propose Solution + +Based on diagnosis, recommend one of: + +| Issue | Solution Template | +|-------|-------------------| +| **OOM** | "Increase 
memory from X to Y in the runtime block" | +| **Disk full** | "Increase disk size from X to Y GB" | +| **Missing input** | "Input file doesn't exist. Verify path: `gsutil ls `" | +| **Permission** | "Service account lacks access. Grant `roles/storage.objectViewer` on bucket" | +| **Timeout** | "Task exceeded time limit. Increase `maxRetries` or optimize task" | +| **Docker** | "Image pull failed. Verify image exists and is accessible" | +| **Other** | Describe the root cause from logs and propose a fix based on the specific error | + +**Re-run after fixing:** +```bash +wb workflow job run --workflow= --inputs= +``` + +--- + +## Quick Reference + +### Essential Commands + +```bash +# Failed jobs +wb workflow job list --format=json | jq '.[] | select(.status=="FAILED") | {id, workflowName}' + +# Job error +wb workflow job describe --job= --format=json | jq '.failureMessage' + +# Failed tasks +wb workflow job task list --job= --format=json | jq '.[] | select(.status=="FAILED") | .name' + +# Task logs +wb workflow job task describe --job= --task= --format=json | jq '.stderr' | xargs -I{} gsutil cat {} | tail -50 + +# Memory check +gcloud batch jobs describe --format=json | jq '.taskGroups[0].taskSpec.computeResource' +``` + +### Error → Cause → Fix + +| Exit Code | Meaning | Common Fix | +|-----------|---------|------------| +| 1 | General error | Check stderr for details | +| 2 | Misuse of command | Check script syntax | +| 126 | Permission problem | Check file permissions | +| 127 | Command not found | Check PATH, container image | +| 137 | SIGKILL (OOM) | **Increase memory** | +| 139 | Segfault | Check input data, memory | +| 143 | SIGTERM | Task timeout or preemption | + +--- + +## Workbench-Specific Notes + +- **Log retention:** Cromwell logs persist in workspace execution bucket +- **Batch jobs:** Each sub-job has independent logs; troubleshoot specific failed sub-job +- **VPC-SC:** Run `gcloud batch` commands from within workspace app +- **Preemption:** If using 
spot VMs, set `preemptible: 0` for reliability diff --git a/features/src/llm-context/skills/aws/DASHBOARD_BUILDER.md b/features/src/llm-context/skills/aws/DASHBOARD_BUILDER.md new file mode 100644 index 000000000..08062a495 --- /dev/null +++ b/features/src/llm-context/skills/aws/DASHBOARD_BUILDER.md @@ -0,0 +1,393 @@ +# Web Apps & Dashboards Skill (AWS) + +**Build interactive web apps, dashboards, and visualizations that run on a port in Workbench.** + +> **Triggers:** +> - "Create a dashboard", "visualize data", "build charts" +> - "Run a Flask/Streamlit/FastAPI app" +> - "Display data in the browser", "interactive UI" +> - Any web app that serves content on a port + +--- + +## Workbench Proxy & Web Apps Best Practices + +### Proxy URL Format + +All web apps in Workbench are accessed via: +``` +https://workbench.verily.com/app/[APP_UUID]/proxy/[PORT]/[PATH] +``` + +### How to Get the App UUID (CRITICAL) + +**You MUST automatically get the app UUID - NEVER ask the user for it.** + +```bash +wb app list --format=json | jq -r '.[] | select(.status == "RUNNING") | .id' | head -1 +``` + +**LLM INSTRUCTION:** When constructing dashboard/proxy URLs: +1. First run the command above to get the running app UUID +2. Use that actual UUID in the URL you provide +3. Do NOT use placeholders like `[APP_UUID]` in your final response +4. 
Do NOT ask the user to find/replace the UUID themselves + +### Correct URL Examples +``` +https://workbench.verily.com/app/abc123-def456-789/proxy/8080/ +https://workbench.verily.com/app/abc123-def456-789/proxy/8501/index.html +``` + +### WRONG URL Formats (These WILL fail) +``` +https://abc123-def456.workbench-app.verily.com/ <- WRONG +http://localhost:8080/ <- WRONG: Not accessible externally +``` + +### Common Issue: JavaScript API Calls Failing + +**Problem:** JavaScript using absolute paths fails through Workbench proxy + +**Solution: Use Relative Paths (TESTED & CONFIRMED)** + +```javascript +// CORRECT - relative paths work through proxy +fetch('api/metadata') +fetch('api/data?filter=value') + +// WRONG - absolute paths fail +fetch('/api/metadata') +fetch('/api/data?filter=value') +``` + +--- + +## Workflow + +### Step 1: Understand Requirements + +Ask the user: +1. **Data source?** S3 file (CSV, Parquet, JSON), Athena query, or local file? +2. **Visualizations?** Charts (bar, line, scatter), tables, filters? +3. **Interactivity?** Static display or dynamic filtering? 
+ +### Step 2: Auto-Detect Environment + +```bash +APP_UUID=$(wb app list --format=json | jq -r '.[] | select(.status == "RUNNING") | .id' | head -1) +echo "App UUID: $APP_UUID" +python3 --version +pwd +``` + +### Step 3: Install Dependencies + +```bash +pip install flask flask-cors pandas plotly boto3 psycopg2-binary +``` + +### Step 4: Create Dashboard Structure + +``` +dashboard/ +├── app.py +├── templates/ +│ └── index.html +└── static/ + └── style.css +``` + +--- + +## Working Templates + +### Template 1: S3 Data Dashboard + +**app.py:** +```python +from flask import Flask, render_template, jsonify +from flask_cors import CORS +import pandas as pd +import boto3 +import os + +app = Flask(__name__) +CORS(app) + +_data_cache = None + +def get_data_from_s3(): + global _data_cache + if _data_cache is not None: + return _data_cache + + # Use the WORKBENCH_ env var set by Workbench + bucket = os.environ.get('WORKBENCH_my_bucket', 'your-bucket-name') + s3 = boto3.client('s3') + obj = s3.get_object(Bucket=bucket, Key='path/to/data.csv') + df = pd.read_csv(obj['Body']) + _data_cache = df.to_dict(orient='records') + return _data_cache + +@app.route('/') +def index(): + return render_template('index.html') + +@app.route('/api/data') # Flask routes REQUIRE the leading slash; only client-side fetch() paths are relative
+def get_data(): + try: + data = get_data_from_s3() + return jsonify(data) + except Exception as e: + return jsonify({"error": str(e)}), 500 + +@app.route('/api/metadata') +def get_metadata(): + try: + data = get_data_from_s3() + if data: + return jsonify({"columns": list(data[0].keys()), "row_count": len(data)}) + return jsonify({"columns": [], "row_count": 0}) + except Exception as e: + return jsonify({"error": str(e)}), 500 + +if __name__ == '__main__': + # CRITICAL: host='0.0.0.0' required for Workbench proxy access + app.run(host='0.0.0.0', port=8080, debug=False, threaded=True) +``` + +### Template 2: Aurora PostgreSQL Dashboard + +Aurora in Workbench uses **IAM database authentication** — you cannot connect with a static +password. The correct flow is: + +1. Get temporary AWS credentials via `wb resource credentials` +2. Generate an IAM auth token via boto3 (token is valid for 15 minutes) +3. Connect with `sslmode='require'` — **SSL is mandatory; connections are rejected without it** + +```python +import json, subprocess, boto3, psycopg2, pandas as pd, os + +def get_aurora_connection(resource_id: str, username: str): + """ + Returns an open psycopg2 connection to a Workbench-managed Aurora database. + resource_id: the Workbench resource ID (e.g. 'test-db-1') + username: the IAM database user (check with your workspace admin) + """ + # Step 1 — get temporary AWS credentials from Workbench + result = subprocess.run( + ['wb', 'resource', 'credentials', + f'--id={resource_id}', '--scope=WRITE_READ', '--format=json'], + capture_output=True, text=True, check=True + ) + creds = json.loads(result.stdout) + + # Step 2 — parse connection details from WORKBENCH_* env var + # Format: "host:port/dbname" e.g.
"abc.cluster.us-west-2.rds.amazonaws.com:5432/mydb" + conn_str = os.environ.get(f'WORKBENCH_{resource_id.replace("-", "_")}', '') + host_part, _, dbname = conn_str.partition('/') + host, _, port = host_part.partition(':') + port = int(port) if port else 5432 + + # Step 3 — generate IAM auth token (valid 15 min) + session = boto3.Session( + aws_access_key_id=creds['AccessKeyId'], + aws_secret_access_key=creds['SecretAccessKey'], + aws_session_token=creds['SessionToken'], + region_name='us-west-2' + ) + auth_token = session.client('rds').generate_db_auth_token( + DBHostname=host, Port=port, DBUsername=username, Region='us-west-2' + ) + + # Step 4 — connect with SSL (REQUIRED — Aurora rejects unencrypted connections) + return psycopg2.connect( + host=host, port=port, database=dbname, + user=username, password=auth_token, + sslmode='require' # mandatory — omitting this causes "PAM authentication failed" + ) + +def get_data_from_aurora(): + global _data_cache + if _data_cache is not None: + return _data_cache + conn = get_aurora_connection('test-db-1', 'your-iam-username') + df = pd.read_sql('SELECT * FROM your_table LIMIT 1000', conn) + conn.close() + _data_cache = df.to_dict(orient='records') + return _data_cache +``` + +> **Why IAM auth?** Workbench-managed Aurora databases are configured for IAM authentication only. +> Static passwords will fail with "PAM authentication failed" or "pg_hba.conf rejects connection". + +### Alternative: Embed Data in HTML (For Static Dashboards) + +```python +import json +@app.route('/') +def index(): + data = get_data_from_s3() + return render_template('dashboard.html', data_json=json.dumps(data)) +``` + +```html + +``` + +--- + +## Troubleshooting + +### No data showing + +**1. Test API directly:** +```bash +curl http://localhost:8080/api/data | python3 -m json.tool | head -20 +``` + +**2. Check S3 access:** +```bash +aws s3 ls s3:///path/to/data.csv +``` + +**3. 
Check server logs:** +```bash +tail -f server.log +``` + +### Server won't start + +```bash +lsof -i :8080 +kill $(lsof -t -i :8080) +python3 app.py +``` + +### S3 / AWS errors + +```bash +# Check AWS credentials +aws sts get-caller-identity + +# Test S3 access +aws s3 ls s3:/// + +# Check env vars set by Workbench +env | grep WORKBENCH +``` + +### Aurora connection errors + +Aurora requires IAM authentication + SSL. Plain password connections are rejected. + +**Symptoms and causes:** +- `"PAM authentication failed"` -> not using IAM auth token as password +- `"pg_hba.conf rejects connection... no encryption"` -> missing `sslmode='require'` +- `"SSL connection is required"` -> same SSL issue + +**Step-by-step fix:** + +```bash +# 1. Get temporary credentials from Workbench (scoped to this resource) +wb resource credentials --id= --scope=WRITE_READ --format=json +# Returns: {"AccessKeyId":"...","SecretAccessKey":"...","SessionToken":"..."} +``` + +```python +import boto3, psycopg2, json, subprocess + +# 2. Generate IAM auth token +result = subprocess.run( + ['wb', 'resource', 'credentials', '--id=', '--scope=WRITE_READ', '--format=json'], + capture_output=True, text=True, check=True +) +creds = json.loads(result.stdout) + +session = boto3.Session( + aws_access_key_id=creds['AccessKeyId'], + aws_secret_access_key=creds['SecretAccessKey'], + aws_session_token=creds['SessionToken'], + region_name='us-west-2' +) +auth_token = session.client('rds').generate_db_auth_token( + DBHostname='', Port=5432, + DBUsername='', Region='us-west-2' +) + +# 3. Connect with SSL enabled (mandatory) +conn = psycopg2.connect( + host='', port=5432, database='', + user='', password=auth_token, + sslmode='require' # CRITICAL — without this, connection is rejected +) +``` + +**AWS CLI alternative (to verify the token works):** +```bash +# Export the credentials first +export AWS_ACCESS_KEY_ID="..." +export AWS_SECRET_ACCESS_KEY="..." +export AWS_SESSION_TOKEN="..." 
+ +# Generate auth token +TOKEN=$(aws rds generate-db-auth-token \ + --hostname --port 5432 \ + --region us-west-2 --username ) + +# Connect (psql requires SSL flag) +PGSSLMODE=require psql "host= port=5432 dbname= user= password=$TOKEN" +``` + +### Server not accessible through proxy + +**Fix:** Ensure Flask is bound to `0.0.0.0`, not `localhost`: +```python +app.run(host='0.0.0.0', port=8080) +``` + +--- + +## Common Pitfalls Checklist + +- [ ] **Relative paths** - All `fetch()` calls use `'api/...'` not `'/api/...'` +- [ ] **Host is 0.0.0.0** - Not `localhost` or `127.0.0.1` +- [ ] **threaded=True** - For concurrent users +- [ ] **debug=False** - For security +- [ ] **App UUID obtained** - Not using placeholder `[APP_UUID]` +- [ ] **S3 access verified** - `aws s3 ls s3:///` returns files +- [ ] **Data cached** - Avoid repeated S3 reads +- [ ] **Error handling** - API returns errors as JSON, not crashes +- [ ] **CORS enabled** - `CORS(app)` added +- [ ] **Aurora: IAM auth** - Using `wb resource credentials` + boto3 token, not a static password +- [ ] **Aurora: SSL enabled** - `sslmode='require'` in psycopg2.connect() + +--- + +## Quick Reference + +| Issue | Check | Fix | +|-------|-------|-----| +| 404 on API | Path format | Remove leading `/` from fetch | +| CORS error | CORS setup | Add `CORS(app)` | +| Blank page | Server running? 
| `ps aux \| grep python` | +| S3 error | AWS credentials | `aws sts get-caller-identity` | +| Wrong port | URL vs code | Match port in URL to `app.run()` | +| Works locally, fails via URL | Host binding | Change `localhost` to `0.0.0.0` | +| Gateway timeout | Server/UUID | Check server running + correct UUID | +| Aurora: PAM auth failed | IAM auth | Use `wb resource credentials` + boto3 token | +| Aurora: no encryption | SSL missing | Add `sslmode='require'` to psycopg2.connect() | + +--- + +## Example Prompts This Skill Handles + +- "Create a dashboard showing data from my S3 bucket" +- "Build an interactive chart for analyzing patient demographics" +- "Visualize the CSV files in my bucket" +- "Make a web dashboard with filters for exploring data" +- "Display query results in a browser with charts" diff --git a/features/src/llm-context/skills/aws/WORKFLOW_TROUBLESHOOT.md b/features/src/llm-context/skills/aws/WORKFLOW_TROUBLESHOOT.md new file mode 100644 index 000000000..9befb708a --- /dev/null +++ b/features/src/llm-context/skills/aws/WORKFLOW_TROUBLESHOOT.md @@ -0,0 +1,300 @@ +# WDL Workflow Troubleshooting Skill (AWS) + +**Trigger:** User asks to troubleshoot, debug, or fix a failed workflow. + +## Behavior + +**Once the user confirms which job to investigate, DO NOT ask which diagnostic steps to run.** Instead: +1. **Run all diagnostic commands automatically** (Steps 2–4 at minimum) +2. **Analyze the results** and identify the root cause +3. **Report your diagnosis** with evidence (error messages, exit codes, log snippets) +4. **Propose a fix** with specific changes +5. **THEN ask** if they want you to apply the fix or investigate further + +Don't say: "Would you like me to check the logs?" +Do say: "I checked the logs and found an OOM error. The task requested 8GB but needed more. I recommend increasing memory to 16GB in the runtime block." + +--- + +## Quick Diagnosis (Start Here) + +```bash +# 1. 
Find failed jobs +wb workflow job list --format=json | jq -r '.[] | select(.status=="FAILED") | "\(.id)\t\(.workflowName)\t\(.startTime)"' + +# 2. Get error message (replace JOB_ID) +wb workflow job describe --job= --format=json | jq -r '.failureMessage // "No message"' + +# 3. Find failed task +wb workflow job task list --job= --format=json | jq -r '.[] | select(.status=="FAILED") | .name' + +# 4. Get task error + logs +wb workflow job task describe --job= --task= --format=json | jq '{stderr, stdout, exitCode, failureMessage}' +``` + +**After running these 4 commands, you'll know:** which job failed, why, which task, and where logs are. + +--- + +## Step-by-Step Guide + +### Step 1: Identify Failed Job + +```bash +wb workflow job list --format=json | jq '.[] | select(.status == "FAILED") | {id, workflowName, status, startTime, endTime}' +``` + +**For batch jobs:** +```bash +wb workflow job batch list --job= --format=json | jq '.[] | select(.status == "FAILED") | {id, status}' +``` + +**Ask user:** Confirm which job ID to investigate (if multiple failed jobs). 
+ +--- + +### Step 2: Get Job Details & Inputs + +```bash +wb workflow job describe --job= --format=json +``` + +**Key fields to extract:** +```bash +wb workflow job describe --job= --format=json | jq -r '.failureMessage' +wb workflow job describe --job= --format=json | jq '.inputs' +wb workflow job describe --job= --format=json | jq '.outputs' +``` + +--- + +### Step 3: Find Failed Task & Get Logs + +```bash +wb workflow job task list --job= --format=json | jq '.[] | {name, status, exitCode}' +wb workflow job task describe --job= --task= --format=json +``` + +**Extract log URLs:** +```bash +TASK_INFO=$(wb workflow job task describe --job= --task= --format=json) +STDERR_URL=$(echo $TASK_INFO | jq -r '.stderr') +STDOUT_URL=$(echo $TASK_INFO | jq -r '.stdout') +echo "stderr: $STDERR_URL" +echo "stdout: $STDOUT_URL" +``` + +--- + +### Step 4: Pull and Analyze Task Logs + +#### Read Log Contents + +```bash +# Read stderr (usually contains errors) — logs are in S3 +aws s3 cp "$STDERR_URL" - 2>/dev/null | tail -100 + +# Read stdout +aws s3 cp "$STDOUT_URL" - 2>/dev/null | tail -100 + +# Search for common error patterns +aws s3 cp "$STDERR_URL" - 2>/dev/null | grep -i -E "error|exception|failed|denied|killed|oom|memory|disk|timeout" | head -30 +``` + +#### Common Log File Patterns + +Cromwell execution logs are typically at: +``` +s3://///execution/ +├── stdout # Task standard output +├── stderr # Task standard error +├── script # The actual command that ran +├── rc # Return code (exit code) +└── script.submit # Submission script +``` + +**One-liner to read all execution files:** +```bash +EXEC_DIR=$(echo $TASK_INFO | jq -r '.executionDirectory // empty') +if [ -n "$EXEC_DIR" ]; then + echo "=== script ===" && aws s3 cp "$EXEC_DIR/script" - 2>/dev/null + echo "=== rc ===" && aws s3 cp "$EXEC_DIR/rc" - 2>/dev/null + echo "=== stderr (last 50 lines) ===" && aws s3 cp "$EXEC_DIR/stderr" - 2>/dev/null | tail -50 +fi +``` + +--- + +### Step 5: Check Resource Allocation & Usage 
+ +#### What Was Requested (from WDL runtime) + +```bash +wb workflow describe --workflow= --format=json | jq '.sourceUrl' + +# Read WDL file +aws s3 cp s3:////workflow.wdl - | grep -A10 "runtime {" +``` + +#### Check Actual Resource Usage (AWS Batch) + +```bash +# List failed AWS Batch jobs +aws batch list-jobs --job-queue --job-status FAILED \ + --query 'jobSummaryList[*].{id:jobId,name:jobName,status:status}' --output table + +# Describe specific batch job +aws batch describe-jobs --jobs | jq '.jobs[0] | { + status: .status, + statusReason: .statusReason, + container: .container.resourceRequirements +}' +``` + +#### Memory-Specific Checks + +```bash +# Check if OOM killed the task +aws s3 cp "$STDERR_URL" - 2>/dev/null | grep -i -E "oom|out of memory|killed|cannot allocate|memory" + +# Check what memory was requested in the batch job +aws batch describe-jobs --jobs | jq '.jobs[0].container.resourceRequirements[] | select(.type=="MEMORY")' + +# Check for OOM kill signal in stderr +aws s3 cp "$STDERR_URL" - 2>/dev/null | grep -i "killed process" +``` + +--- + +### Step 6: Diagnose by Error Type + +#### Memory Issues (OOM) + +**Symptoms:** +- Exit code 137 (SIGKILL) or 143 +- "Killed" in stderr +- "Cannot allocate memory" +- Task succeeded locally but fails at scale + +**Diagnosis:** +```bash +aws batch describe-jobs --jobs | jq '.jobs[0].container.resourceRequirements' +aws s3 cp "$STDERR_URL" - 2>/dev/null | grep -i -E "memory|oom|killed|malloc" +``` + +**Fix:** Increase `memory` in WDL runtime block: +```wdl +runtime { + memory: "32G" +} +``` + +#### Disk Issues + +**Symptoms:** +- "No space left on device" +- "Disk quota exceeded" + +**Diagnosis:** +```bash +aws s3 cp "$STDERR_URL" - 2>/dev/null | grep -i -E "space|disk|quota" +``` + +**Fix:** Increase disk in WDL runtime: +```wdl +runtime { + disks: "local-disk 200 SSD" +} +``` + +#### Input File Issues + +**Symptoms:** +- "FileNotFoundException" +- "Localization failed" +- File not found errors + 
+**Diagnosis:** +```bash +wb workflow job describe --job= --format=json | jq -r '.inputs | to_entries[] | .value' | while read path; do + if [[ $path == s3://* ]]; then + echo -n "$path: " && aws s3 ls "$path" 2>&1 | head -1 + fi +done +``` + +#### Permission Issues + +**Symptoms:** +- "Permission denied" / "Access denied" / 403 errors + +**Diagnosis:** +```bash +# Check IAM role attached to batch job +aws batch describe-jobs --jobs | jq '.jobs[0].jobDefinition' + +# Test bucket access +aws s3 ls s3:/// 2>&1 | head -5 +``` + +--- + +### Step 7: Propose Solution + +| Issue | Solution Template | +|-------|-------------------| +| **OOM** | "Increase memory from X to Y in the runtime block" | +| **Disk full** | "Increase disk size from X to Y GB" | +| **Missing input** | "Input file doesn't exist. Verify path: `aws s3 ls `" | +| **Permission** | "IAM role lacks S3 access. Grant `s3:GetObject` on the bucket" | +| **Timeout** | "Task exceeded time limit. Increase `maxRetries` or optimize task" | +| **Docker** | "Image pull failed. 
Verify image exists and is accessible" | +| **Other** | Describe the root cause from logs and propose a fix based on the specific error | + +**Re-run after fixing:** +```bash +wb workflow job run --workflow= --inputs= +``` + +--- + +## Quick Reference + +### Essential Commands + +```bash +# Failed jobs +wb workflow job list --format=json | jq '.[] | select(.status=="FAILED") | {id, workflowName}' + +# Job error +wb workflow job describe --job= --format=json | jq '.failureMessage' + +# Failed tasks +wb workflow job task list --job= --format=json | jq '.[] | select(.status=="FAILED") | .name' + +# Task logs (S3) +wb workflow job task describe --job= --task= --format=json | jq -r '.stderr' | xargs -I{} aws s3 cp {} - | tail -50 + +# Memory check (AWS Batch) +aws batch describe-jobs --jobs | jq '.jobs[0].container.resourceRequirements' +``` + +### Error -> Cause -> Fix + +| Exit Code | Meaning | Common Fix | +|-----------|---------|------------| +| 1 | General error | Check stderr for details | +| 2 | Misuse of command | Check script syntax | +| 126 | Permission problem | Check file permissions | +| 127 | Command not found | Check PATH, container image | +| 137 | SIGKILL (OOM) | **Increase memory** | +| 139 | Segfault | Check input data, memory | +| 143 | SIGTERM | Task timeout or preemption | + +--- + +## Workbench-Specific Notes + +- **Log retention:** Cromwell logs persist in workspace execution bucket (S3) +- **Batch jobs:** Each sub-job has independent logs; troubleshoot specific failed sub-job +- **Preemption:** If using spot instances, set `preemptible: 0` for reliability diff --git a/features/src/llm-context/skills/scientific/BIOINFORMATICS.md b/features/src/llm-context/skills/scientific/BIOINFORMATICS.md new file mode 100644 index 000000000..e8ce0da95 --- /dev/null +++ b/features/src/llm-context/skills/scientific/BIOINFORMATICS.md @@ -0,0 +1,212 @@ +# Bioinformatics Skills + +**Trigger:** User asks about single-cell analysis, RNA-seq, sequences, differential 
expression, or trajectory analysis. + +--- + +## Quick Reference + +| Task | Package | Quick Command | +|------|---------|---------------| +| Single-cell workflow | `scanpy` | `import scanpy as sc; adata = sc.read_h5ad('data.h5ad')` | +| Differential expression | `pydeseq2` | `from pydeseq2.dds import DeseqDataSet` | +| Sequence analysis | `biopython` | `from Bio import SeqIO` | +| RNA velocity | `scvelo` | `import scvelo as scv` | + +--- + +## Scanpy (Single-Cell Analysis) + +**Use for:** QC, normalization, PCA/UMAP, clustering, marker genes, cell type annotation. + +### Standard Workflow + +```python +import scanpy as sc + +# Load data +adata = sc.read_h5ad('data.h5ad') # or sc.read_10x_mtx('filtered_feature_bc_matrix/') + +adata.var['mt'] = adata.var_names.str.startswith('MT-') # QC: flag mitochondrial genes (human 'MT-' prefix; mouse uses 'mt-') +sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True) +adata = adata[adata.obs['total_counts'] > 500] +adata = adata[adata.obs['pct_counts_mt'] < 20] + +# Normalize & log transform +sc.pp.normalize_total(adata, target_sum=1e4) +sc.pp.log1p(adata) + +# Find variable genes +sc.pp.highly_variable_genes(adata, n_top_genes=2000) +adata = adata[:, adata.var.highly_variable] + +# PCA, neighbors, UMAP, clustering +sc.tl.pca(adata) +sc.pp.neighbors(adata, n_pcs=30) +sc.tl.umap(adata) +sc.tl.leiden(adata, resolution=0.5) + +# Marker genes +sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon') +sc.pl.rank_genes_groups(adata, n_genes=10) + +# Visualization +sc.pl.umap(adata, color=['leiden', 'gene_of_interest']) +``` + +### Common File Formats +- `.h5ad` - AnnData format (standard) +- 10X Genomics: `filtered_feature_bc_matrix/` +- CSV: `sc.read_csv('counts.csv')` + +--- + +## AnnData (Data Structure) + +**Use for:** Creating, manipulating, and saving single-cell datasets.
+ +```python +import anndata as ad +import pandas as pd +import numpy as np + +# Create from scratch +adata = ad.AnnData( + X=count_matrix, # cells x genes + obs=cell_metadata_df, # cell annotations + var=gene_metadata_df # gene annotations +) + +# Key attributes +adata.X # Expression matrix +adata.obs # Cell metadata (DataFrame) +adata.var # Gene metadata (DataFrame) +adata.obsm['X_umap'] # Embeddings +adata.uns # Unstructured data + +# Subset +adata_subset = adata[adata.obs['cell_type'] == 'T cell', :] +adata_subset = adata[:, adata.var['highly_variable']] + +# Save/load +adata.write('output.h5ad') +adata = ad.read_h5ad('output.h5ad') + +# Concatenate datasets +adata_combined = ad.concat([adata1, adata2], join='outer') +``` + +--- + +## PyDESeq2 (Differential Expression) + +**Use for:** Bulk RNA-seq differential expression analysis. + +```python +import pandas as pd +from pydeseq2.dds import DeseqDataSet +from pydeseq2.ds import DeseqStats + +# Load count matrix (genes x samples) and metadata +counts = pd.read_csv('counts.csv', index_col=0) +metadata = pd.read_csv('metadata.csv', index_col=0) + +# Ensure sample order matches +counts = counts[metadata.index] + +# Create DESeq dataset +dds = DeseqDataSet( + counts=counts.T, # samples x genes + metadata=metadata, + design_factors='condition' # column in metadata +) + +# Run DESeq +dds.deseq2() + +# Get results +stat_res = DeseqStats(dds, contrast=['condition', 'treated', 'control']) +stat_res.summary() +results_df = stat_res.results_df + +# Filter significant genes +sig_genes = results_df[(results_df['padj'] < 0.05) & (abs(results_df['log2FoldChange']) > 1)] +``` + +--- + +## Biopython (Sequence Analysis) + +**Use for:** FASTA/GenBank parsing, BLAST, sequence manipulation, NCBI access. 
+ +```python +from Bio import SeqIO, Entrez +from Bio.Seq import Seq + +# Parse FASTA +for record in SeqIO.parse('sequences.fasta', 'fasta'): + print(f"{record.id}: {len(record.seq)} bp") + +# Sequence manipulation +seq = Seq("ATGCGATCGATCG") +print(seq.complement()) +print(seq.reverse_complement()) +print(seq.translate()) + +# NCBI Entrez (always set email) +Entrez.email = "your.email@example.com" +handle = Entrez.efetch(db="nucleotide", id="NM_001301717", rettype="fasta", retmode="text") +record = SeqIO.read(handle, "fasta") + +# BLAST +from Bio.Blast import NCBIWWW, NCBIXML +result_handle = NCBIWWW.qblast("blastn", "nt", seq) +blast_records = NCBIXML.parse(result_handle) +``` + +--- + +## scVelo (RNA Velocity) + +**Use for:** Inferring cell state transitions and trajectory directions. + +```python +import scvelo as scv + +# Load data with spliced/unspliced counts +adata = scv.read('data.h5ad') # or from loom file + +# Preprocessing +scv.pp.filter_and_normalize(adata, min_shared_counts=20) +scv.pp.moments(adata, n_pcs=30, n_neighbors=30) + +# Velocity estimation +scv.tl.velocity(adata) +scv.tl.velocity_graph(adata) + +# Visualization +scv.pl.velocity_embedding_stream(adata, basis='umap') +scv.pl.velocity_embedding(adata, basis='umap', arrow_length=3) + +# Latent time +scv.tl.latent_time(adata) +scv.pl.scatter(adata, color='latent_time', cmap='viridis') + +# Driver genes +scv.tl.rank_velocity_genes(adata, groupby='clusters') +``` + +--- + +## Installation + +```bash +pip install scanpy anndata pydeseq2 biopython scvelo +``` + +--- + +## See Also + +- For interactive visualization → `DATA_ANALYSIS.md` (plotly, seaborn) +- For gene/protein databases → `GENOMICS_DATABASES.md` diff --git a/features/src/llm-context/skills/scientific/CLINICAL.md b/features/src/llm-context/skills/scientific/CLINICAL.md new file mode 100644 index 000000000..10efd9cbd --- /dev/null +++ b/features/src/llm-context/skills/scientific/CLINICAL.md @@ -0,0 +1,251 @@ +# Clinical Skills + 
+**Trigger:** User asks about clinical trials, PubMed, literature search, survival analysis, or patient data. + +--- + +## Quick Reference + +| Task | Source | Access | +|------|--------|--------| +| Clinical trial data | ClinicalTrials.gov | REST API (v2) | +| Literature search | PubMed | E-utilities API | +| Survival analysis | lifelines | Python package | + +--- + +## ClinicalTrials.gov + +**Use for:** Finding trials by condition/drug, trial status, study design, recruiting locations. + +### API v2 Queries + +```python +import requests + +BASE_URL = "https://clinicaltrials.gov/api/v2" + +# Search studies +response = requests.get(f"{BASE_URL}/studies", params={ + "query.cond": "breast cancer", + "query.intr": "pembrolizumab", + "filter.overallStatus": "RECRUITING", + "pageSize": 10 +}) +data = response.json() + +for study in data['studies']: + info = study['protocolSection']['identificationModule'] + status = study['protocolSection']['statusModule'] + print(f"{info['nctId']}: {info['briefTitle']}") + print(f" Status: {status['overallStatus']}") +``` + +### Get Study by NCT ID + +```python +nct_id = "NCT04379596" +response = requests.get(f"{BASE_URL}/studies/{nct_id}") +study = response.json() + +# Key sections +identification = study['protocolSection']['identificationModule'] +status = study['protocolSection']['statusModule'] +design = study['protocolSection']['designModule'] +eligibility = study['protocolSection']['eligibilityModule'] +outcomes = study['protocolSection'].get('outcomesModule', {}) + +print(f"Title: {identification['briefTitle']}") +print(f"Phase: {design.get('phases', ['N/A'])}") +print(f"Enrollment: {design.get('enrollmentInfo', {}).get('count', 'N/A')}") +``` + +### Search Parameters + +| Parameter | Description | Example | +|-----------|-------------|---------| +| `query.cond` | Condition/disease | "lung cancer" | +| `query.intr` | Intervention/drug | "nivolumab" | +| `query.term` | General search | "checkpoint inhibitor" | +| 
`filter.overallStatus` | Trial status | "RECRUITING", "COMPLETED" | +| `filter.geo` | Location | "distance(39.0,-77.1,50mi)" | +| `filter.advanced` | Phase, age, etc. | "AREA[Phase]PHASE3" | + +--- + +## PubMed (Literature Search) + +**Use for:** Finding papers, abstracts, citation data. + +### E-utilities API + +```python +from Bio import Entrez + +Entrez.email = "your.email@example.com" + +# Search PubMed +handle = Entrez.esearch( + db="pubmed", + term="CRISPR cancer therapy[Title/Abstract] AND 2023[pdat]", + retmax=20 +) +record = Entrez.read(handle) +pmids = record['IdList'] +print(f"Found {record['Count']} articles") + +# Fetch abstracts +handle = Entrez.efetch(db="pubmed", id=pmids, rettype="abstract", retmode="text") +abstracts = handle.read() +print(abstracts) + +# Fetch structured data (MEDLINE text format, which Bio.Medline parses) +handle = Entrez.efetch(db="pubmed", id=pmids[:5], rettype="medline", retmode="text") +from Bio import Medline +records = Medline.parse(handle) +for record in records: + print(f"Title: {record.get('TI', 'N/A')}") + print(f"Authors: {', '.join(record.get('AU', []))}") + print(f"Journal: {record.get('JT', 'N/A')}") + print() +``` + +### Search Syntax + +| Syntax | Description | Example | +|--------|-------------|---------| +| `[Title]` | Search title only | "cancer[Title]" | +| `[Title/Abstract]` | Title or abstract | "EGFR[Title/Abstract]" | +| `[Author]` | Author name | "Smith J[Author]" | +| `[Journal]` | Journal name | "Nature[Journal]" | +| `[pdat]` | Publication date | "2023[pdat]" | +| `AND`, `OR`, `NOT` | Boolean operators | "cancer AND therapy" | +| `[MeSH Terms]` | MeSH vocabulary | "Neoplasms[MeSH Terms]" | + +### REST API Alternative + +```python +import requests + +# E-utilities via REST +base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils" + +# Search +search_url = f"{base_url}/esearch.fcgi" +response = requests.get(search_url, params={ + "db": "pubmed", + "term": "immunotherapy melanoma", + "retmode": "json", + "retmax": 10 +}) +pmids =
response.json()['esearchresult']['idlist'] + +# Fetch summaries +summary_url = f"{base_url}/esummary.fcgi" +response = requests.get(summary_url, params={ + "db": "pubmed", + "id": ",".join(pmids), + "retmode": "json" +}) +summaries = response.json()['result'] +``` + +--- + +## Survival Analysis (Lifelines) + +**Use for:** Kaplan-Meier curves, Cox regression, time-to-event analysis. + +### Kaplan-Meier Estimator + +```python +from lifelines import KaplanMeierFitter +import matplotlib.pyplot as plt + +# Data format: duration (time), event (1=occurred, 0=censored) +durations = [5, 6, 6, 2.5, 4, 4, 1, 2, 3, 4, 5, 6] +events = [1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1] + +kmf = KaplanMeierFitter() +kmf.fit(durations, events, label='Overall Survival') + +# Plot survival curve +kmf.plot_survival_function() +plt.xlabel('Time (months)') +plt.ylabel('Survival Probability') +plt.title('Kaplan-Meier Survival Curve') +plt.show() + +# Median survival +print(f"Median survival: {kmf.median_survival_time_}") + +# Survival at specific time +print(f"Survival at 12 months: {kmf.predict(12):.2%}") +``` + +### Compare Groups + +```python +from lifelines.statistics import logrank_test + +# Group 1 +kmf1 = KaplanMeierFitter() +kmf1.fit(durations_group1, events_group1, label='Treatment') + +# Group 2 +kmf2 = KaplanMeierFitter() +kmf2.fit(durations_group2, events_group2, label='Control') + +# Plot both +ax = kmf1.plot_survival_function() +kmf2.plot_survival_function(ax=ax) +plt.show() + +# Log-rank test +results = logrank_test(durations_group1, durations_group2, events_group1, events_group2) +print(f"Log-rank p-value: {results.p_value:.4f}") +``` + +### Cox Proportional Hazards + +```python +from lifelines import CoxPHFitter +import pandas as pd + +# Data with covariates +df = pd.DataFrame({ + 'duration': durations, + 'event': events, + 'age': [45, 50, 55, 60, 48, 52, 58, 62, 49, 51, 53, 57], + 'treatment': [1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0] +}) + +cph = CoxPHFitter() +cph.fit(df, 
duration_col='duration', event_col='event') + +# Summary with hazard ratios +cph.print_summary() + +# Hazard ratios +print(f"\nHazard Ratios:") +print(cph.hazard_ratios_) + +# Plot coefficients +cph.plot() +plt.show() +``` + +--- + +## Installation + +```bash +pip install biopython requests lifelines matplotlib +``` + +--- + +## See Also + +- For drug/target data → `DRUG_DISCOVERY.md` +- For visualization → `DATA_ANALYSIS.md` diff --git a/features/src/llm-context/skills/scientific/DATA_ANALYSIS.md b/features/src/llm-context/skills/scientific/DATA_ANALYSIS.md new file mode 100644 index 000000000..9c496201c --- /dev/null +++ b/features/src/llm-context/skills/scientific/DATA_ANALYSIS.md @@ -0,0 +1,312 @@ +# Data Analysis Skills + +**Trigger:** User asks about ML, statistics, visualization, plots, sklearn, regression, or classification. + +--- + +## Quick Reference + +| Task | Package | Quick Import | +|------|---------|--------------| +| ML models (classification, regression) | `scikit-learn` | `from sklearn.ensemble import RandomForestClassifier` | +| Statistical tests, regression | `statsmodels` | `import statsmodels.api as sm` | +| Interactive plots | `plotly` | `import plotly.express as px` | +| Statistical visualization | `seaborn` | `import seaborn as sns` | + +--- + +## Scikit-learn (Machine Learning) + +**Use for:** Classification, regression, clustering, dimensionality reduction, model evaluation. 
+ +### Classification + +```python +from sklearn.model_selection import train_test_split, cross_val_score +from sklearn.ensemble import RandomForestClassifier +from sklearn.metrics import classification_report, confusion_matrix +import pandas as pd + +# Load data +df = pd.read_csv('data.csv') +X = df.drop('target', axis=1) +y = df['target'] + +# Split +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) + +# Train +model = RandomForestClassifier(n_estimators=100, random_state=42) +model.fit(X_train, y_train) + +# Evaluate +y_pred = model.predict(X_test) +print(classification_report(y_test, y_pred)) +print(confusion_matrix(y_test, y_pred)) + +# Cross-validation +cv_scores = cross_val_score(model, X, y, cv=5) +print(f"CV Accuracy: {cv_scores.mean():.3f} ± {cv_scores.std():.3f}") + +# Feature importance +importance = pd.DataFrame({ + 'feature': X.columns, + 'importance': model.feature_importances_ +}).sort_values('importance', ascending=False) +``` + +### Regression + +```python +from sklearn.linear_model import LinearRegression, Ridge, Lasso +from sklearn.metrics import mean_squared_error, r2_score + +model = Ridge(alpha=1.0) +model.fit(X_train, y_train) + +y_pred = model.predict(X_test) +print(f"R²: {r2_score(y_test, y_pred):.3f}") +print(f"RMSE: {mean_squared_error(y_test, y_pred) ** 0.5:.3f}") +``` + +### Clustering + +```python +from sklearn.cluster import KMeans +from sklearn.preprocessing import StandardScaler + +# Scale features +scaler = StandardScaler() +X_scaled = scaler.fit_transform(X) + +# K-Means +kmeans = KMeans(n_clusters=3, random_state=42) +clusters = kmeans.fit_predict(X_scaled) + +# Evaluate +from sklearn.metrics import silhouette_score +score = silhouette_score(X_scaled, clusters) +print(f"Silhouette Score: {score:.3f}") +``` + +### Dimensionality Reduction + +```python +from sklearn.decomposition import PCA +from sklearn.manifold import TSNE + +# PCA +pca = PCA(n_components=2) +X_pca =
pca.fit_transform(X_scaled) +print(f"Explained variance: {pca.explained_variance_ratio_.sum():.2%}") + +# t-SNE +tsne = TSNE(n_components=2, random_state=42) +X_tsne = tsne.fit_transform(X_scaled) +``` + +--- + +## Statsmodels (Statistical Analysis) + +**Use for:** Regression with diagnostics, statistical tests, time series. + +### Linear Regression with Diagnostics + +```python +import statsmodels.api as sm +import pandas as pd + +# Add constant for intercept +X_const = sm.add_constant(X) + +# Fit OLS +model = sm.OLS(y, X_const).fit() + +# Full summary with p-values, R², etc. +print(model.summary()) + +# Key metrics +print(f"R-squared: {model.rsquared:.3f}") +print(f"Adj. R-squared: {model.rsquared_adj:.3f}") +print(f"F-statistic p-value: {model.f_pvalue:.2e}") + +# Coefficients with confidence intervals +print(model.conf_int()) +``` + +### Logistic Regression + +```python +model = sm.Logit(y, X_const).fit() +print(model.summary()) + +# Odds ratios +import numpy as np +odds_ratios = np.exp(model.params) +``` + +### Statistical Tests + +```python +from scipy import stats + +# t-test +t_stat, p_value = stats.ttest_ind(group1, group2) + +# ANOVA +f_stat, p_value = stats.f_oneway(group1, group2, group3) + +# Chi-square test +chi2, p_value, dof, expected = stats.chi2_contingency(contingency_table) + +# Correlation +corr, p_value = stats.pearsonr(x, y) +corr, p_value = stats.spearmanr(x, y) + +# Normality test +stat, p_value = stats.shapiro(data) +``` + +--- + +## Plotly (Interactive Visualization) + +**Use for:** Interactive charts, dashboards, web-embeddable plots. 
+ +### Basic Plots + +```python +import plotly.express as px +import pandas as pd + +df = pd.read_csv('data.csv') + +# Scatter plot +fig = px.scatter(df, x='x', y='y', color='category', + hover_data=['name'], title='Scatter Plot') +fig.show() + +# Bar chart +fig = px.bar(df, x='category', y='value', color='group') +fig.show() + +# Line plot +fig = px.line(df, x='date', y='value', color='series') +fig.show() + +# Histogram +fig = px.histogram(df, x='value', nbins=30, color='group') +fig.show() + +# Box plot +fig = px.box(df, x='category', y='value', color='group') +fig.show() +``` + +### Advanced Features + +```python +import plotly.graph_objects as go + +# Multiple traces +fig = go.Figure() +fig.add_trace(go.Scatter(x=x1, y=y1, name='Series 1')) +fig.add_trace(go.Scatter(x=x2, y=y2, name='Series 2')) +fig.update_layout(title='Multi-series Plot') +fig.show() + +# Heatmap +fig = px.imshow(correlation_matrix, text_auto=True, color_continuous_scale='RdBu_r') +fig.show() + +# 3D scatter +fig = px.scatter_3d(df, x='x', y='y', z='z', color='category') +fig.show() +``` + +--- + +## Seaborn (Statistical Visualization) + +**Use for:** Publication-quality statistical plots with pandas integration. 
+ +### Distribution Plots + +```python +import seaborn as sns +import matplotlib.pyplot as plt + +# Histogram with KDE +sns.histplot(data=df, x='value', hue='group', kde=True) +plt.show() + +# KDE plot +sns.kdeplot(data=df, x='value', hue='group', fill=True) +plt.show() + +# Box plot +sns.boxplot(data=df, x='category', y='value', hue='group') +plt.show() + +# Violin plot +sns.violinplot(data=df, x='category', y='value', hue='group', split=True) +plt.show() +``` + +### Relationship Plots + +```python +# Scatter with regression line +sns.regplot(data=df, x='x', y='y') +plt.show() + +# Joint plot (scatter + marginal distributions) +sns.jointplot(data=df, x='x', y='y', kind='reg') +plt.show() + +# Pair plot (all pairwise relationships) +sns.pairplot(df, hue='category') +plt.show() +``` + +### Heatmaps + +```python +# Correlation heatmap (numeric columns only; pandas >= 2.0 raises on string columns otherwise) +corr = df.corr(numeric_only=True) +sns.heatmap(corr, annot=True, cmap='coolwarm', center=0) +plt.show() + +# Clustermap (hierarchical clustering) +sns.clustermap(corr, annot=True, cmap='coolwarm') +plt.show() +``` + +### Styling + +```python +# Set theme +sns.set_theme(style='whitegrid') # darkgrid, white, dark, ticks + +# Figure size +plt.figure(figsize=(10, 6)) + +# Save figure +plt.savefig('plot.png', dpi=300, bbox_inches='tight') +``` + +--- + +## Installation + +```bash +pip install scikit-learn statsmodels plotly seaborn matplotlib pandas +``` + +--- + +## See Also + +- For domain-specific analysis → `BIOINFORMATICS.md`, `DRUG_DISCOVERY.md` +- For dashboards in Workbench → `DASHBOARD_BUILDER.md` diff --git a/features/src/llm-context/skills/scientific/DRUG_DISCOVERY.md b/features/src/llm-context/skills/scientific/DRUG_DISCOVERY.md new file mode 100644 index 000000000..ce1ff3bd4 --- /dev/null +++ b/features/src/llm-context/skills/scientific/DRUG_DISCOVERY.md @@ -0,0 +1,244 @@ +# Drug Discovery Skills + +**Trigger:** User asks about molecules, compounds, drugs, SMILES, fingerprints, ADMET, targets, or bioactivity. 
+ +--- + +## Quick Reference + +| Task | Tool | Quick Access | +|------|------|--------------| +| Molecular structure/properties | `rdkit` | `from rdkit import Chem` | +| ADMET/property prediction | `deepchem` | `import deepchem as dc` | +| Bioactivity data (IC50, Ki) | ChEMBL | REST API | +| Drug info & interactions | DrugBank | REST API | +| Target-disease associations | Open Targets | GraphQL API | + +--- + +## RDKit (Cheminformatics) + +**Use for:** SMILES parsing, molecular descriptors, fingerprints, substructure search, similarity. + +### Basic Operations + +```python +from rdkit import Chem +from rdkit.Chem import Descriptors, AllChem, Draw + +# Parse SMILES +mol = Chem.MolFromSmiles('CC(=O)OC1=CC=CC=C1C(=O)O') # Aspirin +if mol is None: + print("Invalid SMILES") + +# Calculate properties +mw = Descriptors.MolWt(mol) +logp = Descriptors.MolLogP(mol) +hbd = Descriptors.NumHDonors(mol) +hba = Descriptors.NumHAcceptors(mol) +tpsa = Descriptors.TPSA(mol) +rotatable = Descriptors.NumRotatableBonds(mol) + +print(f"MW: {mw:.2f}, LogP: {logp:.2f}, HBD: {hbd}, HBA: {hba}, TPSA: {tpsa:.2f}") + +# Lipinski's Rule of 5 +lipinski_pass = mw <= 500 and logp <= 5 and hbd <= 5 and hba <= 10 +``` + +### Fingerprints & Similarity + +```python +from rdkit import DataStructs +from rdkit.Chem import AllChem + +mol1 = Chem.MolFromSmiles('CCO') +mol2 = Chem.MolFromSmiles('CCCO') + +# Morgan fingerprint (ECFP-like) +fp1 = AllChem.GetMorganFingerprintAsBitVect(mol1, radius=2, nBits=2048) +fp2 = AllChem.GetMorganFingerprintAsBitVect(mol2, radius=2, nBits=2048) + +# Tanimoto similarity +similarity = DataStructs.TanimotoSimilarity(fp1, fp2) +print(f"Similarity: {similarity:.3f}") +``` + +### Substructure Search + +```python +# Define substructure pattern +pattern = Chem.MolFromSmarts('c1ccccc1') # benzene ring + +# Check if molecule contains pattern +has_benzene = mol.HasSubstructMatch(pattern) + +# Find all matches +matches = mol.GetSubstructMatches(pattern) +``` + +--- + +## DeepChem 
(Molecular ML) + +**Use for:** Property prediction, ADMET, toxicity, binding affinity. + +```python +import deepchem as dc + +# Load MoleculeNet dataset (Delaney/ESOL solubility: a regression task) +tasks, datasets, transformers = dc.molnet.load_delaney(featurizer='ECFP') +train, valid, test = datasets + +# Quick model training +model = dc.models.MultitaskRegressor(n_tasks=1, n_features=1024) +model.fit(train, nb_epoch=10) + +# Predict on new molecules +smiles = ['CCO', 'CC(=O)O', 'c1ccccc1'] +featurizer = dc.feat.CircularFingerprint(size=1024) +features = featurizer.featurize(smiles) +predictions = model.predict_on_batch(features) +``` + +### Other MoleculeNet Datasets + +```python +# Load the Tox21 toxicity dataset (classification; this loads data, not a trained model) +tox21_tasks, tox21_datasets, tox21_transformers = dc.molnet.load_tox21() + +# ADMET prediction +# Use relevant MoleculeNet datasets: BBBP, ClinTox, SIDER, etc. +``` + +--- + +## ChEMBL Database + +**Use for:** Bioactivity data, IC50/Ki values, target information. + +### REST API Queries + +```python +import requests + +BASE_URL = "https://www.ebi.ac.uk/chembl/api/data" + +# Search compound by name +response = requests.get(f"{BASE_URL}/molecule/search.json?q=aspirin") +results = response.json()['molecules'] + +# Get bioactivity for a target (e.g., COX-2) +target_id = "CHEMBL230" # COX-2 +response = requests.get(f"{BASE_URL}/activity.json?target_chembl_id={target_id}&limit=100") +activities = response.json()['activities'] + +for act in activities[:5]: + print(f"{act['molecule_chembl_id']}: {act['standard_type']} = {act['standard_value']} {act['standard_units']}") +``` + +### Using chembl_webresource_client + +```python +from chembl_webresource_client.new_client import new_client + +# Search molecules +molecule = new_client.molecule +aspirin = molecule.filter(pref_name__iexact='aspirin')[0] + +# Get activities for target +activity = new_client.activity +target_activities = activity.filter(target_chembl_id='CHEMBL230', pchembl_value__gte=6) + +# Search by SMILES similarity +similarity = 
new_client.similarity +similar_mols = similarity.filter(smiles='CC(=O)Oc1ccccc1C(=O)O', similarity=70) +``` + +--- + +## DrugBank + +**Use for:** Approved drug information, drug-drug interactions, mechanisms. + +```python +import requests + +# Note: DrugBank API requires authentication for full access +# Free tier available at https://go.drugbank.com/ + +# Example: Search drug by name (requires API key) +headers = {'Authorization': 'Bearer YOUR_API_KEY'} +response = requests.get( + 'https://api.drugbank.com/v1/drugs', + params={'q': 'metformin'}, + headers=headers +) +``` + +### DrugBank Data Fields +- Drug name, description, indication +- Mechanism of action +- Drug-drug interactions +- Targets and enzymes +- ADMET properties +- Chemical structure (SMILES, InChI) + +--- + +## Open Targets + +**Use for:** Target-disease associations, genetic evidence, known drugs. + +### GraphQL API + +```python +import requests + +ENDPOINT = "https://api.platform.opentargets.org/api/v4/graphql" + +# Query target-disease associations +query = """ +query targetAssociations($ensemblId: String!) 
{ + target(ensemblId: $ensemblId) { + id + approvedSymbol + associatedDiseases { + rows { + disease { id name } + score + } + } + } +} +""" + +response = requests.post(ENDPOINT, json={ + 'query': query, + 'variables': {'ensemblId': 'ENSG00000157764'} # BRAF +}) +data = response.json()['data']['target'] + +for assoc in data['associatedDiseases']['rows'][:5]: + print(f"{assoc['disease']['name']}: {assoc['score']:.3f}") +``` + +### Common Queries +- Target tractability and safety +- Known drugs for a disease +- Genetic associations (GWAS) +- Pathway information + +--- + +## Installation + +```bash +pip install rdkit deepchem chembl_webresource_client requests +``` + +--- + +## See Also + +- For protein structures → `GENOMICS_DATABASES.md` (PDB, UniProt) +- For clinical trials → `CLINICAL.md` diff --git a/features/src/llm-context/skills/scientific/GENOMICS_DATABASES.md b/features/src/llm-context/skills/scientific/GENOMICS_DATABASES.md new file mode 100644 index 000000000..4939f2884 --- /dev/null +++ b/features/src/llm-context/skills/scientific/GENOMICS_DATABASES.md @@ -0,0 +1,251 @@ +# Genomics Databases Skills + +**Trigger:** User asks about genes, proteins, variants, structures, annotations, Ensembl, UniProt, ClinVar, or PDB. + +--- + +## Quick Reference + +| Need | Database | API | +|------|----------|-----| +| Gene annotations, sequences | Ensembl | REST | +| Protein sequences, functions | UniProt | REST | +| Variant clinical significance | ClinVar | E-utilities | +| 3D protein structures | PDB/RCSB | REST | + +--- + +## Ensembl (Gene Annotations) + +**Use for:** Gene lookups, sequences, variant effect prediction (VEP), orthologs. 
+ +### REST API + +```python +import requests + +SERVER = "https://rest.ensembl.org" + +def ensembl_get(endpoint, params=None): + response = requests.get(f"{SERVER}{endpoint}", + headers={"Content-Type": "application/json"}, + params=params) + return response.json() + +# Lookup gene by symbol +gene = ensembl_get("/lookup/symbol/homo_sapiens/BRCA1", {"expand": 1}) +print(f"Gene ID: {gene['id']}, Location: {gene['seq_region_name']}:{gene['start']}-{gene['end']}") + +# Get gene sequence +seq = ensembl_get(f"/sequence/id/{gene['id']}", {"type": "genomic"}) +print(f"Sequence length: {len(seq['seq'])} bp") + +# Variant Effect Predictor (VEP) +vep_result = requests.post( + f"{SERVER}/vep/human/region", + headers={"Content-Type": "application/json"}, + json={"variants": ["17 41234451 . A G . . ."]} # VCF format +).json() +``` + +### Common Endpoints +- `/lookup/symbol/{species}/{symbol}` - Gene by symbol +- `/lookup/id/{id}` - By Ensembl ID +- `/sequence/id/{id}` - Get sequence +- `/homology/id/{id}` - Orthologs/paralogs +- `/vep/{species}/region` - Variant effects + +--- + +## UniProt (Protein Data) + +**Use for:** Protein sequences, functions, domains, GO terms, cross-references. 
+ +### REST API + +```python +import requests + +BASE_URL = "https://rest.uniprot.org/uniprotkb" + +# Search proteins (reviewed entries only, so each hit carries a recommendedName) +response = requests.get(f"{BASE_URL}/search", params={ + "query": "gene:TP53 AND organism_id:9606 AND reviewed:true", + "format": "json", + "size": 5 +}) +results = response.json()['results'] + +for entry in results: + print(f"{entry['primaryAccession']}: {entry['proteinDescription']['recommendedName']['fullName']['value']}") + +# Get specific protein +protein = requests.get(f"{BASE_URL}/P04637.json").json() +print(f"Length: {protein['sequence']['length']} aa") + +# Get FASTA sequence +fasta = requests.get(f"{BASE_URL}/P04637.fasta").text + +# ID mapping (convert between databases) +mapping_response = requests.post( + "https://rest.uniprot.org/idmapping/run", + data={"from": "UniProtKB_AC-ID", "to": "Ensembl", "ids": "P04637"} +) +``` + +### Key Fields +- `primaryAccession` - UniProt ID (e.g., P04637) +- `proteinDescription` - Protein name +- `genes` - Gene names +- `sequence` - Amino acid sequence +- `features` - Domains, variants, modifications +- `uniProtKBCrossReferences` - Links to other databases + +--- + +## ClinVar (Variant Clinical Significance) + +**Use for:** Variant pathogenicity, clinical interpretations, disease associations. 
+ +### E-utilities API + +```python +from Bio import Entrez +import xml.etree.ElementTree as ET + +Entrez.email = "your.email@example.com" + +# Search variants by gene +handle = Entrez.esearch(db="clinvar", term="BRCA1[gene] AND pathogenic[clinsig]", retmax=10) +record = Entrez.read(handle) +variant_ids = record['IdList'] + +# Get variant details +for vid in variant_ids[:3]: + handle = Entrez.efetch(db="clinvar", id=vid, rettype="vcv", retmode="xml") + # Parse XML response + print(f"Variant ID: {vid}") +``` + +### Direct REST Query + +```python +import requests + +# Search by gene +response = requests.get( + "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi", + params={ + "db": "clinvar", + "term": "BRCA1[gene]", + "retmode": "json", + "retmax": 100 + } +) +ids = response.json()['esearchresult']['idlist'] +``` + +### Clinical Significance Categories +- Pathogenic +- Likely pathogenic +- Uncertain significance (VUS) +- Likely benign +- Benign + +--- + +## PDB/RCSB (Protein Structures) + +**Use for:** 3D structures, structural analysis, drug binding sites. 
+ +### REST API + +```python +import requests + +RCSB_URL = "https://data.rcsb.org/rest/v1/core" +SEARCH_URL = "https://search.rcsb.org/rcsbsearch/v2/query" + +# Get structure metadata +pdb_id = "1TUP" # p53 DNA-binding domain +structure = requests.get(f"{RCSB_URL}/entry/{pdb_id}").json() +print(f"Title: {structure['struct']['title']}") +print(f"Resolution: {structure['rcsb_entry_info'].get('resolution_combined', ['N/A'])} Å") + +# Search structures +search_query = { + "query": { + "type": "terminal", + "service": "full_text", + "parameters": {"value": "kinase inhibitor"} + }, + "return_type": "entry" +} +results = requests.post(SEARCH_URL, json=search_query).json() + +# Download structure file +pdb_file = requests.get(f"https://files.rcsb.org/download/{pdb_id}.pdb").text +cif_file = requests.get(f"https://files.rcsb.org/download/{pdb_id}.cif").text +``` + +### Working with Structure Files + +```python +from Bio.PDB import PDBParser + +parser = PDBParser() +structure = parser.get_structure("protein", "1TUP.pdb") + +for model in structure: + for chain in model: + print(f"Chain {chain.id}: {len(list(chain.get_residues()))} residues") +``` + +--- + +## Combined Workflow Example + +```python +# Find drug targets for a disease, get protein info, check structures + +import requests + +# 1. Open Targets: Find targets for disease +disease_id = "EFO_0000311" # Cancer +# ... (see DRUG_DISCOVERY.md) + +# 2. UniProt: Get protein details +gene = "EGFR" +uniprot = requests.get( + f"https://rest.uniprot.org/uniprotkb/search", + params={"query": f"gene:{gene} AND organism_id:9606", "format": "json"} +).json()['results'][0] +uniprot_id = uniprot['primaryAccession'] + +# 3. 
PDB: Find structures +pdb_search = { + "query": { + "type": "terminal", + "service": "text", + "parameters": {"attribute": "rcsb_polymer_entity.pdbx_description", "operator": "contains_words", "value": gene} + }, + "return_type": "entry" +} +structures = requests.post("https://search.rcsb.org/rcsbsearch/v2/query", json=pdb_search).json() +print(f"Found {structures['total_count']} structures for {gene}") +``` + +--- + +## Installation + +```bash +pip install biopython requests +``` + +--- + +## See Also + +- For sequence analysis → `BIOINFORMATICS.md` (Biopython) +- For drug-target data → `DRUG_DISCOVERY.md` (ChEMBL, Open Targets) diff --git a/features/src/llm-context/templates/README.md b/features/src/llm-context/templates/README.md new file mode 100644 index 000000000..eaec81d60 --- /dev/null +++ b/features/src/llm-context/templates/README.md @@ -0,0 +1,100 @@ +# Workbench App Templates + +Pre-built application templates for Verily Workbench with workspace resource integration. + +## Available Templates + +| Template | Description | Port | Complexity | +|----------|-------------|------|------------| +| [flask-api](./flask-api/) | REST API with Flask for data processing | 8080 | Simple | +| [streamlit-dashboard](./streamlit-dashboard/) | Interactive data dashboard with Streamlit | 8501 | Simple | +| [rshiny-dashboard](./rshiny-dashboard/) | R-based interactive dashboard with Shiny | 3838 | Simple | +| [file-processor](./file-processor/) | File upload, validation, and GCS storage | 8080 | Simple | + +## Features + +All templates include: + +- ✅ **Workspace Integration**: Auto-discovery of GCS buckets and BigQuery datasets +- ✅ **Environment Variables**: `WORKBENCH_<resource_name>` for all resources +- ✅ **LLM Context**: Compatible with `llm-context` feature for Claude/Gemini +- ✅ **Standard Structure**: Consistent devcontainer configuration +- ✅ **Documentation**: README with usage examples + +## Quick Start + +1. Choose a template that matches your use case +2. Copy the template folder to your repository +3. 
Customize the application code +4. Deploy to Workbench + +## Template Structure + +Each template follows this structure: + +``` +template-name/ +├── manifest.yaml # Template metadata & capabilities +├── devcontainer-template.json # Workbench UI registration +├── .devcontainer.json # Devcontainer configuration +├── docker-compose.yaml # Container setup +├── Dockerfile # Build instructions +├── app/ # Application code +│ ├── main.py (or app.R) +│ └── requirements.txt +└── README.md # Usage documentation +``` + +## Workspace Resource Access + +### Python + +```python +import os + +# Get all workspace resources +resources = { + k.replace("WORKBENCH_", ""): v + for k, v in os.environ.items() + if k.startswith("WORKBENCH_") +} + +# Access specific resource +bucket_path = os.environ.get("WORKBENCH_my_bucket") +``` + +### R + +```r +# Get all workspace resources +resources <- Sys.getenv() +workbench_vars <- resources[grepl("^WORKBENCH_", names(resources))] + +# Access specific resource +bucket_path <- Sys.getenv("WORKBENCH_my_bucket") +``` + +## Customization + +1. **Add Dependencies**: Edit `requirements.txt` (Python) or `Dockerfile` (R packages) +2. **Change Port**: Update `docker-compose.yaml` and `.devcontainer.json` +3. **Add Features**: Include additional devcontainer features in `.devcontainer.json` + +## Deployment + +### Via Workbench UI + +1. Push your customized template to a GitHub repository +2. In Workbench, create a new app → Custom App +3. Enter repository URL, branch, and folder path +4. Launch the app + +### Template Manifest + +Each template includes a `manifest.yaml` with: +- **capabilities**: What the template can do +- **inputs**: Configuration options +- **complexity**: Simple, Medium, or Advanced +- **port**: Default exposed port + +This manifest can be used by LLMs to select appropriate templates based on user requirements. 
diff --git a/features/src/llm-context/templates/file-processor/.devcontainer.json b/features/src/llm-context/templates/file-processor/.devcontainer.json new file mode 100644 index 000000000..c3a7c672b --- /dev/null +++ b/features/src/llm-context/templates/file-processor/.devcontainer.json @@ -0,0 +1,13 @@ +{ + "name": "File Processor", + "dockerComposeFile": "docker-compose.yaml", + "service": "app", + "shutdownAction": "none", + "workspaceFolder": "/app", + "remoteUser": "root", + "customizations": { + "workbench": { + "proxyTargetPort": 8080 + } + } +} diff --git a/features/src/llm-context/templates/file-processor/Dockerfile b/features/src/llm-context/templates/file-processor/Dockerfile new file mode 100644 index 000000000..de0660167 --- /dev/null +++ b/features/src/llm-context/templates/file-processor/Dockerfile @@ -0,0 +1,24 @@ +FROM python:3.11-slim + +RUN groupadd -r appuser && useradd -r -g appuser -d /home/appuser -m appuser + +RUN apt-get update && apt-get install -y --no-install-recommends \ + curl fuse \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +# Create directories for file processing +RUN mkdir -p /app/uploads /app/processed /app/schemas && \ + chown -R appuser:appuser /app + +COPY app/requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY app/ . +RUN chown -R appuser:appuser /app /home/appuser + +EXPOSE 8080 +USER appuser + +CMD ["python", "main.py"] diff --git a/features/src/llm-context/templates/file-processor/README.md b/features/src/llm-context/templates/file-processor/README.md new file mode 100644 index 000000000..7add92064 --- /dev/null +++ b/features/src/llm-context/templates/file-processor/README.md @@ -0,0 +1,64 @@ +# File Processor Template + +A file upload and processing template for Verily Workbench with GCS integration. 
+ +## Features + +- **Drag & Drop Upload**: Easy file upload interface +- **Multi-format Support**: CSV, JSON, Excel files +- **Auto-processing**: Extracts metadata, row counts, column info +- **GCS Integration**: Save processed files to workspace buckets +- **Schema Validation**: Validate JSON against schemas + +## Endpoints + +| Endpoint | Method | Description | +|----------|--------|-------------| +| `/` | GET | Web UI for file upload | +| `/health` | GET | Health check | +| `/buckets` | GET | List workspace buckets | +| `/upload` | POST | Upload and process file | +| `/validate` | POST | Validate file against schema | + +## Supported File Types + +| Type | Extensions | Processing | +|------|------------|------------| +| CSV | `.csv` | Row/column counts, schema, null detection | +| JSON | `.json` | Type detection, key enumeration | +| Excel | `.xlsx`, `.xls` | Row/column counts, schema | + +## Customization + +1. Edit `app/main.py` to add processing logic +2. Update `app/requirements.txt` for additional libraries +3. Add validation schemas to `/app/schemas/` + +## Local Testing + +```bash +cd app && pip install -r requirements.txt && python main.py +``` + +Open http://localhost:8080 in your browser. 
+ +## Workspace Resources + +Workspace buckets are auto-discovered: +- `WORKBENCH_` environment variables +- Displayed in the web UI sidebar +- Used for automatic file storage + +## API Usage + +```bash +# Upload a file +curl -X POST http://localhost:8080/upload \ + -F "file=@data.csv" \ + -F "save_to_gcs=true" + +# Validate JSON against schema +curl -X POST http://localhost:8080/validate \ + -F "file=@data.json" \ + -F 'schema={"type": "object", "required": ["id", "name"]}' +``` diff --git a/features/src/llm-context/templates/file-processor/app/main.py b/features/src/llm-context/templates/file-processor/app/main.py new file mode 100644 index 000000000..660a22622 --- /dev/null +++ b/features/src/llm-context/templates/file-processor/app/main.py @@ -0,0 +1,450 @@ +#!/usr/bin/env python3 +""" +File Processor Template for Verily Workbench + +Upload, validate, transform, and store files with GCS integration. +""" + +import os +import json +import uuid +from datetime import datetime +from pathlib import Path + +from flask import Flask, request, jsonify, render_template_string +from google.cloud import storage +import pandas as pd +from jsonschema import validate, ValidationError + +app = Flask(__name__) + +# Configuration +UPLOAD_FOLDER = Path("/app/uploads") +PROCESSED_FOLDER = Path("/app/processed") +SCHEMAS_FOLDER = Path("/app/schemas") +MAX_CONTENT_LENGTH = 100 * 1024 * 1024 # 100MB + +app.config["MAX_CONTENT_LENGTH"] = MAX_CONTENT_LENGTH + +# ============================================================================= +# HTML TEMPLATE +# ============================================================================= + +HTML_TEMPLATE = """ + + + + File Processor + + + +

📁 File Processor

+

Upload, validate, transform, and store files in your Workbench buckets

+ +
+
+
+

📤 Drag & drop a file here, or click to select

+ +

+
+ + + + + +
+ + + +
+

📦 Available Workspace Buckets

+
Loading...
+
+ + + + +""" + +# ============================================================================= +# WORKSPACE HELPERS +# ============================================================================= + +def get_workspace_buckets(): + """Get GCS bucket paths from workspace environment.""" + return { + k.replace("WORKBENCH_", ""): v + for k, v in os.environ.items() + if k.startswith("WORKBENCH_") and v.startswith("gs://") + } + + +def get_gcs_client(): + return storage.Client() + + +def upload_to_gcs(local_path: Path, bucket_name: str, blob_name: str): + """Upload a file to GCS.""" + client = get_gcs_client() + bucket = client.bucket(bucket_name.replace("gs://", "")) + blob = bucket.blob(blob_name) + blob.upload_from_filename(str(local_path)) + return f"gs://{bucket.name}/{blob_name}" + +# ============================================================================= +# PROCESSING FUNCTIONS +# ============================================================================= + +def process_csv(file_path: Path) -> dict: + """Process and validate CSV file.""" + df = pd.read_csv(file_path) + return { + "rows": len(df), + "columns": len(df.columns), + "column_names": list(df.columns), + "dtypes": {col: str(dtype) for col, dtype in df.dtypes.items()}, + "null_counts": df.isnull().sum().to_dict(), + "sample": df.head(5).to_dict(orient="records") + } + + +def process_json(file_path: Path) -> dict: + """Process and validate JSON file.""" + with open(file_path) as f: + data = json.load(f) + + if isinstance(data, list): + return { + "type": "array", + "length": len(data), + "sample": data[:5] if len(data) > 5 else data + } + else: + return { + "type": "object", + "keys": list(data.keys()), + "sample": data + } + + +def process_excel(file_path: Path) -> dict: + """Process Excel file.""" + df = pd.read_excel(file_path) + return { + "rows": len(df), + "columns": len(df.columns), + "column_names": list(df.columns), + "sample": df.head(5).to_dict(orient="records") + } + +# 
# =============================================================================
# ROUTES
# =============================================================================

@app.route("/")
def index():
    """Serve the web UI by rendering HTML_TEMPLATE."""
    return render_template_string(HTML_TEMPLATE)


@app.route("/health")
def health():
    """Health check: always answers with a fixed "healthy" status."""
    return jsonify({"status": "healthy"})


@app.route("/buckets")
def list_buckets():
    """List available workspace buckets."""
    return jsonify(get_workspace_buckets())


@app.route("/upload", methods=["POST"])
def upload_file():
    """Upload a file, summarize its contents, and optionally copy it to GCS.

    Form fields:
        file: The uploaded file (required). Supported extensions are
            .csv, .json, .xlsx and .xls (case-insensitive).
        save_to_gcs: "true" to also copy the stored upload to the first
            bucket returned by get_workspace_buckets() (optional).

    Returns:
        The processing summary as JSON on success; a JSON ``error`` with
        status 400 for a missing, unnamed or unsupported upload; a JSON
        ``error`` with status 500 if saving, processing or the GCS copy fails.
    """
    if "file" not in request.files:
        return jsonify({"error": "No file provided"}), 400

    file = request.files["file"]
    if not file.filename:
        return jsonify({"error": "No file selected"}), 400

    # The client controls the filename, so keep only its final path component
    # (treating "\" as a separator too) before it is used to build the local
    # path and the GCS object name.
    safe_name = Path(file.filename.replace("\\", "/")).name
    if not safe_name:
        return jsonify({"error": "No file selected"}), 400

    # Pick the processor from the extension before anything is written, so a
    # rejected upload leaves no file behind in UPLOAD_FOLDER.
    processors = {
        ".csv": process_csv,
        ".json": process_json,
        ".xlsx": process_excel,
        ".xls": process_excel,
    }
    suffix = Path(safe_name).suffix.lower()
    processor = processors.get(suffix)
    if processor is None:
        return jsonify({"error": f"Unsupported file type: {suffix}"}), 400

    # Timestamp + short random id keep concurrent uploads of the same name apart.
    file_id = str(uuid.uuid4())[:8]
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{timestamp}_{file_id}_{safe_name}"
    file_path = UPLOAD_FOLDER / filename

    try:
        # Saving is inside the try so a disk error is reported as JSON like
        # every other failure of this endpoint.
        file.save(file_path)

        result = processor(file_path)
        result["message"] = f"Successfully processed {safe_name}"
        result["filename"] = filename

        # Optionally save to GCS
        save_to_gcs = request.form.get("save_to_gcs", "false").lower() == "true"
        if save_to_gcs:
            buckets = get_workspace_buckets()
            if buckets:
                # Use first available bucket
                bucket_name = next(iter(buckets.values()))
                blob_name = f"processed/{filename}"
                result["gcs_path"] = upload_to_gcs(file_path, bucket_name, blob_name)
            else:
                result["warning"] = "No GCS buckets found in workspace"

        return jsonify(result)

    except Exception as e:
        # Deliberate catch-all: the UI expects a JSON error body for any failure.
        return jsonify({"error": str(e)}), 500
+@app.route("/validate", methods=["POST"]) +def validate_file(): + """Validate file against a JSON schema.""" + if "file" not in request.files: + return jsonify({"error": "No file provided"}), 400 + + if "schema" not in request.form: + return jsonify({"error": "No schema provided"}), 400 + + file = request.files["file"] + schema = json.loads(request.form["schema"]) + + try: + data = json.load(file) + validate(instance=data, schema=schema) + return jsonify({"valid": True, "message": "Validation passed"}) + except ValidationError as e: + return jsonify({"valid": False, "error": str(e.message)}) + except Exception as e: + return jsonify({"error": str(e)}), 500 + +# ============================================================================= +# MAIN +# ============================================================================= + +if __name__ == "__main__": + UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True) + PROCESSED_FOLDER.mkdir(parents=True, exist_ok=True) + + port = int(os.environ.get("PORT", 8080)) + app.run(host="0.0.0.0", port=port, debug=False) diff --git a/features/src/llm-context/templates/file-processor/app/requirements.txt b/features/src/llm-context/templates/file-processor/app/requirements.txt new file mode 100644 index 000000000..314a27c6e --- /dev/null +++ b/features/src/llm-context/templates/file-processor/app/requirements.txt @@ -0,0 +1,8 @@ +flask==3.0.0 +gunicorn==21.2.0 +google-cloud-storage==2.14.0 +google-cloud-bigquery==3.14.0 +pandas==2.1.4 +jsonschema==4.20.0 +pyarrow==14.0.2 +openpyxl==3.1.2 diff --git a/features/src/llm-context/templates/file-processor/devcontainer-template.json b/features/src/llm-context/templates/file-processor/devcontainer-template.json new file mode 100644 index 000000000..b84fca2bd --- /dev/null +++ b/features/src/llm-context/templates/file-processor/devcontainer-template.json @@ -0,0 +1,23 @@ +{ + "id": "file-processor", + "version": "1.0.0", + "name": "File Processor", + "description": "Upload, validate, and 
transform files with GCS integration", + "documentationURL": "https://github.com/aculotti-verily/wb-app-mcp-and-context/tree/master/src/templates/file-processor", + "licenseURL": "https://github.com/aculotti-verily/wb-app-mcp-and-context/blob/master/LICENSE", + "options": { + "cloud": { + "type": "string", + "description": "VM cloud environment", + "proposals": ["gcp", "aws"], + "default": "gcp" + }, + "login": { + "type": "string", + "description": "Whether to log in to workbench CLI", + "proposals": ["true", "false"], + "default": "false" + } + }, + "platforms": ["Any"] +} diff --git a/features/src/llm-context/templates/file-processor/docker-compose.yaml b/features/src/llm-context/templates/file-processor/docker-compose.yaml new file mode 100644 index 000000000..505717e81 --- /dev/null +++ b/features/src/llm-context/templates/file-processor/docker-compose.yaml @@ -0,0 +1,32 @@ +services: + app: + container_name: "application-server" + build: + context: . + dockerfile: Dockerfile + restart: always + volumes: + - .:/workspace:cached + - uploads:/app/uploads + - processed:/app/processed + ports: + - "8080:8080" + environment: + - MAX_UPLOAD_SIZE=100MB + - PYTHONUNBUFFERED=1 + networks: + - app-network + cap_add: + - SYS_ADMIN + devices: + - /dev/fuse + security_opt: + - apparmor:unconfined + +volumes: + uploads: + processed: + +networks: + app-network: + external: true diff --git a/features/src/llm-context/templates/file-processor/manifest.yaml b/features/src/llm-context/templates/file-processor/manifest.yaml new file mode 100644 index 000000000..7246d50a5 --- /dev/null +++ b/features/src/llm-context/templates/file-processor/manifest.yaml @@ -0,0 +1,33 @@ +id: file-processor +name: File Processor +description: Upload, validate, and transform files with GCS integration +version: 1.0.0 + +capabilities: + - file-upload + - file-validation + - data-transformation + - gcs-access + - csv-processing + - json-processing + +inputs: + - name: app_name + type: string + 
required: true + default: "file-processor" + + - name: validation_schema + type: object + required: false + description: JSON schema for file validation + + - name: output_bucket + type: resource + resource_type: GCS_BUCKET + required: false + description: Bucket to store processed files + +complexity: simple +estimated_build_time: 3min +port: 8080 diff --git a/features/src/llm-context/templates/flask-api/.devcontainer.json b/features/src/llm-context/templates/flask-api/.devcontainer.json new file mode 100644 index 000000000..70b53c427 --- /dev/null +++ b/features/src/llm-context/templates/flask-api/.devcontainer.json @@ -0,0 +1,13 @@ +{ + "name": "Flask API", + "dockerComposeFile": "docker-compose.yaml", + "service": "app", + "shutdownAction": "none", + "workspaceFolder": "/app", + "remoteUser": "root", + "customizations": { + "workbench": { + "proxyTargetPort": 8080 + } + } +} diff --git a/features/src/llm-context/templates/flask-api/Dockerfile b/features/src/llm-context/templates/flask-api/Dockerfile new file mode 100644 index 000000000..5571806ca --- /dev/null +++ b/features/src/llm-context/templates/flask-api/Dockerfile @@ -0,0 +1,32 @@ +FROM python:3.11-slim + +# Create non-root user +RUN groupadd -r appuser && useradd -r -g appuser -d /home/appuser -m appuser + +# Install system dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + curl \ + fuse \ + && rm -rf /var/lib/apt/lists/* + +# Set working directory +WORKDIR /app + +# Copy requirements first for caching +COPY app/requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application code +COPY app/ . 
+ +# Set ownership +RUN chown -R appuser:appuser /app /home/appuser + +# Expose port +EXPOSE 8080 + +# Run as non-root user +USER appuser + +# Start application +CMD ["python", "main.py"] diff --git a/features/src/llm-context/templates/flask-api/README.md b/features/src/llm-context/templates/flask-api/README.md new file mode 100644 index 000000000..6e1b5cf1a --- /dev/null +++ b/features/src/llm-context/templates/flask-api/README.md @@ -0,0 +1,48 @@ +# Flask REST API Template + +A REST API template for Verily Workbench with built-in support for GCS and BigQuery. + +## Endpoints + +| Endpoint | Method | Description | +|----------|--------|-------------| +| `/health` | GET | Health check | +| `/resources` | GET | List workspace resources | +| `/buckets/<bucket_name>/files` | GET | List files in bucket | +| `/buckets/<bucket_name>/upload` | POST | Upload file to bucket | +| `/bigquery/query` | POST | Run BigQuery query | +| `/bigquery/tables/<dataset>` | GET | List tables in dataset | +| `/process` | POST | Custom processing endpoint | + +## Customization + +1. Edit `app/main.py` to add your endpoints +2. Update `app/requirements.txt` for additional dependencies +3. 
Modify `docker-compose.yaml` for environment variables + +## Local Testing + +```bash +cd app && pip install -r requirements.txt && python main.py +``` + +## Workspace Resources + +Access workspace buckets and datasets via environment variables: +- Each `WORKBENCH_`-prefixed variable contains the path of one workspace resource +- Use `GET /resources` to see all available resources + +## Example Usage + +```bash +# Check health +curl http://localhost:8080/health + +# List resources +curl http://localhost:8080/resources + +# Query BigQuery +curl -X POST http://localhost:8080/bigquery/query \ + -H "Content-Type: application/json" \ + -d '{"query": "SELECT * FROM `project.dataset.table` LIMIT 10"}' +``` diff --git a/features/src/llm-context/templates/flask-api/app/main.py b/features/src/llm-context/templates/flask-api/app/main.py new file mode 100644 index 000000000..e0a9ab528 --- /dev/null +++ b/features/src/llm-context/templates/flask-api/app/main.py @@ -0,0 +1,201 @@ +#!/usr/bin/env python3 +""" +Flask REST API Template for Verily Workbench + +This template provides a starting point for building REST APIs that +integrate with workspace resources (GCS buckets, BigQuery tables). +""" + +import os +import json +from flask import Flask, request, jsonify +from google.cloud import storage, bigquery + +app = Flask(__name__) + +# ============================================================================= +# WORKSPACE RESOURCE HELPERS +# ============================================================================= + +def get_workspace_resources(): + """ + Get workspace resources from environment variables. + + Workbench automatically sets WORKBENCH_ environment variables + for each resource in the workspace. 
+ """ + resources = {} + for key, value in os.environ.items(): + if key.startswith("WORKBENCH_"): + resource_name = key.replace("WORKBENCH_", "").lower() + resources[resource_name] = value + return resources + + +def get_bucket_client(): + """Get a GCS client for workspace buckets.""" + return storage.Client() + + +def get_bigquery_client(): + """Get a BigQuery client for workspace datasets.""" + return bigquery.Client() + + +# ============================================================================= +# API ENDPOINTS +# ============================================================================= + +@app.route("/health", methods=["GET"]) +def health(): + """Health check endpoint.""" + return jsonify({ + "status": "healthy", + "service": "flask-api" + }) + + +@app.route("/resources", methods=["GET"]) +def list_resources(): + """List all workspace resources available to this app.""" + return jsonify({ + "resources": get_workspace_resources() + }) + + +@app.route("/buckets//files", methods=["GET"]) +def list_bucket_files(bucket_name: str): + """ + List files in a workspace bucket. + + Example: GET /buckets/my-bucket/files + """ + try: + # Remove gs:// prefix if present + bucket_name = bucket_name.replace("gs://", "") + + client = get_bucket_client() + bucket = client.bucket(bucket_name) + + prefix = request.args.get("prefix", "") + blobs = bucket.list_blobs(prefix=prefix) + + files = [{"name": blob.name, "size": blob.size} for blob in blobs] + + return jsonify({ + "bucket": bucket_name, + "files": files, + "count": len(files) + }) + except Exception as e: + return jsonify({"error": str(e)}), 500 + + +@app.route("/buckets//upload", methods=["POST"]) +def upload_file(bucket_name: str): + """ + Upload a file to a workspace bucket. 
+ + Example: POST /buckets/my-bucket/upload + Body: multipart/form-data with 'file' field + """ + try: + if "file" not in request.files: + return jsonify({"error": "No file provided"}), 400 + + file = request.files["file"] + dest_path = request.form.get("path", file.filename) + + bucket_name = bucket_name.replace("gs://", "") + client = get_bucket_client() + bucket = client.bucket(bucket_name) + blob = bucket.blob(dest_path) + + blob.upload_from_file(file) + + return jsonify({ + "success": True, + "path": f"gs://{bucket_name}/{dest_path}" + }) + except Exception as e: + return jsonify({"error": str(e)}), 500 + + +@app.route("/bigquery/query", methods=["POST"]) +def run_query(): + """ + Run a BigQuery query. + + Example: POST /bigquery/query + Body: {"query": "SELECT * FROM `project.dataset.table` LIMIT 10"} + """ + try: + data = request.get_json() + query = data.get("query") + + if not query: + return jsonify({"error": "No query provided"}), 400 + + client = get_bigquery_client() + result = client.query(query).to_dataframe() + + return jsonify({ + "columns": list(result.columns), + "rows": result.to_dict(orient="records"), + "count": len(result) + }) + except Exception as e: + return jsonify({"error": str(e)}), 500 + + +@app.route("/bigquery/tables/", methods=["GET"]) +def list_tables(dataset: str): + """ + List tables in a BigQuery dataset. + + Example: GET /bigquery/tables/my-project.my-dataset + """ + try: + client = get_bigquery_client() + tables = client.list_tables(dataset) + + table_list = [{"table_id": t.table_id, "table_type": t.table_type} for t in tables] + + return jsonify({ + "dataset": dataset, + "tables": table_list, + "count": len(table_list) + }) + except Exception as e: + return jsonify({"error": str(e)}), 500 + + +@app.route("/process", methods=["POST"]) +def process_data(): + """ + Example data processing endpoint. + + Customize this endpoint for your specific use case. 
+ """ + try: + data = request.get_json() + + # TODO: Add your processing logic here + result = { + "input": data, + "processed": True, + "message": "Processing complete" + } + + return jsonify(result) + except Exception as e: + return jsonify({"error": str(e)}), 500 + + +# ============================================================================= +# MAIN +# ============================================================================= + +if __name__ == "__main__": + port = int(os.environ.get("PORT", 8080)) + app.run(host="0.0.0.0", port=port, debug=False) diff --git a/features/src/llm-context/templates/flask-api/app/requirements.txt b/features/src/llm-context/templates/flask-api/app/requirements.txt new file mode 100644 index 000000000..f283a3b96 --- /dev/null +++ b/features/src/llm-context/templates/flask-api/app/requirements.txt @@ -0,0 +1,6 @@ +flask==3.0.0 +gunicorn==21.2.0 +google-cloud-storage==2.14.0 +google-cloud-bigquery==3.14.0 +pandas==2.1.4 +pyarrow==14.0.2 diff --git a/features/src/llm-context/templates/flask-api/devcontainer-template.json b/features/src/llm-context/templates/flask-api/devcontainer-template.json new file mode 100644 index 000000000..c82d57371 --- /dev/null +++ b/features/src/llm-context/templates/flask-api/devcontainer-template.json @@ -0,0 +1,23 @@ +{ + "id": "flask-api", + "version": "1.0.0", + "name": "Flask REST API", + "description": "REST API with Flask for data processing and backend services", + "documentationURL": "https://github.com/aculotti-verily/wb-app-mcp-and-context/tree/master/src/templates/flask-api", + "licenseURL": "https://github.com/aculotti-verily/wb-app-mcp-and-context/blob/master/LICENSE", + "options": { + "cloud": { + "type": "string", + "description": "VM cloud environment", + "proposals": ["gcp", "aws"], + "default": "gcp" + }, + "login": { + "type": "string", + "description": "Whether to log in to workbench CLI", + "proposals": ["true", "false"], + "default": "false" + } + }, + "platforms": ["Any"] +} diff 
--git a/features/src/llm-context/templates/flask-api/docker-compose.yaml b/features/src/llm-context/templates/flask-api/docker-compose.yaml new file mode 100644 index 000000000..aedcc8ab2 --- /dev/null +++ b/features/src/llm-context/templates/flask-api/docker-compose.yaml @@ -0,0 +1,20 @@ +services: + app: + container_name: "application-server" + build: + context: . + dockerfile: Dockerfile + restart: always + volumes: + # Mount the template at /workspace, not /app: a bind mount on /app + # hides the main.py the Dockerfile copied there, so the image CMD + # (python main.py) would not find its file. + - .:/workspace:cached + ports: + - "8080:8080" + environment: + - FLASK_ENV=production + - FLASK_APP=app/main.py + networks: + - app-network + +networks: + app-network: + external: true diff --git a/features/src/llm-context/templates/flask-api/manifest.yaml b/features/src/llm-context/templates/flask-api/manifest.yaml new file mode 100644 index 000000000..c9003e95c --- /dev/null +++ b/features/src/llm-context/templates/flask-api/manifest.yaml @@ -0,0 +1,41 @@ +id: flask-api +name: Flask REST API +description: REST API with Flask for data processing and backend services +version: 1.0.0 + +capabilities: + - rest-api + - json-processing + - file-upload + - bigquery-access + - gcs-access + - authentication + +inputs: + - name: app_name + type: string + required: true + description: Name of the application + default: "my-api" + + - name: endpoints + type: list + required: false + description: API endpoints to create + default: ["/health", "/process"] + + - name: gcs_buckets + type: list[resource] + resource_type: GCS_BUCKET + required: false + description: GCS buckets to access + + - name: bq_datasets + type: list[resource] + resource_type: BQ_DATASET + required: false + description: BigQuery datasets to access + +complexity: simple +estimated_build_time: 3min +port: 8080 diff --git a/features/src/llm-context/templates/rshiny-dashboard/.devcontainer.json b/features/src/llm-context/templates/rshiny-dashboard/.devcontainer.json new file mode 100644 index 000000000..790c2c976 --- /dev/null +++ 
b/features/src/llm-context/templates/rshiny-dashboard/.devcontainer.json @@ -0,0 +1,13 @@ +{ + "name": "RShiny Dashboard", + "dockerComposeFile": "docker-compose.yaml", + "service": "app", + "shutdownAction": "none", + "workspaceFolder": "/app", + "remoteUser": "root", + "customizations": { + "workbench": { + "proxyTargetPort": 3838 + } + } +} diff --git a/features/src/llm-context/templates/rshiny-dashboard/Dockerfile b/features/src/llm-context/templates/rshiny-dashboard/Dockerfile new file mode 100644 index 000000000..7ff70049a --- /dev/null +++ b/features/src/llm-context/templates/rshiny-dashboard/Dockerfile @@ -0,0 +1,40 @@ +FROM rocker/shiny:4.3.2 + +# Install system dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + libcurl4-openssl-dev \ + libssl-dev \ + libxml2-dev \ + fuse \ + && rm -rf /var/lib/apt/lists/* + +# Install R packages +RUN R -e "install.packages(c( \ + 'shiny', \ + 'shinydashboard', \ + 'DT', \ + 'plotly', \ + 'ggplot2', \ + 'dplyr', \ + 'tidyr', \ + 'bigrquery', \ + 'googleCloudStorageR' \ +), repos='https://cran.rstudio.com/')" + +# Create app directory +RUN mkdir -p /srv/shiny-server/app + +# Copy application +COPY app/ /srv/shiny-server/ + +# Copy Shiny server config +COPY shiny-server.conf /etc/shiny-server/shiny-server.conf + +# Set permissions +RUN chown -R shiny:shiny /srv/shiny-server + +EXPOSE 3838 + +USER shiny + +CMD ["/usr/bin/shiny-server"] diff --git a/features/src/llm-context/templates/rshiny-dashboard/README.md b/features/src/llm-context/templates/rshiny-dashboard/README.md new file mode 100644 index 000000000..69757368f --- /dev/null +++ b/features/src/llm-context/templates/rshiny-dashboard/README.md @@ -0,0 +1,72 @@ +# RShiny Dashboard Template + +An interactive R-based dashboard template for Verily Workbench with Shiny. 
+ +## Features + +- **Data Explorer**: Upload and explore CSV files +- **Visualization**: Create interactive charts with plotly +- **Workspace Resources**: View connected buckets and datasets +- **R Statistical Analysis**: Full R environment for data analysis + +## Tabs + +| Tab | Description | +|-----|-------------| +| Overview | Dashboard summary with resource counts | +| Data Explorer | Upload CSV files, view data tables | +| Visualization | Create scatter, line, bar, histogram charts | +| Resources | View all workspace resources | + +## R Packages Included + +- `shiny` & `shinydashboard` - UI framework +- `DT` - Interactive data tables +- `plotly` & `ggplot2` - Visualization +- `dplyr` & `tidyr` - Data manipulation +- `bigrquery` - BigQuery integration +- `googleCloudStorageR` - GCS integration + +## Customization + +1. Edit `app/app.R` to add new features +2. Modify `Dockerfile` to add R packages +3. Update dashboard layout in the UI section + +## Local Testing + +```bash +R -e "shiny::runApp('app', port=3838)" +``` + +## Workspace Resources + +Access workspace resources via environment variables: +- Each `WORKBENCH_`-prefixed variable contains the path of one workspace resource +- Use `Sys.getenv()` to access in R code + +## BigQuery Access Example + +```r +library(bigrquery) + +# Run a query +query <- "SELECT * FROM `project.dataset.table` LIMIT 100" +result <- bq_project_query("your-project", query) +df <- bq_table_download(result) +``` + +## GCS Access Example + +```r +library(googleCloudStorageR) + +# Set bucket +gcs_global_bucket("your-bucket-name") + +# List objects +objects <- gcs_list_objects() + +# Download file +gcs_get_object("path/to/file.csv", saveToDisk = "local_file.csv") +``` diff --git a/features/src/llm-context/templates/rshiny-dashboard/app/app.R b/features/src/llm-context/templates/rshiny-dashboard/app/app.R new file mode 100644 index 000000000..8a607b69f --- /dev/null +++ b/features/src/llm-context/templates/rshiny-dashboard/app/app.R @@ -0,0 +1,222 @@ +# 
============================================================================= +# RShiny Dashboard Template for Verily Workbench +# ============================================================================= + +library(shiny) +library(shinydashboard) +library(DT) +library(plotly) +library(ggplot2) +library(dplyr) + +# ============================================================================= +# WORKSPACE HELPERS +# ============================================================================= + +get_workspace_resources <- function() { + env_vars <- Sys.getenv() + workbench_vars <- env_vars[grepl("^WORKBENCH_", names(env_vars))] + names(workbench_vars) <- gsub("^WORKBENCH_", "", names(workbench_vars)) + as.list(workbench_vars) +} + +# Get workspace resources +resources <- get_workspace_resources() + +# ============================================================================= +# UI +# ============================================================================= + +ui <- dashboardPage( + dashboardHeader(title = "Workbench Dashboard"), + + dashboardSidebar( + sidebarMenu( + menuItem("Overview", tabName = "overview", icon = icon("dashboard")), + menuItem("Data Explorer", tabName = "data", icon = icon("table")), + menuItem("Visualization", tabName = "viz", icon = icon("chart-line")), + menuItem("Resources", tabName = "resources", icon = icon("cloud")) + ) + ), + + dashboardBody( + tabItems( + # Overview Tab + tabItem( + tabName = "overview", + fluidRow( + box( + title = "Welcome to Your Workbench Dashboard", + status = "primary", + solidHeader = TRUE, + width = 12, + p("This RShiny template integrates with your Workbench workspace resources."), + p("Use the sidebar to navigate between data exploration and visualization.") + ) + ), + fluidRow( + valueBoxOutput("resource_count"), + valueBoxOutput("bucket_count"), + valueBoxOutput("dataset_count") + ) + ), + + # Data Explorer Tab + tabItem( + tabName = "data", + fluidRow( + box( + title = "Upload Data", + status = 
"info", + solidHeader = TRUE, + width = 4, + fileInput("file_upload", "Choose CSV File", accept = ".csv"), + actionButton("load_data", "Load Data", class = "btn-primary") + ), + box( + title = "Data Preview", + status = "success", + solidHeader = TRUE, + width = 8, + DTOutput("data_table") + ) + ) + ), + + # Visualization Tab + tabItem( + tabName = "viz", + fluidRow( + box( + title = "Chart Settings", + status = "warning", + solidHeader = TRUE, + width = 3, + selectInput("x_var", "X Variable", choices = NULL), + selectInput("y_var", "Y Variable", choices = NULL), + selectInput("chart_type", "Chart Type", + choices = c("Scatter", "Line", "Bar", "Histogram")), + actionButton("create_chart", "Create Chart", class = "btn-success") + ), + box( + title = "Chart", + status = "primary", + solidHeader = TRUE, + width = 9, + plotlyOutput("main_chart", height = "500px") + ) + ) + ), + + # Resources Tab + tabItem( + tabName = "resources", + fluidRow( + box( + title = "Workspace Resources", + status = "info", + solidHeader = TRUE, + width = 12, + DTOutput("resources_table") + ) + ) + ) + ) + ) +) + +# ============================================================================= +# SERVER +# ============================================================================= + +server <- function(input, output, session) { + + # Reactive values + data <- reactiveVal(NULL) + + # Load data from file upload + observeEvent(input$load_data, { + req(input$file_upload) + df <- read.csv(input$file_upload$datapath) + data(df) + + # Update variable selectors + updateSelectInput(session, "x_var", choices = names(df)) + updateSelectInput(session, "y_var", choices = names(df)) + }) + + # Data table output + output$data_table <- renderDT({ + req(data()) + datatable(data(), options = list(pageLength = 10, scrollX = TRUE)) + }) + + # Value boxes + output$resource_count <- renderValueBox({ + valueBox( + length(resources), + "Workspace Resources", + icon = icon("folder"), + color = "blue" + ) + }) + + 
output$bucket_count <- renderValueBox({ + bucket_count <- sum(grepl("^gs://", unlist(resources))) + valueBox( + bucket_count, + "GCS Buckets", + icon = icon("cloud"), + color = "green" + ) + }) + + output$dataset_count <- renderValueBox({ + dataset_count <- sum(grepl("bigquery://", unlist(resources))) + valueBox( + dataset_count, + "BigQuery Datasets", + icon = icon("database"), + color = "purple" + ) + }) + + # Resources table + output$resources_table <- renderDT({ + df <- data.frame( + Name = names(resources), + Path = unlist(resources), + stringsAsFactors = FALSE + ) + datatable(df, options = list(pageLength = 20)) + }) + + # Create chart + observeEvent(input$create_chart, { + req(data(), input$x_var, input$y_var) + + df <- data() + + output$main_chart <- renderPlotly({ + p <- switch( + input$chart_type, + "Scatter" = ggplot(df, aes_string(x = input$x_var, y = input$y_var)) + + geom_point(alpha = 0.6), + "Line" = ggplot(df, aes_string(x = input$x_var, y = input$y_var)) + + geom_line(), + "Bar" = ggplot(df, aes_string(x = input$x_var, y = input$y_var)) + + geom_bar(stat = "identity"), + "Histogram" = ggplot(df, aes_string(x = input$x_var)) + + geom_histogram(bins = 30) + ) + + ggplotly(p + theme_minimal()) + }) + }) +} + +# ============================================================================= +# RUN APP +# ============================================================================= + +shinyApp(ui = ui, server = server) diff --git a/features/src/llm-context/templates/rshiny-dashboard/devcontainer-template.json b/features/src/llm-context/templates/rshiny-dashboard/devcontainer-template.json new file mode 100644 index 000000000..e2947a096 --- /dev/null +++ b/features/src/llm-context/templates/rshiny-dashboard/devcontainer-template.json @@ -0,0 +1,23 @@ +{ + "id": "rshiny-dashboard", + "version": "1.0.0", + "name": "RShiny Dashboard", + "description": "Interactive R-based dashboard with Shiny for statistical analysis and visualization", + "documentationURL": 
"https://github.com/aculotti-verily/wb-app-mcp-and-context/tree/master/src/templates/rshiny-dashboard", + "licenseURL": "https://github.com/aculotti-verily/wb-app-mcp-and-context/blob/master/LICENSE", + "options": { + "cloud": { + "type": "string", + "description": "VM cloud environment", + "proposals": ["gcp", "aws"], + "default": "gcp" + }, + "login": { + "type": "string", + "description": "Whether to log in to workbench CLI", + "proposals": ["true", "false"], + "default": "false" + } + }, + "platforms": ["Any"] +} diff --git a/features/src/llm-context/templates/rshiny-dashboard/docker-compose.yaml b/features/src/llm-context/templates/rshiny-dashboard/docker-compose.yaml new file mode 100644 index 000000000..7802142d7 --- /dev/null +++ b/features/src/llm-context/templates/rshiny-dashboard/docker-compose.yaml @@ -0,0 +1,29 @@ +services: + app: + container_name: "application-server" + build: + context: . + dockerfile: Dockerfile + restart: always + volumes: + - .:/workspace:cached + - app-data:/home/shiny/data + ports: + - "3838:3838" + environment: + - SHINY_LOG_LEVEL=TRACE + networks: + - app-network + cap_add: + - SYS_ADMIN + devices: + - /dev/fuse + security_opt: + - apparmor:unconfined + +volumes: + app-data: + +networks: + app-network: + external: true diff --git a/features/src/llm-context/templates/rshiny-dashboard/manifest.yaml b/features/src/llm-context/templates/rshiny-dashboard/manifest.yaml new file mode 100644 index 000000000..9d69bfda3 --- /dev/null +++ b/features/src/llm-context/templates/rshiny-dashboard/manifest.yaml @@ -0,0 +1,39 @@ +id: rshiny-dashboard +name: RShiny Dashboard +description: Interactive R-based dashboard with Shiny +version: 1.0.0 + +capabilities: + - data-visualization + - interactive-ui + - statistical-analysis + - r-language + - bigquery-access + - gcs-access + +inputs: + - name: app_name + type: string + required: true + default: "my-shiny-app" + + - name: dashboard_title + type: string + required: false + default: "R Shiny 
Dashboard" + + - name: gcs_buckets + type: list[resource] + resource_type: GCS_BUCKET + required: false + description: GCS buckets to access + + - name: bq_datasets + type: list[resource] + resource_type: BQ_DATASET + required: false + description: BigQuery datasets to access + +complexity: simple +estimated_build_time: 5min +port: 3838 diff --git a/features/src/llm-context/templates/rshiny-dashboard/shiny-server.conf b/features/src/llm-context/templates/rshiny-dashboard/shiny-server.conf new file mode 100644 index 000000000..b09f57a12 --- /dev/null +++ b/features/src/llm-context/templates/rshiny-dashboard/shiny-server.conf @@ -0,0 +1,14 @@ +# Define the user we should use when spawning R Shiny processes +run_as shiny; + +# Define a top-level server which will listen on a port +server { + listen 3838; + + # Define the location available at the base URL + location / { + site_dir /srv/shiny-server; + log_dir /var/log/shiny-server; + directory_index on; + } +} diff --git a/features/src/llm-context/templates/streamlit-dashboard/.devcontainer.json b/features/src/llm-context/templates/streamlit-dashboard/.devcontainer.json new file mode 100644 index 000000000..d3b939da0 --- /dev/null +++ b/features/src/llm-context/templates/streamlit-dashboard/.devcontainer.json @@ -0,0 +1,13 @@ +{ + "name": "Streamlit Dashboard", + "dockerComposeFile": "docker-compose.yaml", + "service": "app", + "shutdownAction": "none", + "workspaceFolder": "/app", + "remoteUser": "root", + "customizations": { + "workbench": { + "proxyTargetPort": 8501 + } + } +} diff --git a/features/src/llm-context/templates/streamlit-dashboard/Dockerfile b/features/src/llm-context/templates/streamlit-dashboard/Dockerfile new file mode 100644 index 000000000..d0fbbb7d0 --- /dev/null +++ b/features/src/llm-context/templates/streamlit-dashboard/Dockerfile @@ -0,0 +1,19 @@ +FROM python:3.11-slim + +RUN groupadd -r appuser && useradd -r -g appuser -d /home/appuser -m appuser + +RUN apt-get update && apt-get install -y 
--no-install-recommends \ + curl fuse \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app +COPY app/requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY app/ . +RUN chown -R appuser:appuser /app /home/appuser + +EXPOSE 8501 +USER appuser + +CMD ["streamlit", "run", "main.py", "--server.port=8501", "--server.address=0.0.0.0"] diff --git a/features/src/llm-context/templates/streamlit-dashboard/README.md b/features/src/llm-context/templates/streamlit-dashboard/README.md new file mode 100644 index 000000000..afe1f5e63 --- /dev/null +++ b/features/src/llm-context/templates/streamlit-dashboard/README.md @@ -0,0 +1,43 @@ +# Streamlit Dashboard Template + +An interactive data dashboard template for Verily Workbench with GCS and BigQuery integration. + +## Features + +- **GCS File Browser**: Browse and preview files from workspace buckets +- **BigQuery Explorer**: Run SQL queries and view results +- **Data Visualization**: Create charts from uploaded CSV or query results +- **Workspace Resources**: Auto-discovery of workspace buckets and datasets + +## Tabs + +| Tab | Description | +|-----|-------------| +| GCS Files | Browse bucket contents, preview CSV files | +| BigQuery | Run SQL queries, view results in tables | +| Visualize | Create line, bar, or scatter charts | + +## Customization + +1. Edit `app/main.py` to add new visualizations +2. Update `app/requirements.txt` for additional libraries +3. Add new tabs for custom functionality + +## Local Testing + +```bash +cd app && pip install -r requirements.txt && streamlit run main.py +``` + +## Workspace Resources + +Access workspace resources via environment variables: +- Each `WORKBENCH_`-prefixed variable contains the path of one workspace resource +- Resources are auto-displayed in the sidebar + +## Example Usage + +1. Select a bucket from the sidebar +2. Browse files and preview CSVs +3. Run BigQuery queries in the BigQuery tab +4. 
Visualize data in the Visualize tab diff --git a/features/src/llm-context/templates/streamlit-dashboard/app/main.py b/features/src/llm-context/templates/streamlit-dashboard/app/main.py new file mode 100644 index 000000000..775a26f65 --- /dev/null +++ b/features/src/llm-context/templates/streamlit-dashboard/app/main.py @@ -0,0 +1,183 @@ +#!/usr/bin/env python3 +""" +Streamlit Dashboard Template for Verily Workbench + +Interactive data visualization with workspace resource integration. +""" + +import os +import streamlit as st +import pandas as pd +from google.cloud import storage, bigquery + +# ============================================================================= +# PAGE CONFIG +# ============================================================================= + +st.set_page_config( + page_title="Workbench Dashboard", + page_icon="📊", + layout="wide" +) + +# ============================================================================= +# WORKSPACE HELPERS +# ============================================================================= + +@st.cache_resource +def get_gcs_client(): + return storage.Client() + +@st.cache_resource +def get_bq_client(): + return bigquery.Client() + +def get_workspace_resources(): + """Get all WORKBENCH_ environment variables.""" + return { + k.replace("WORKBENCH_", ""): v + for k, v in os.environ.items() + if k.startswith("WORKBENCH_") + } + +# ============================================================================= +# SIDEBAR: RESOURCE BROWSER +# ============================================================================= + +st.sidebar.title("🗂️ Workspace Resources") + +resources = get_workspace_resources() +if resources: + st.sidebar.markdown("**Available Resources:**") + for name, path in resources.items(): + st.sidebar.code(f"{name}: {path}") +else: + st.sidebar.info("No workspace resources found") + +# ============================================================================= +# MAIN CONTENT +# 
============================================================================= + +st.title("📊 Data Dashboard") +st.markdown("Interactive data exploration for your Workbench workspace") + +# Tabs for different data sources +tab1, tab2, tab3 = st.tabs(["📁 GCS Files", "📊 BigQuery", "📈 Visualize"]) + +# ----------------------------------------------------------------------------- +# TAB 1: GCS FILE BROWSER +# ----------------------------------------------------------------------------- + +with tab1: + st.header("Cloud Storage Browser") + + # Get buckets from workspace resources + buckets = [v for k, v in resources.items() if v.startswith("gs://")] + + if buckets: + selected_bucket = st.selectbox("Select Bucket", buckets) + + if selected_bucket: + bucket_name = selected_bucket.replace("gs://", "") + + try: + client = get_gcs_client() + bucket = client.bucket(bucket_name) + blobs = list(bucket.list_blobs(max_results=100)) + + if blobs: + files_df = pd.DataFrame([ + {"Name": b.name, "Size (KB)": b.size / 1024, "Updated": b.updated} + for b in blobs + ]) + st.dataframe(files_df, use_container_width=True) + + # File preview + csv_files = [b.name for b in blobs if b.name.endswith('.csv')] + if csv_files: + selected_file = st.selectbox("Preview CSV", csv_files) + if st.button("Load File"): + blob = bucket.blob(selected_file) + data = blob.download_as_text() + df = pd.read_csv(pd.io.common.StringIO(data)) + st.dataframe(df.head(100)) + else: + st.info("Bucket is empty") + except Exception as e: + st.error(f"Error accessing bucket: {e}") + else: + st.info("No GCS buckets found in workspace resources") + +# ----------------------------------------------------------------------------- +# TAB 2: BIGQUERY EXPLORER +# ----------------------------------------------------------------------------- + +with tab2: + st.header("BigQuery Explorer") + + query = st.text_area( + "Enter SQL Query", + value="SELECT * FROM `your-project.your-dataset.your-table` LIMIT 100", + height=150 + ) + + if 
st.button("Run Query"): + try: + client = get_bq_client() + with st.spinner("Running query..."): + df = client.query(query).to_dataframe() + + st.success(f"Query returned {len(df)} rows") + st.dataframe(df, use_container_width=True) + + # Store in session state for visualization + st.session_state["query_result"] = df + except Exception as e: + st.error(f"Query error: {e}") + +# ----------------------------------------------------------------------------- +# TAB 3: VISUALIZATION +# ----------------------------------------------------------------------------- + +with tab3: + st.header("Data Visualization") + + # File uploader for local CSV + uploaded_file = st.file_uploader("Upload CSV", type=["csv"]) + + if uploaded_file: + df = pd.read_csv(uploaded_file) + st.session_state["viz_data"] = df + + # Use query results or uploaded data + if "viz_data" in st.session_state: + df = st.session_state["viz_data"] + elif "query_result" in st.session_state: + df = st.session_state["query_result"] + else: + st.info("Upload a CSV or run a BigQuery query to visualize data") + st.stop() + + # Column selection + col1, col2 = st.columns(2) + with col1: + x_col = st.selectbox("X Axis", df.columns) + with col2: + y_col = st.selectbox("Y Axis", [c for c in df.columns if c != x_col]) + + chart_type = st.radio("Chart Type", ["Line", "Bar", "Scatter"], horizontal=True) + + # Create chart + if chart_type == "Line": + st.line_chart(df.set_index(x_col)[y_col]) + elif chart_type == "Bar": + st.bar_chart(df.set_index(x_col)[y_col]) + else: + st.scatter_chart(df, x=x_col, y=y_col) + +# ============================================================================= +# FOOTER +# ============================================================================= + +st.markdown("---") +st.caption("Powered by Streamlit | Verily Workbench") diff --git a/features/src/llm-context/templates/streamlit-dashboard/app/requirements.txt b/features/src/llm-context/templates/streamlit-dashboard/app/requirements.txt new 
file mode 100644 index 000000000..cf28aae9a --- /dev/null +++ b/features/src/llm-context/templates/streamlit-dashboard/app/requirements.txt @@ -0,0 +1,7 @@ +streamlit==1.29.0 +google-cloud-storage==2.14.0 +google-cloud-bigquery==3.14.0 +pandas==2.1.4 +plotly==5.18.0 +altair==5.2.0 +pyarrow==14.0.2 diff --git a/features/src/llm-context/templates/streamlit-dashboard/devcontainer-template.json b/features/src/llm-context/templates/streamlit-dashboard/devcontainer-template.json new file mode 100644 index 000000000..6333709c4 --- /dev/null +++ b/features/src/llm-context/templates/streamlit-dashboard/devcontainer-template.json @@ -0,0 +1,23 @@ +{ + "id": "streamlit-dashboard", + "version": "1.0.0", + "name": "Streamlit Dashboard", + "description": "Interactive data dashboard with Streamlit for visualization and exploration", + "documentationURL": "https://github.com/aculotti-verily/wb-app-mcp-and-context/tree/master/src/templates/streamlit-dashboard", + "licenseURL": "https://github.com/aculotti-verily/wb-app-mcp-and-context/blob/master/LICENSE", + "options": { + "cloud": { + "type": "string", + "description": "VM cloud environment", + "proposals": ["gcp", "aws"], + "default": "gcp" + }, + "login": { + "type": "string", + "description": "Whether to log in to workbench CLI", + "proposals": ["true", "false"], + "default": "false" + } + }, + "platforms": ["Any"] +} diff --git a/features/src/llm-context/templates/streamlit-dashboard/docker-compose.yaml b/features/src/llm-context/templates/streamlit-dashboard/docker-compose.yaml new file mode 100644 index 000000000..3aa2a9f61 --- /dev/null +++ b/features/src/llm-context/templates/streamlit-dashboard/docker-compose.yaml @@ -0,0 +1,21 @@ +services: + app: + container_name: "application-server" + build: + context: . 
+ dockerfile: Dockerfile + restart: always + volumes: + - .:/app:cached + ports: + - "8501:8501" + environment: + - STREAMLIT_SERVER_PORT=8501 + - STREAMLIT_SERVER_ADDRESS=0.0.0.0 + - STREAMLIT_SERVER_HEADLESS=true + networks: + - app-network + +networks: + app-network: + external: true diff --git a/features/src/llm-context/templates/streamlit-dashboard/manifest.yaml b/features/src/llm-context/templates/streamlit-dashboard/manifest.yaml new file mode 100644 index 000000000..cfbc11f86 --- /dev/null +++ b/features/src/llm-context/templates/streamlit-dashboard/manifest.yaml @@ -0,0 +1,39 @@ +id: streamlit-dashboard +name: Streamlit Dashboard +description: Interactive data dashboard with Streamlit +version: 1.0.0 + +capabilities: + - data-visualization + - interactive-ui + - file-upload + - bigquery-access + - gcs-access + - charts-and-graphs + +inputs: + - name: app_name + type: string + required: true + default: "my-dashboard" + + - name: dashboard_title + type: string + required: false + default: "Data Dashboard" + + - name: gcs_buckets + type: list[resource] + resource_type: GCS_BUCKET + required: false + description: GCS buckets to access + + - name: bq_datasets + type: list[resource] + resource_type: BQ_DATASET + required: false + description: BigQuery datasets to access + +complexity: simple +estimated_build_time: 3min +port: 8501 diff --git a/features/src/llm-context/templates/streamlit-dashboard/startupscript/post-startup.sh b/features/src/llm-context/templates/streamlit-dashboard/startupscript/post-startup.sh new file mode 100755 index 000000000..7376dbedd --- /dev/null +++ b/features/src/llm-context/templates/streamlit-dashboard/startupscript/post-startup.sh @@ -0,0 +1,250 @@ +#!/bin/bash + +set -o errexit +set -o nounset +set -o pipefail +set -o xtrace + +echo "=== POST-STARTUP.SH STARTING ===" +echo "Arguments: $*" + +if [[ $# -ne 4 ]]; then + echo "Usage: $0 user workDirectory " + exit 1 +fi + +readonly USER_NAME="${1}" +export USER_NAME +readonly 
WORK_DIRECTORY="${2}" +export WORK_DIRECTORY +readonly CLOUD="${3}" +export CLOUD +readonly LOG_IN="${4}" +export LOG_IN + +echo "=== VARIABLES SET: USER=${USER_NAME}, WORK_DIR=${WORK_DIRECTORY}, CLOUD=${CLOUD}, LOGIN=${LOG_IN} ===" + +# Gets absolute path of the script directory. +# Because the script sometimes cd to other directoy (e.g. /tmp), +# absolute path is more reliable. +SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" +readonly SCRIPT_DIR +export SCRIPT_DIR +readonly CLOUD_SCRIPT_DIR="${SCRIPT_DIR}/${CLOUD}" +export CLOUD_SCRIPT_DIR +####################################### +# Emit a message with a timestamp +####################################### +source "${SCRIPT_DIR}/emit.sh" + +source "${CLOUD_SCRIPT_DIR}/vm-metadata.sh" + +readonly RUN_AS_LOGIN_USER="sudo -u ${USER_NAME} bash -l -c" +export RUN_AS_LOGIN_USER + +# Startup script status is propagated out to VM guest attributes +readonly STATUS_ATTRIBUTE="startup_script/status" +export STATUS_ATTRIBUTE +readonly MESSAGE_ATTRIBUTE="startup_script/message" +export MESSAGE_ATTRIBUTE + +USER_PRIMARY_GROUP="$(id --group --name "${USER_NAME}")" +readonly USER_PRIMARY_GROUP +export USER_PRIMARY_GROUP +readonly USER_BASH_COMPLETION_DIR="${WORK_DIRECTORY}/.bash_completion.d" +export USER_BASH_COMPLETION_DIR +readonly USER_HOME_LOCAL_SHARE="${WORK_DIRECTORY}/.local/share" +export USER_HOME_LOCAL_SHARE +readonly USER_WORKBENCH_CONFIG_DIR="${WORK_DIRECTORY}/.workbench" +export USER_WORKBENCH_CONFIG_DIR +readonly USER_WORKBENCH_LEGACY_CONFIG_DIR="${WORK_DIRECTORY}/.terra" +export USER_WORKBENCH_LEGACY_CONFIG_DIR +readonly USER_BASHRC="${WORK_DIRECTORY}/.bashrc" +export USER_BASHRC +readonly USER_BASHENV="${WORK_DIRECTORY}/.bash_env" +export USER_BASHENV +readonly USER_BASH_PROFILE="${WORK_DIRECTORY}/.bash_profile" +export USER_BASH_PROFILE +readonly POST_STARTUP_OUTPUT_FILE="${USER_WORKBENCH_CONFIG_DIR}/post-startup-output.txt" +export POST_STARTUP_OUTPUT_FILE + +# Variables for Workbench-specific code installed on 
the VM +readonly WORKBENCH_INSTALL_PATH="/usr/bin/wb" +export WORKBENCH_INSTALL_PATH +readonly WORKBENCH_LEGACY_PATH="/usr/bin/terra" +export WORKBENCH_LEGACY_PATH + +# Move to the /tmp directory to let any artifacts left behind by this script can be removed. +cd /tmp || exit + +# Send stdout and stderr from this script to a file for debugging. +# Make the .workbench directory as the user so that they own it and have correct linux permissions. +${RUN_AS_LOGIN_USER} "mkdir -p '${USER_WORKBENCH_CONFIG_DIR}'" +${RUN_AS_LOGIN_USER} "ln -sf '${USER_WORKBENCH_CONFIG_DIR}' '${USER_WORKBENCH_LEGACY_CONFIG_DIR}'" +exec > >(tee -a "${POST_STARTUP_OUTPUT_FILE}") # Append output to the file and print to terminal +exec 2> >(tee -a "${POST_STARTUP_OUTPUT_FILE}" >&2) # Append errors to the file and print to terminal + +# The apt package index may not be clean when we run; resynchronize +echo "=== INSTALLING PACKAGES ===" +if type apk > /dev/null 2>&1; then + echo "=== USING APK PACKAGE MANAGER ===" + apk update + apk add --no-cache jq curl fuse tar wget +elif type apt-get > /dev/null 2>&1; then + echo "=== USING APT PACKAGE MANAGER ===" + apt-get update + apt install -y jq curl fuse tar wget +else + >&2 echo "ERROR: Unable to find a supported package manager" + exit 1 +fi +echo "=== PACKAGES INSTALLED SUCCESSFULLY ===" + + +# Create the target directories for installing into the HOME directory +${RUN_AS_LOGIN_USER} "mkdir -p '${USER_BASH_COMPLETION_DIR}'" +${RUN_AS_LOGIN_USER} "mkdir -p '${USER_HOME_LOCAL_SHARE}'" + +####################################### +# Set guest attributes on GCE. Used here to log completion status of the script. 
+# See https://cloud.google.com/compute/docs/metadata/manage-guest-attributes +# Arguments: +# $1: The guest attribute domain and key IE startup_script/status +# $2 The data to write to the guest attribute +####################################### +# If the script exits without error let the UI know it completed successfully +# Otherwise if an error occurred write the line and command that failed to guest attributes. +function exit_handler { + local exit_code="${1}" + local line_no="${2}" + local command="${3}" + # Success! Set the guest attributes and exit cleanly + if [[ "${exit_code}" -eq 0 ]]; then + exit 0 + fi + # Write error status and message to guest attributes + set_metadata "${STATUS_ATTRIBUTE}" "ERROR" + set_metadata "${MESSAGE_ATTRIBUTE}" "There was an error in the VM Startup Script on line ${line_no}, command \"${command}\". Please try recreating the VM. See ${POST_STARTUP_OUTPUT_FILE} for more information." + exit "${exit_code}" +} +readonly -f exit_handler +trap 'exit_handler $? $LINENO $BASH_COMMAND' EXIT + +####################################### +# function to retry command +####################################### +function retry() { + local -r max_attempts="$1" + shift + local -r command=("$@") + + local attempt + for ((attempt = 1; attempt < max_attempts; attempt++)); do + # Run the command and return if success + if "${command[@]}"; then + return + fi + + # Sleep a bit in case the problem is a transient network/server issue + if ((attempt < max_attempts)); then + echo "Retrying ${command[*]} in 5 seconds" # send to get_message + sleep 5 + fi + done + + # Execute without the if/then protection such that the exit code propagates + "${command[@]}" +} +readonly -f retry + +# Custom application behavior when opening a terminal window will vary. +# +# Some application that run in custom environments will by default run +# an interactive non-login shell, which sources the ~/.bashrc. 
+# +# Others will open a login shell, which sources the ~/.bash_profile. +# +# For consistency across as many of these environments as possible, this startup +# script writes to ~/.bashrc, and has the ~/.bash_profile source the ~/.bashrc + +# Append (>>) the stanza below to ${USER_BASH_PROFILE}; existing content is kept. +cat << EOF >> "${USER_BASH_PROFILE}" + +if [[ -e ~/.bashrc ]]; then + source ~/.bashrc +fi + +EOF +chown "${USER_NAME}:${USER_PRIMARY_GROUP}" "${USER_BASH_PROFILE}" + +# Indicate the start of Workbench customizations of the ~/.bashrc +# The heredoc is unquoted, so \${PATH} is escaped to be written literally and expanded when the .bashrc is sourced. +cat << EOF >> "${USER_BASHRC}" +### BEGIN: Workbench-specific customizations ### + +# Prepend "/usr/bin" (if not already in the path) +if [[ "\${PATH}:" != "/usr/bin:"* ]]; then + export PATH=/usr/bin:\${PATH} +fi + +if [[ -e ~/.bash_env ]]; then + source ~/.bash_env +fi + +EOF + +################################################## +# Set up java which is required for workbench CLI +################################################## +source "${SCRIPT_DIR}/install-java.sh" + +################################### +# Install workbench CLI +################################### +retry 5 "${SCRIPT_DIR}/install-cli.sh" + +################################################## +# Set up user bashrc with workbench customization +################################################## +source "${SCRIPT_DIR}/setup-bashrc.sh" + +################# +# bash completion +################# +source "${SCRIPT_DIR}/bash-completion.sh" + +############### +# git setup +############### +# Only attempted when the login argument is "true". +if [[ "${LOG_IN}" == "true" ]]; then + retry 5 "${SCRIPT_DIR}/git-setup.sh" +fi + +############################# +# Mount buckets +############################# + +# Uncomment user_allow_other in the fuse.conf to enable non-root users to mount files with the -o allow_other option. 
+sed -i '/user_allow_other/s/^#//g' /etc/fuse.conf + +source "${CLOUD_SCRIPT_DIR}/resource-mount.sh" + +############################### +# cloud platform specific setup +############################### +# The hook is optional: it is sourced only when the cloud-specific directory provides one. +if [[ -f "${CLOUD_SCRIPT_DIR}/post-startup-hook.sh" ]]; then + source "${CLOUD_SCRIPT_DIR}/post-startup-hook.sh" +fi + +############################### +# LLM Context Generation +############################### +# Generate context file for LLMs (Claude Code, Gemini, etc.) +# This runs AFTER auth and resource mounting are complete +# Skipped when /opt/llm-context/generate-context.sh is absent. +if [[ -f "/opt/llm-context/generate-context.sh" ]]; then + echo "=== GENERATING LLM CONTEXT ===" + # Run as the login user so files are owned correctly + ${RUN_AS_LOGIN_USER} "/opt/llm-context/generate-context.sh '${WORK_DIRECTORY}'" || { + echo "Warning: LLM context generation failed (non-fatal)" + true # Don't fail the script if context generation fails + } + echo "=== LLM CONTEXT GENERATION COMPLETE ===" +fi diff --git a/features/src/llm-context/templates/streamlit-dashboard/startupscript/remount-on-restart.sh b/features/src/llm-context/templates/streamlit-dashboard/startupscript/remount-on-restart.sh new file mode 100755 index 000000000..4d5cb8676 --- /dev/null +++ b/features/src/llm-context/templates/streamlit-dashboard/startupscript/remount-on-restart.sh @@ -0,0 +1,62 @@ +#!/bin/bash + +# remount-on-restart.sh +# +# Remounts buckets for the logged in user when a devcontainer instance is restarted. 
+
+# Arguments:
+#   $1: user          - login user whose buckets are remounted
+#   $2: workDirectory - that user's home/work directory
+#   $3: cloud         - "gcp" or "aws"; selects the cloud-specific script dir
+#   $4: login         - "true" to log the CLI in first when no user is logged in
+
+set -o errexit
+set -o nounset
+set -o pipefail
+set -o xtrace
+
+# All four positional arguments are required; the usage text names each one.
+if [[ $# -ne 4 ]]; then
+  echo "Usage: $0 user workDirectory <gcp/aws> <true/false>"
+  exit 1
+fi
+
+# Honour a caller-provided path, falling back to the default install location.
+readonly WORKBENCH_INSTALL_PATH="${WORKBENCH_INSTALL_PATH:-/usr/bin/wb}"
+
+readonly USER_NAME="${1}"
+readonly WORK_DIRECTORY="${2}"
+readonly CLOUD="${3}"
+# shellcheck disable=SC2034
+readonly LOG_IN="${4}"
+
+##############################################
+# Get absolute paths of the script directories
+##############################################
+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
+readonly SCRIPT_DIR
+readonly CLOUD_SCRIPT_DIR="${SCRIPT_DIR}/${CLOUD}"
+
+######################################################
+# Change to /tmp to avoid leaving junk on file system.
+######################################################
+cd /tmp
+
+##################################################################
+# Send stdout and stderr from this script to a file for debugging.
+##################################################################
+readonly USER_WORKBENCH_CONFIG_DIR="${WORK_DIRECTORY}/.workbench"
+readonly POST_STARTUP_OUTPUT_FILE="${USER_WORKBENCH_CONFIG_DIR}/remount-on-restart-output.txt"
+exec >> "${POST_STARTUP_OUTPUT_FILE}"
+exec 2>&1
+
+##############################
+# Import utility functions
+##############################
+source "${SCRIPT_DIR}/emit.sh"
+
+#############################
+# CLI login
+#############################
+readonly RUN_AS_LOGIN_USER="sudo -u ${USER_NAME} bash -l -c"
+# The path must be written as ${WORKBENCH_INSTALL_PATH}: with the brace before
+# the dollar sign the shell would run a command literally named "{/usr/bin/wb}".
+# NOTE(review): pipefail is on, so if "auth status" itself exits non-zero when
+# nobody is logged in, this condition is false even when grep matches - confirm
+# the CLI's exit code for that case.
+if [[ "${LOG_IN}" == "true" ]] && ${RUN_AS_LOGIN_USER} "'${WORKBENCH_INSTALL_PATH}' auth status 2>&1" | grep -q "NO USER LOGGED IN"; then
+  ${RUN_AS_LOGIN_USER} "'${WORKBENCH_INSTALL_PATH}' auth login --mode=APP_DEFAULT_CREDENTIALS"
+fi
+
+#############################
+# Mount buckets
+#############################
+# shellcheck disable=SC2034
+source "${CLOUD_SCRIPT_DIR}/resource-mount.sh"
diff --git a/features/src/wb-mcp-server/README.md b/features/src/wb-mcp-server/README.md new file mode 100644 index
000000000..23c7fb403 --- /dev/null +++ b/features/src/wb-mcp-server/README.md @@ -0,0 +1,170 @@ +# Workbench MCP Server + +MCP server that exposes Workbench APIs for AI agents to discover data, explore schemas, and build cohorts programmatically. + +## Installation + +Add to your `devcontainer.json`: + +```json +{ + "features": { + "ghcr.io/verily-src/workbench-app-devcontainers/wb-mcp-server:latest": {} + } +} +``` + +Rebuild your devcontainer. The server: +- Installs at `/opt/wb-mcp-server/wb-mcp-server` +- **Runs automatically as HTTP daemon** on port 9242 +- **Auto-configures Claude CLI and Gemini CLI** during installation + +## How It Works + +The server runs in **HTTP mode** as a persistent background service: + +- **No lazy loading** - tools are available immediately +- **Port 9242** - uncommon port to avoid conflicts +- **Starts via postStartCommand** after authentication completes +- **Pre-configured** with both Claude Code and Gemini CLI + +### Manual Setup (if needed) + +If auto-configuration failed, manually add the server: + +**Claude CLI:** +```bash +claude mcp add --transport http wb http://127.0.0.1:9242 +``` + +**Gemini CLI:** +```bash +gemini mcp add --scope user --transport http wb http://127.0.0.1:9242 +``` + +### Server Control + +```bash +# Start server +/opt/wb-mcp-server/start-server.sh + +# Stop server +/opt/wb-mcp-server/stop-server.sh + +# Check status +pgrep -f 'wb-mcp-server -http' +``` + +## Quick Examples + +### Find Available Data + +``` +"List all data collections I can access" +``` + +Uses `workspace_list_data_collections` to find data collection workspaces. + +### Explore Schema + +``` +"What entities are in the AoU_2024 underlay? Show me the person entity attributes" +``` + +Uses `underlay_list_entities` and `underlay_get_entity`. 
+ +### Create Simple Cohort + +``` +"Create a cohort called 'seniors' with patients over 65 from the AoU_2024 data collection (workspace ID: abc-123) in my workspace (xyz-456)" +``` + +Uses `filter_build_attribute` and `cohort_create_in_workspace`. + +### Create Complex Cohort + +``` +"Create a cohort of diabetic seniors: patients over 65 with Type 2 Diabetes (concept 201826) from AoU_2024. Data collection: abc-123, target workspace: xyz-456, name: 'diabetic-seniors'" +``` + +Uses `filter_build_attribute`, `filter_build_relationship`, `filter_build_boolean_logic`, and `cohort_create_in_workspace`. + +## Internals + +### Authentication +- Auto-fetches bearer token from `wb auth print-access-token` +- Refreshes every 55 minutes +- Gets API URLs from `wb status` + +### Data Collections +Data collection workspaces contain underlays (data models): +- Data collection workspace ID = underlay ID +- Property `"terra-type": "data-collection"` +- Property `"terra-dx-underlay-name"` = underlay name (e.g., "AoU_2024") + +### Cohort Creation Flow +1. User has READ access to data collection workspace +2. User has WRITER access to target workspace +3. Server creates: + - Study in Data Explorer (if doesn't exist) + - Cohort in that study + - Controlled resource in workspace + +### Filter Structure +Filters use Data Explorer's filter format: +- **Attribute**: `age > 65`, `gender = 'male'` +- **Relationship**: `persons who have condition = diabetes` +- **Boolean Logic**: Combine with AND/OR/NOT +- **Hierarchy**: All descendants of concept + +Filter builders output correct JSON for you. + +## Troubleshooting + +### "Error: failed to get access token" +```bash +wb auth login +``` + +### "API error (403)" +Check permissions: +```bash +wb workspace describe +``` +Need READER on data collections, WRITER on target workspace. 
+ +### "Error: underlayName parameter is required" +First find underlay names: +``` +"List my data collections and show their underlay names" +``` + +### Server not responding + +Check if the server is running: +```bash +pgrep -f 'wb-mcp-server -http' +``` + +If not running, start it: +```bash +/opt/wb-mcp-server/start-server.sh +``` + +Test the HTTP endpoint: +```bash +curl -X POST http://127.0.0.1:9242 \ + -H "Content-Type: application/json" \ + -d '{"jsonrpc":"2.0","id":1,"method":"tools/list"}' +``` + +Check logs: +```bash +tail -f /tmp/wb-mcp-server.log +``` + +## Requirements + +- Workbench CLI (`wb`) installed +- Authenticated (`wb auth login`) +- Access to data collections and workspaces diff --git a/features/src/wb-mcp-server/devcontainer-feature.json b/features/src/wb-mcp-server/devcontainer-feature.json new file mode 100644 index 000000000..ba210a8a5 --- /dev/null +++ b/features/src/wb-mcp-server/devcontainer-feature.json @@ -0,0 +1,30 @@ +{ + "id": "wb-mcp-server", + "version": "2.0.0", + "name": "Workbench MCP Server", + "description": "Installs an MCP (Model Context Protocol) HTTP server that wraps the wb CLI, enabling AI assistants to interact with Workbench. Runs as a persistent daemon on port 9242, eliminating lazy-loading delays. Auto-configures Claude CLI and Gemini CLI.", + "options": { + "username": { + "type": "string", + "default": "root", + "description": "Username of the container user." + }, + "userHomeDir": { + "type": "string", + "default": "/root", + "description": "Home directory of the container user." 
+ }, + "port": { + "type": "string", + "default": "9242", + "description": "Port for the HTTP MCP server" + } + }, + "installsAfter": [ + "ghcr.io/devcontainers/features/common-utils", + "ghcr.io/devcontainers/features/go", + "ghcr.io/anthropics/devcontainer-features/claude-code", + "./.devcontainer/features/gemini-cli", + "./.devcontainer/features/workbench-tools" + ] +} diff --git a/features/src/wb-mcp-server/go.mod b/features/src/wb-mcp-server/go.mod new file mode 100644 index 000000000..c1fe167a8 --- /dev/null +++ b/features/src/wb-mcp-server/go.mod @@ -0,0 +1,3 @@ +module github.com/verily-src/wb-mcp-server + +go 1.25 diff --git a/features/src/wb-mcp-server/install.sh b/features/src/wb-mcp-server/install.sh new file mode 100755 index 000000000..9e650216f --- /dev/null +++ b/features/src/wb-mcp-server/install.sh @@ -0,0 +1,244 @@ +#!/usr/bin/env bash + +# install.sh installs the Workbench MCP server in the devcontainer. + +set -o errexit +set -o nounset +set -o pipefail +set -o xtrace + +readonly USERNAME="${USERNAME:-"root"}" +USER_HOME_DIR="${USERHOMEDIR:-"/home/${USERNAME}"}" +if [[ "${USER_HOME_DIR}" == "/home/root" ]]; then + USER_HOME_DIR="/root" +fi +readonly USER_HOME_DIR + +readonly WB_MCP_PORT="${PORT:-"9242"}" + +export DEBIAN_FRONTEND=noninteractive +export TZ=Etc/UTC + +WORKDIR="$(mktemp -d)" +readonly WORKDIR + +readonly WB_MCP_DIR="/opt/wb-mcp-server" +readonly WB_MCP_BIN="${WB_MCP_DIR}/wb-mcp-server" + +function cleanup() { + rm -rf "${WORKDIR:?}" + rm -rf /var/lib/apt/lists/* +} + +trap 'cleanup' EXIT + +function apt_get_update() { + if [ "$(find /var/lib/apt/lists/* | wc -l)" = "0" ]; then + echo "Running apt-get update..." + apt-get update -y + fi +} + +# Checks if packages are installed and installs them if not +function check_packages() { + if ! dpkg -s "$@" > /dev/null 2>&1; then + apt_get_update + apt-get -y install --no-install-recommends "$@" + fi +} + +echo "Starting wb-mcp-server installation..." 
+ +# Save the directory where the feature files are located +FEATURE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +readonly FEATURE_DIR + +if ! type apt-get &>/dev/null; then + echo "Error: unable to find a supported package manager." + exit 1 +fi + +# Install required packages +check_packages \ + ca-certificates \ + curl \ + git + +# Check if Go is installed +if ! command -v go &> /dev/null; then + echo "Go is not installed. Installing Go 1.25..." + GOLANG_VERSION="1.25.0" + case "$(uname -m)" in + x86_64) GOLANG_ARCH="amd64" ;; + aarch64) GOLANG_ARCH="arm64" ;; + armv7l) GOLANG_ARCH="armv6l" ;; + *) echo "Unsupported architecture: $(uname -m)"; exit 1 ;; + esac + + cd "${WORKDIR}" + curl -fsSL "https://go.dev/dl/go${GOLANG_VERSION}.linux-${GOLANG_ARCH}.tar.gz" -o go.tar.gz + tar -C /usr/local -xzf go.tar.gz + export PATH="/usr/local/go/bin:${PATH}" +fi + +# Create installation directory +mkdir -p "${WB_MCP_DIR}" + +# Copy source files to temporary build directory +BUILD_DIR="${WORKDIR}/wb-mcp-server" +mkdir -p "${BUILD_DIR}" +cp "${FEATURE_DIR}/main.go" "${BUILD_DIR}/" +cp "${FEATURE_DIR}/go.mod" "${BUILD_DIR}/" + +# Build the Go binary +cd "${BUILD_DIR}" +go build -o "${WB_MCP_BIN}" main.go + +# Make it executable +chmod +x "${WB_MCP_BIN}" + +# Create systemd service file for optional automatic startup +cat > "${WB_MCP_DIR}/wb-mcp-server.service" < "${WB_MCP_DIR}/start-server.sh" < /dev/null; then + echo "wb-mcp-server is already running" + exit 0 +fi + +# Start server as the correct user (who has wb auth tokens) +if [ "\$(id -u)" = "0" ] && [ "\${RUN_USER}" != "root" ]; then + su - "\${RUN_USER}" -c "nohup \${WB_MCP_BIN} -http -port \${PORT} >> \${LOGFILE} 2>&1 &" +else + nohup "\${WB_MCP_BIN}" -http -port "\${PORT}" >> "\${LOGFILE}" 2>&1 & +fi +echo "Started wb-mcp-server on port \${PORT} as \${RUN_USER}" +echo "Logs: \${LOGFILE}" +EOF + +chmod +x "${WB_MCP_DIR}/start-server.sh" + +# Create a stop script +cat > "${WB_MCP_DIR}/stop-server.sh" <<'EOF' 
+#!/bin/bash +# Stop the wb-mcp-server HTTP daemon + +WB_MCP_BIN="/opt/wb-mcp-server/wb-mcp-server" + +if pgrep -f "${WB_MCP_BIN} -http" > /dev/null; then + pkill -f "${WB_MCP_BIN} -http" + echo "Stopped wb-mcp-server" +else + echo "wb-mcp-server is not running" +fi +EOF + +chmod +x "${WB_MCP_DIR}/stop-server.sh" + +# Create MCP configuration file for easy client setup (HTTP mode) +cat > "${WB_MCP_DIR}/mcp-config.json" < "${CLAUDE_SETTINGS}.tmp" \ + && mv "${CLAUDE_SETTINGS}.tmp" "${CLAUDE_SETTINGS}" +else + cat > "${CLAUDE_SETTINGS}" < "${GEMINI_SETTINGS}.tmp" \ + && mv "${GEMINI_SETTINGS}.tmp" "${GEMINI_SETTINGS}" +else + cat > "${GEMINI_SETTINGS}" </dev/null; then + { + echo "" + echo "# Workbench MCP Server - auto-start" + echo "if ! pgrep -f 'wb-mcp-server -http' > /dev/null 2>&1; then" + echo " /opt/wb-mcp-server/start-server.sh > /dev/null 2>&1" + echo "fi" + } >> "${USER_HOME_DIR}/.bashrc" +fi + +chown "${USERNAME}:" "${USER_HOME_DIR}/.bashrc" + +echo "" +echo "wb-mcp-server installed at ${WB_MCP_BIN}" +echo "Port: ${WB_MCP_PORT}" +echo "Auto-starts on shell login" +echo "" + +echo "Done!" 
diff --git a/features/src/wb-mcp-server/main.go b/features/src/wb-mcp-server/main.go new file mode 100644 index 000000000..9a7e740d3 --- /dev/null +++ b/features/src/wb-mcp-server/main.go @@ -0,0 +1,3520 @@ +package main + +import ( + "bufio" + "bytes" + "encoding/json" + "flag" + "fmt" + "io" + "log" + "net/http" + "os" + "os/exec" + "strings" + "time" +) + +// MCP Protocol structures +type JSONRPCRequest struct { + JSONRPC string `json:"jsonrpc"` + ID interface{} `json:"id,omitempty"` + Method string `json:"method"` + Params json.RawMessage `json:"params,omitempty"` +} + +type JSONRPCResponse struct { + JSONRPC string `json:"jsonrpc"` + ID interface{} `json:"id,omitempty"` + Result interface{} `json:"result,omitempty"` + Error *RPCError `json:"error,omitempty"` +} + +type RPCError struct { + Code int `json:"code"` + Message string `json:"message"` +} + +type InitializeParams struct { + ProtocolVersion string `json:"protocolVersion"` + Capabilities map[string]interface{} `json:"capabilities"` + ClientInfo ClientInfo `json:"clientInfo"` +} + +type ClientInfo struct { + Name string `json:"name"` + Version string `json:"version"` +} + +type ServerInfo struct { + Name string `json:"name"` + Version string `json:"version"` +} + +type InitializeResult struct { + ProtocolVersion string `json:"protocolVersion"` + Capabilities map[string]interface{} `json:"capabilities"` + ServerInfo ServerInfo `json:"serverInfo"` +} + +type ListToolsResult struct { + Tools []Tool `json:"tools"` +} + +type Tool struct { + Name string `json:"name"` + Description string `json:"description"` + InputSchema InputSchema `json:"inputSchema"` +} + +type InputSchema struct { + Type string `json:"type"` + Properties map[string]interface{} `json:"properties"` + Required []string `json:"required,omitempty"` +} + +type CallToolParams struct { + Name string `json:"name"` + Arguments map[string]interface{} `json:"arguments,omitempty"` +} + +type CallToolResult struct { + Content []ContentItem 
`json:"content"` + IsError bool `json:"isError,omitempty"` +} + +type ContentItem struct { + Type string `json:"type"` + Text string `json:"text"` +} + +// Global variables +var ( + workspaceBaseURL string + dataExplorerURL string + cachedWorkspaceUUID string // populated once at startup from wb status + httpClient = &http.Client{Timeout: 60 * time.Second} +) + +// Tool definitions +var wbTools = []Tool{ + { + Name: "wb_status", + Description: "Get workspace and server status using wb CLI", + InputSchema: InputSchema{Type: "object", Properties: map[string]interface{}{}}, + }, + { + Name: "wb_workspace_list", + Description: "List all workspaces using wb CLI", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "format": map[string]interface{}{ + "type": "string", + "enum": []string{"json", "text"}, + }, + }, + }, + }, + { + Name: "wb_execute", + Description: "Execute any wb command (without 'wb' prefix)", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "command": map[string]interface{}{"type": "string"}, + }, + Required: []string{"command"}, + }, + }, + + { + Name: "workspace_create", + Description: "Create a new workspace. Use this when user wants to create a new workspace for their research or project. Creates both the workspace metadata and backing cloud resources (e.g., Google Cloud project). 
Returns the new workspace ID.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "id": map[string]interface{}{"type": "string", "description": "User-facing workspace ID (must be unique)"}, + "podId": map[string]interface{}{"type": "string", "description": "Pod ID (required) - get from pod_list"}, + "name": map[string]interface{}{"type": "string", "description": "Display name for the workspace"}, + "description": map[string]interface{}{"type": "string", "description": "Workspace description"}, + "organizationId": map[string]interface{}{"type": "string", "description": "Organization ID (optional)"}, + }, + Required: []string{"id", "podId"}, + }, + }, + { + Name: "workspace_delete", + Description: "Delete a workspace. Use this when user wants to permanently remove a workspace. WARNING: This deletes all resources in the workspace. Requires OWNER role. User should confirm before executing.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "workspaceId": map[string]interface{}{"type": "string", "description": "User-facing workspace ID to delete"}, + }, + Required: []string{"workspaceId"}, + }, + }, + { + Name: "workspace_update", + Description: "Update workspace metadata (name, description). Use this when user wants to change workspace display name or description without modifying resources.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "workspaceId": map[string]interface{}{"type": "string", "description": "User-facing workspace ID"}, + "name": map[string]interface{}{"type": "string", "description": "New display name"}, + "description": map[string]interface{}{"type": "string", "description": "New description"}, + }, + Required: []string{"workspaceId"}, + }, + }, + { + Name: "workspace_duplicate", + Description: "Duplicate an existing workspace. 
Use this when user wants to copy a workspace structure (including resources and folder organization) to a new workspace. Useful for creating similar workspaces or templates.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "sourceWorkspaceId": map[string]interface{}{"type": "string", "description": "Workspace ID to duplicate from"}, + "destWorkspaceId": map[string]interface{}{"type": "string", "description": "New workspace ID"}, + "name": map[string]interface{}{"type": "string", "description": "Name for new workspace"}, + }, + Required: []string{"sourceWorkspaceId", "destWorkspaceId"}, + }, + }, + { + Name: "workspace_set_property", + Description: "Set custom properties on a workspace. Use this for adding metadata tags or configuration values. Properties are key-value pairs used for organization, categorization, or workspace configuration.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "workspaceId": map[string]interface{}{"type": "string", "description": "User-facing workspace ID"}, + "key": map[string]interface{}{"type": "string", "description": "Property key"}, + "value": map[string]interface{}{"type": "string", "description": "Property value"}, + }, + Required: []string{"workspaceId", "key", "value"}, + }, + }, + { + Name: "workspace_delete_property", + Description: "Delete a custom property from a workspace. Use this to remove previously set metadata or configuration.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "workspaceId": map[string]interface{}{"type": "string", "description": "User-facing workspace ID"}, + "key": map[string]interface{}{"type": "string", "description": "Property key to delete"}, + }, + Required: []string{"workspaceId", "key"}, + }, + }, + { + Name: "workspace_add_user", + Description: "Grant a user access to a workspace. Use this when sharing a workspace with collaborators. 
Specify role (READER, WRITER, or OWNER) to control access level. READER can view, WRITER can modify, OWNER can manage users and delete.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "workspaceId": map[string]interface{}{"type": "string", "description": "User-facing workspace ID"}, + "email": map[string]interface{}{"type": "string", "description": "User email address"}, + "role": map[string]interface{}{"type": "string", "enum": []string{"READER", "WRITER", "OWNER"}, "description": "Access role"}, + }, + Required: []string{"workspaceId", "email", "role"}, + }, + }, + { + Name: "workspace_remove_user", + Description: "Revoke a user's access to a workspace. Use this to remove collaborators or revoke access. Requires OWNER role.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "workspaceId": map[string]interface{}{"type": "string", "description": "User-facing workspace ID"}, + "email": map[string]interface{}{"type": "string", "description": "User email to remove"}, + }, + Required: []string{"workspaceId", "email"}, + }, + }, + { + Name: "workspace_list_users", + Description: "List all users with access to a workspace and their roles. Use this to see who has access and what level of permissions they have.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "workspaceId": map[string]interface{}{"type": "string", "description": "User-facing workspace ID"}, + }, + Required: []string{"workspaceId"}, + }, + }, + + { + Name: "resource_create_bucket", + Description: "Create a cloud storage bucket in the workspace. Use this when user needs file storage for data, results, or shared files. 
Creates a managed bucket that workspace users can access based on their roles.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "resourceId": map[string]interface{}{"type": "string", "description": "Resource ID (used to reference in workspace)"}, + "bucketName": map[string]interface{}{"type": "string", "description": "Cloud bucket name (globally unique)"}, + "description": map[string]interface{}{"type": "string", "description": "Resource description"}, + }, + Required: []string{"resourceId", "bucketName"}, + }, + }, + { + Name: "resource_create_bq_dataset", + Description: "Create a BigQuery dataset in the workspace. Use this when user needs a database for structured data analysis, SQL queries, or data warehousing.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "resourceId": map[string]interface{}{"type": "string", "description": "Resource ID"}, + "datasetId": map[string]interface{}{"type": "string", "description": "BigQuery dataset ID"}, + "description": map[string]interface{}{"type": "string", "description": "Resource description"}, + }, + Required: []string{"resourceId", "datasetId"}, + }, + }, + { + Name: "resource_delete", + Description: "Delete a resource from the workspace. Use this to remove buckets, datasets, or other resources. For controlled resources, this deletes the actual cloud resource. For references, only removes the reference.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "resourceId": map[string]interface{}{"type": "string", "description": "Resource ID to delete"}, + }, + Required: []string{"resourceId"}, + }, + }, + { + Name: "resource_update", + Description: "Update resource metadata (name, description). 
Use this to change how a resource is displayed or documented without modifying the underlying cloud resource.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "resourceId": map[string]interface{}{"type": "string", "description": "Resource ID"}, + "name": map[string]interface{}{"type": "string", "description": "New display name"}, + "description": map[string]interface{}{"type": "string", "description": "New description"}, + }, + Required: []string{"resourceId"}, + }, + }, + { + Name: "resource_add_reference", + Description: "Add a reference to an external cloud resource. Use this when user wants to reference data/resources from outside the workspace (e.g., a bucket in another project, a shared dataset). Creates a pointer without managing the resource.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "resourceId": map[string]interface{}{"type": "string", "description": "Resource ID for the reference"}, + "resourceType": map[string]interface{}{"type": "string", "enum": []string{"gcs-bucket", "bq-dataset", "bq-table"}, "description": "Type of resource"}, + "path": map[string]interface{}{"type": "string", "description": "Cloud path (e.g., gs://bucket-name)"}, + "description": map[string]interface{}{"type": "string", "description": "Reference description"}, + }, + Required: []string{"resourceId", "resourceType", "path"}, + }, + }, + { + Name: "resource_check_access", + Description: "Check if current user has access to a resource. Use this to verify permissions before attempting operations. Useful for debugging access issues or validating setup.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "resourceId": map[string]interface{}{"type": "string", "description": "Resource ID to check"}, + }, + Required: []string{"resourceId"}, + }, + }, + { + Name: "resource_move", + Description: "Move a resource to a different folder. 
Use this for organizing resources into logical groups within a workspace.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "resourceId": map[string]interface{}{"type": "string", "description": "Resource ID to move"}, + "folderId": map[string]interface{}{"type": "string", "description": "Destination folder ID"}, + }, + Required: []string{"resourceId", "folderId"}, + }, + }, + + { + Name: "folder_create", + Description: "Create a folder in the workspace. Use this to organize resources into logical groups (e.g., 'data', 'results', 'notebooks'). Folders help maintain clean workspace organization.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "folderId": map[string]interface{}{"type": "string", "description": "Folder ID (must be unique in workspace)"}, + "displayName": map[string]interface{}{"type": "string", "description": "Display name for folder"}, + "description": map[string]interface{}{"type": "string", "description": "Folder description"}, + "parentId": map[string]interface{}{"type": "string", "description": "Parent folder ID (for nested folders)"}, + }, + Required: []string{"folderId", "displayName"}, + }, + }, + { + Name: "folder_delete", + Description: "Delete a folder. Use this to remove folders no longer needed. NOTE: Folder must be empty (move or delete resources first).", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "folderId": map[string]interface{}{"type": "string", "description": "Folder ID to delete"}, + }, + Required: []string{"folderId"}, + }, + }, + { + Name: "folder_update", + Description: "Update folder metadata (name, description). 
Use this to rename folders or update descriptions for better organization.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "folderId": map[string]interface{}{"type": "string", "description": "Folder ID"}, + "displayName": map[string]interface{}{"type": "string", "description": "New display name"}, + "description": map[string]interface{}{"type": "string", "description": "New description"}, + }, + Required: []string{"folderId"}, + }, + }, + { + Name: "folder_list_tree", + Description: "Show folder hierarchy as a tree. Use this to visualize workspace organization and understand the folder structure.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{}, + }, + }, + { + Name: "workspace_list_data_collections", + Description: `List all data collections in the current workspace and their associated resources. + +Use this when a user asks: +- "What data collections exist in my workspace?" +- "Show me resources grouped by data collection" +- "Which resources came from which data collections?" + +This tool automatically: +1. Gets all resources and identifies their sourceWorkspaceId (where they were cloned from) +2. Looks up each source workspace to get the actual data collection name +3. Groups resources by their source data collection +4. Shows resources created directly in this workspace (no source) + +Returns a structured list of data collections with their resources, types, and cloud paths.`, + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{}, + }, + }, + + { + Name: "platform_list_data_collections", + Description: `Search and list all data collections accessible to the current user across all of Workbench — not just those attached to the active workspace. + +Use this when a user asks: +- "What data collections do I have access to?" 
+- "Find data collections related to " +- "Search for datasets across all of Workbench" +- "What datasets could I add to my workspace?" +- "Show me all accessible genomics / proteomics / clinical / imaging data" +- "Are there any data collections I haven't attached yet?" +- "Find me datasets about " + +This tool searches PLATFORM-WIDE. It returns all data collections the user has READ access to, +regardless of whether they are attached to the active workspace. + +Always tell the user upfront that this is a broader platform-wide search (not just their workspace). + +The keyword search matches against: name, description, data modality tags, therapeutic area tags, +and data model type — so queries like "imaging", "genomics", "oncology" will match relevant collections. + +Each result includes rich metadata sourced directly from the data collection: +- shortDescription, description, organization, availability, isFree, isInstantlyAccessible +- patientCount, timeFrame, geographicCoverage, dataModel +- dataModalityTags (e.g. imaging, lab-results, ecrf), therapeuticTags (e.g. oncology, general-health) +- underlayName (the data model identifier for schema exploration) +- dataDictionary (links to schema documentation) +- usageExamples (sample use cases and SQL queries) +- accessGroupName, supportEmail +- dataPublished, metadataLastUpdated, externalDocumentation + +Present results in a human-readable summary grouped by relevance. For each matching collection, +highlight the most relevant fields for the user's query (e.g. 
patient count and modality for +clinical searches, underlay name for schema exploration).`, + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "query": map[string]interface{}{ + "type": "string", + "description": "Optional keyword to filter data collections by name or description (case-insensitive substring match)", + }, + "limit": map[string]interface{}{ + "type": "number", + "description": "Maximum number of results to return (default: 100)", + }, + }, + }, + }, + + { + Name: "group_create", + Description: "Create a user group. Use this when managing multiple users with same access needs. Groups simplify permission management - grant access to group instead of individual users.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "groupId": map[string]interface{}{"type": "string", "description": "Unique group ID"}, + "name": map[string]interface{}{"type": "string", "description": "Group display name"}, + "description": map[string]interface{}{"type": "string", "description": "Group description"}, + }, + Required: []string{"groupId", "name"}, + }, + }, + { + Name: "group_delete", + Description: "Delete a user group. Use this to remove groups no longer needed. Users in the group lose group-based permissions.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "groupId": map[string]interface{}{"type": "string", "description": "Group ID to delete"}, + }, + Required: []string{"groupId"}, + }, + }, + { + Name: "group_list", + Description: "List all groups the current user has a role on. Use this to see available groups for permission management.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{}, + }, + }, + { + Name: "group_describe", + Description: "Get detailed information about a group (members, roles, metadata). 
Use this to see who belongs to a group and their access levels.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "groupId": map[string]interface{}{"type": "string", "description": "Group ID"}, + }, + Required: []string{"groupId"}, + }, + }, + { + Name: "group_add_user", + Description: "Add a user to a group. Use this when adding collaborators to a group for shared access management.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "groupId": map[string]interface{}{"type": "string", "description": "Group ID"}, + "email": map[string]interface{}{"type": "string", "description": "User email to add"}, + "role": map[string]interface{}{"type": "string", "enum": []string{"MEMBER", "ADMIN"}, "description": "Role in group"}, + }, + Required: []string{"groupId", "email", "role"}, + }, + }, + { + Name: "group_remove_user", + Description: "Remove a user from a group. Use this to revoke group membership and associated permissions.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "groupId": map[string]interface{}{"type": "string", "description": "Group ID"}, + "email": map[string]interface{}{"type": "string", "description": "User email to remove"}, + }, + Required: []string{"groupId", "email"}, + }, + }, + + { + Name: "app_create", + Description: "Create a GCP Compute Engine application in the workspace. Use this to launch analysis environments like JupyterLab, RStudio, or VSCode. Applications provide interactive compute environments.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "appId": map[string]interface{}{"type": "string", "description": "Application ID"}, + "appConfig": map[string]interface{}{"type": "string", "description": "App config name. 
Valid values: jupyter-lab, r-analysis, visual-studio-code"}, + "machineType": map[string]interface{}{"type": "string", "description": "Machine type (e.g., 'n1-standard-4')"}, + "description": map[string]interface{}{"type": "string", "description": "Description of the app"}, + "location": map[string]interface{}{"type": "string", "description": "GCP location/zone"}, + }, + Required: []string{"appId", "appConfig"}, + }, + }, + { + Name: "app_delete", + Description: "Delete an application. Use this to remove applications no longer needed. Stops the application and deletes associated resources.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "appId": map[string]interface{}{"type": "string", "description": "Application ID to delete"}, + }, + Required: []string{"appId"}, + }, + }, + { + Name: "app_list", + Description: "List all applications in the workspace. Use this to see available applications, their status, and configuration.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{}, + }, + }, + { + Name: "app_start", + Description: "Start a stopped application. Use this to resume an application that was stopped to save costs. Takes a few minutes to become ready.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "appId": map[string]interface{}{"type": "string", "description": "Application ID to start"}, + }, + Required: []string{"appId"}, + }, + }, + { + Name: "app_stop", + Description: "Stop a running application. Use this to pause an application to save compute costs. Data and state are preserved. Can be restarted later.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "appId": map[string]interface{}{"type": "string", "description": "Application ID to stop"}, + }, + Required: []string{"appId"}, + }, + }, + { + Name: "app_get_url", + Description: "Get the launch URL for an application. 
Use this to get the web address to access a running application (e.g., Jupyter notebook URL).", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "appId": map[string]interface{}{"type": "string", "description": "Application ID"}, + }, + Required: []string{"appId"}, + }, + }, + + { + Name: "auth_status", + Description: "Get current authentication status. Use this to check if user is logged in and see which account is active.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{}, + }, + }, + + { + Name: "server_list", + Description: "List all available servers. Use this to see which server environments are available (dev, staging, production).", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{}, + }, + }, + { + Name: "server_set", + Description: "Set which server to connect to. Use this to switch between different environments (e.g., from production to staging for testing).", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "serverName": map[string]interface{}{"type": "string", "description": "Server name to connect to"}, + }, + Required: []string{"serverName"}, + }, + }, + { + Name: "server_status", + Description: "Get server status and details. Use this to check server health and configuration information.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{}, + }, + }, + { + Name: "server_list_regions", + Description: "List valid cloud regions for a platform. Use this when creating resources to see available regions.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "cloudPlatform": map[string]interface{}{"type": "string", "description": "Cloud platform (e.g., 'gcp', 'azure')"}, + }, + Required: []string{"cloudPlatform"}, + }, + }, + + { + Name: "pod_list", + Description: "List all pods. 
Use this to see available pods (environments/tenants) and their details.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{}, + }, + }, + { + Name: "pod_describe", + Description: "Get detailed information about a pod. Use this to see pod configuration, users, and settings.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "podId": map[string]interface{}{"type": "string", "description": "Pod ID"}, + }, + Required: []string{"podId"}, + }, + }, + { + Name: "pod_role_list", + Description: "List all user roles in a pod. Use this to see who has access to a pod and their permission levels.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "organizationId": map[string]interface{}{"type": "string", "description": "Organization ID"}, + "podId": map[string]interface{}{"type": "string", "description": "Pod ID"}, + }, + Required: []string{"organizationId", "podId"}, + }, + }, + { + Name: "pod_role_grant", + Description: "Grant a user a role in a pod. Use this when adding users to a pod with specific permissions.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "organizationId": map[string]interface{}{"type": "string", "description": "Organization ID"}, + "podId": map[string]interface{}{"type": "string", "description": "Pod ID"}, + "email": map[string]interface{}{"type": "string", "description": "User email"}, + "role": map[string]interface{}{"type": "string", "description": "Role to grant (ADMIN, USER, SUPPORT)"}, + }, + Required: []string{"organizationId", "podId", "email", "role"}, + }, + }, + { + Name: "pod_role_revoke", + Description: "Revoke a user's role in a pod. 
Use this to remove a user's access to a pod.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "organizationId": map[string]interface{}{"type": "string", "description": "Organization ID"}, + "podId": map[string]interface{}{"type": "string", "description": "Pod ID"}, + "email": map[string]interface{}{"type": "string", "description": "User email"}, + "role": map[string]interface{}{"type": "string", "description": "Role to revoke (ADMIN, USER, SUPPORT)"}, + }, + Required: []string{"organizationId", "podId", "email", "role"}, + }, + }, + + { + Name: "organization_list", + Description: "List all organizations. Use this to see available organizations and their details.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{}, + }, + }, + + { + Name: "resource_credentials", + Description: "Get temporary credentials for accessing a cloud resource. Use this when you need programmatic access credentials (e.g., for scripts, external tools).", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "resourceId": map[string]interface{}{"type": "string", "description": "Resource ID"}, + "duration": map[string]interface{}{"type": "integer", "description": "Credential duration in seconds"}, + }, + Required: []string{"resourceId"}, + }, + }, + { + Name: "resource_open_console", + Description: "Get cloud console link for a resource. Use this to provide users with a web link to view/manage the resource in the cloud provider's console.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "resourceId": map[string]interface{}{"type": "string", "description": "Resource ID"}, + }, + Required: []string{"resourceId"}, + }, + }, + { + Name: "resource_list_tree", + Description: "List resources in tree view showing folder hierarchy. 
Use this to visualize workspace organization with resources grouped by folders.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{}, + }, + }, + { + Name: "resource_mount", + Description: "Mount workspace bucket resources to local filesystem. Use this when user needs to access bucket contents as if they were local files.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{}, + }, + }, + { + Name: "resource_unmount", + Description: "Unmount workspace bucket resources. Use this to disconnect previously mounted buckets from local filesystem.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{}, + }, + }, + + { + Name: "notebook_start", + Description: "Start a stopped notebook instance. Use this to resume a notebook that was stopped to save costs. Convenience wrapper for app start.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "notebookId": map[string]interface{}{"type": "string", "description": "Notebook instance ID"}, + }, + Required: []string{"notebookId"}, + }, + }, + { + Name: "notebook_stop", + Description: "Stop a running notebook instance. Use this to pause a notebook to save compute costs. Convenience wrapper for app stop.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "notebookId": map[string]interface{}{"type": "string", "description": "Notebook instance ID"}, + }, + Required: []string{"notebookId"}, + }, + }, + { + Name: "notebook_launch", + Description: "Launch a running notebook instance. Use this to get the URL and open a notebook. 
Convenience wrapper for app launch.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "notebookId": map[string]interface{}{"type": "string", "description": "Notebook instance ID"}, + }, + Required: []string{"notebookId"}, + }, + }, + + { + Name: "cluster_start", + Description: "Start a stopped Dataproc cluster. Use this to resume a Spark cluster that was stopped to save costs.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "clusterId": map[string]interface{}{"type": "string", "description": "Cluster ID"}, + }, + Required: []string{"clusterId"}, + }, + }, + { + Name: "cluster_stop", + Description: "Stop a running Dataproc cluster. Use this to pause a Spark cluster to save compute costs.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "clusterId": map[string]interface{}{"type": "string", "description": "Cluster ID"}, + }, + Required: []string{"clusterId"}, + }, + }, + { + Name: "cluster_launch", + Description: "Launch Dataproc cluster proxy view. Use this to get the URL for accessing cluster monitoring and Spark UI.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "clusterId": map[string]interface{}{"type": "string", "description": "Cluster ID"}, + }, + Required: []string{"clusterId"}, + }, + }, + + { + Name: "workflow_list", + Description: "List all workflows. Use this to see available workflows in the workspace.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "workspaceId": map[string]interface{}{"type": "string", "description": "Workspace ID"}, + }, + Required: []string{"workspaceId"}, + }, + }, + { + Name: "workflow_create", + Description: "Create a new workflow. 
Use this when user wants to set up a workflow for data processing or analysis pipelines.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "workspaceId": map[string]interface{}{"type": "string", "description": "Workspace ID"}, + "workflowId": map[string]interface{}{"type": "string", "description": "Workflow ID"}, + "bucketId": map[string]interface{}{"type": "string", "description": "BUCKET NAME (not UUID) - e.g., 'cohort_exports'. Get from workspace_list_resources metadata.name field."}, + "path": map[string]interface{}{"type": "string", "description": "Path to workflow definition file in bucket (e.g., 'workflows/myworkflow.wdl')"}, + "displayName": map[string]interface{}{"type": "string", "description": "Workflow display name"}, + "description": map[string]interface{}{"type": "string", "description": "Description of the workflow"}, + }, + Required: []string{"workspaceId", "workflowId", "bucketId", "path"}, + }, + }, + { + Name: "workflow_describe", + Description: "Get detailed information about a workflow. Use this to see workflow configuration and status.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "workspaceId": map[string]interface{}{"type": "string", "description": "Workspace ID"}, + "workflowId": map[string]interface{}{"type": "string", "description": "Workflow ID"}, + }, + Required: []string{"workspaceId", "workflowId"}, + }, + }, + { + Name: "workflow_job_list", + Description: "List all workflow jobs. Use this to see job history, status, and details.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{}, + }, + }, + { + Name: "workflow_job_describe", + Description: "Get detailed information about a workflow job. 
Use this to see job configuration, status, inputs, and outputs.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "workspaceId": map[string]interface{}{"type": "string", "description": "Workspace ID"}, + "jobId": map[string]interface{}{"type": "string", "description": "Job ID"}, + }, + Required: []string{"workspaceId", "jobId"}, + }, + }, + { + Name: "workflow_job_run", + Description: "Start a workflow job. Use this to execute a workflow with specific inputs.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "workspaceId": map[string]interface{}{"type": "string", "description": "Workspace ID"}, + "workflowId": map[string]interface{}{"type": "string", "description": "Workflow ID"}, + "outputBucketId": map[string]interface{}{"type": "string", "description": "BUCKET NAME (not UUID) for outputs - e.g., 'cohort_exports'"}, + "jobId": map[string]interface{}{"type": "string", "description": "Optional job ID"}, + "description": map[string]interface{}{"type": "string", "description": "Job description"}, + "outputPath": map[string]interface{}{"type": "string", "description": "Output path in bucket"}, + "inputs": map[string]interface{}{"type": "object", "description": "Job inputs as key-value pairs"}, + }, + Required: []string{"workspaceId", "workflowId", "outputBucketId"}, + }, + }, + { + Name: "workflow_job_cancel", + Description: "Cancel a running workflow job. Use this to stop a job that is in progress.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "workspaceId": map[string]interface{}{"type": "string", "description": "Workspace ID"}, + "jobId": map[string]interface{}{"type": "string", "description": "Job ID"}, + }, + Required: []string{"workspaceId", "jobId"}, + }, + }, + + { + Name: "cromwell_generate_config", + Description: "Generate Cromwell configuration file. 
Use this when setting up Cromwell workflows to create the required config file.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "path": map[string]interface{}{"type": "string", "description": "Output path for cromwell.conf"}, + }, + Required: []string{"path"}, + }, + }, + { + Name: "workspace_configure_aws", + Description: "Generate AWS configuration file for workspace. Use this when workspace needs to access AWS resources.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "workspaceId": map[string]interface{}{"type": "string", "description": "Workspace ID"}, + }, + Required: []string{"workspaceId"}, + }, + }, + { + Name: "resolve", + Description: "Resolve a resource to its cloud ID or path. Use this to get the actual cloud identifier (bucket name, dataset ID, etc.) for a workspace resource.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "resourceId": map[string]interface{}{"type": "string", "description": "Resource ID to resolve"}, + }, + Required: []string{"resourceId"}, + }, + }, + { + Name: "version", + Description: "Get the installed wb CLI version. Use this to check which version is installed or for troubleshooting.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{}, + }, + }, + + { + Name: "bq_execute", + Description: "Execute BigQuery command in workspace context. Use this to run bq CLI commands with workspace's BigQuery access.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "command": map[string]interface{}{"type": "string", "description": "BigQuery command (without 'bq' prefix)"}, + }, + Required: []string{"command"}, + }, + }, + { + Name: "gcloud_execute", + Description: "Execute gcloud command in workspace context. 
Use this to run gcloud CLI commands with workspace's GCP project.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "command": map[string]interface{}{"type": "string", "description": "gcloud command (without 'gcloud' prefix)"}, + }, + Required: []string{"command"}, + }, + }, + { + Name: "gsutil_execute", + Description: "Execute gsutil command in workspace context. Use this to run gsutil CLI commands for GCS operations.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "command": map[string]interface{}{"type": "string", "description": "gsutil command (without 'gsutil' prefix)"}, + }, + Required: []string{"command"}, + }, + }, + { + Name: "git_execute", + Description: "Execute git command in workspace context. Use this for git operations within workspace.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "command": map[string]interface{}{"type": "string", "description": "git command (without 'git' prefix)"}, + }, + Required: []string{"command"}, + }, + }, + + { + Name: "workspace_list_all", + Description: "List all workspaces with optional property filters. Use properties={'terra-type': 'data-collection'} to find data collections with underlays, properties={'terra-dx-underlay-name': ''} to filter by underlay", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "properties": map[string]interface{}{"type": "object"}, + "limit": map[string]interface{}{"type": "integer", "default": 100}, + "offset": map[string]interface{}{"type": "integer", "default": 0}, + }, + }, + }, + { + Name: "workspace_get", + Description: "Get workspace details by ID. 
workspaceId is the user-facing ID (e.g., 'test-1599'), not the UUID.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "workspaceId": map[string]interface{}{"type": "string", "description": "User-facing workspace ID (e.g., 'test-1599')"}, + }, + Required: []string{"workspaceId"}, + }, + }, + { + Name: "workspace_list_resources", + Description: "List all resources in a workspace including cohorts, buckets, datasets, etc. workspaceId is the user-facing ID (e.g., 'test-1599'), not the UUID.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "workspaceId": map[string]interface{}{"type": "string", "description": "User-facing workspace ID (e.g., 'test-1599')"}, + "offset": map[string]interface{}{"type": "integer", "default": 0}, + "limit": map[string]interface{}{"type": "integer", "default": 100}, + }, + Required: []string{"workspaceId"}, + }, + }, + + { + Name: "underlay_list", + Description: "List all available underlays", + InputSchema: InputSchema{Type: "object", Properties: map[string]interface{}{}}, + }, + { + Name: "underlay_get_schema", + Description: "Get complete underlay schema with entities and attributes. This returns the raw schema. 
For cohort building, use underlay_list_criteria_selectors instead to get available criteria selectors.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "underlayName": map[string]interface{}{"type": "string"}, + }, + Required: []string{"underlayName"}, + }, + }, + { + Name: "underlay_list_entities", + Description: "List all entities in an underlay (e.g., Person, Condition)", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "underlayName": map[string]interface{}{"type": "string"}, + }, + Required: []string{"underlayName"}, + }, + }, + { + Name: "underlay_get_entity", + Description: "Get entity details including attributes and relationships", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "underlayName": map[string]interface{}{"type": "string"}, + "entityName": map[string]interface{}{"type": "string"}, + }, + Required: []string{"underlayName", "entityName"}, + }, + }, + { + Name: "underlay_list_criteria_selectors", + Description: `STEP 1 of cohort creation: Discover available criteria selectors for an underlay. + +Returns array of selectors, each with: +- name: Selector name (use in selectorOrModifierName) +- plugin: Plugin type (use in pluginName) +- pluginConfig: JSON string (copy to uiConfig when building criteria) +- category: Display category +- displayName: Human-readable name + +EXTRACT from each selector: +1. selector.name → save for selectorOrModifierName +2. selector.plugin → save for pluginName +3. 
selector.pluginConfig → save as uiConfig (keep as JSON string) + +For "entityGroup" plugin selectors: +- Parse pluginConfig to extract classificationEntityGroups[0].id (e.g., "currentDiagnosesPerson") +- This is the entityGroup value needed in selectionData +- Parse columns to find entity's ID field name for data_query_hints + +COMPLETE COHORT WORKFLOW: +STEP 1: Call underlay_list_criteria_selectors(underlayName) → get selectors +STEP 2: Call cohort_create_in_workspace(workspaceId, underlayId, underlayName, name) WITHOUT criteriaJson → creates cohort with all participants +STEP 3: Extract studyId and cohortId from response +STEP 4: Call data_query_hints(studyId, cohortId, entityName) → get entity codes/values AND numeric ranges +STEP 5: Build criteriaJson using selector info + codes/ranges from hints +STEP 6: Call cohort_update_criteria(studyId, cohortId, criteriaJson) → apply filters + +LEARNING CORRECT FORMATS: +Use study_list_cohorts to examine existing cohorts and see their actual criteriaGroupSections. +This is the BEST way to learn correct selectionData formats for each plugin type. + +selectionData format by plugin type (see proto definitions in data-explorer repo): +- "attribute": {"dataRanges":[{"min":,"max":}]} - BOTH min and max required as numbers +- "entityGroup": {"selected": [{"key": {"int64Key": }, "name": "", "entityGroup": ""}]} +- "multiAttribute": {"selected": [{"attribute": "", "dataRanges": [{"min":,"max":}]}]}`, + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "underlayName": map[string]interface{}{"type": "string", "description": "Underlay name"}, + }, + Required: []string{"underlayName"}, + }, + }, + + { + Name: "data_query_hints", + Description: `STEP 4 of cohort workflow: Discover entity codes, value distributions, and numeric ranges. + +Use this to find: +1. Entity codes for entityGroup filters (diagnosis IDs, medication IDs, etc.) +2. Enum values for categorical attributes +3. 
Numeric ranges (min/max) for numeric attributes like age + +INPUT: +- studyId, cohortId: From cohort_create_in_workspace response +- entityName: Entity to query (e.g., "person", "diagnoses", "medications") + +RESPONSE STRUCTURE - displayHints array with elements containing: +{ + "attribute": {"name": "", "dataType": "INT64|STRING|..."}, + "displayHint": { + "numericRangeHint": {"min": , "max": } // For numeric attributes + OR + "enumHint": {"enumHintValues": [...]} // For categorical attributes + } +} + +CRITICAL: For numeric attributes (like age): +- Response includes "numericRangeHint" with actual data min/max values +- Use these EXACT min/max values in your selectionData dataRanges +- BOTH min and max are REQUIRED in dataRanges (see DataRange proto) +- Adjust min or max to create your filter (e.g., if max=92, use min=66,max=92 for "over 65") + +For entityGroup attributes: +- Look for instances with ID fields +- Extract ID value for int64Key and name for display + +After getting hints, proceed to STEP 5: Build criteriaJson, then STEP 6: cohort_update_criteria.`, + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "studyId": map[string]interface{}{"type": "string", "description": "From cohort_create_in_workspace response"}, + "cohortId": map[string]interface{}{"type": "string", "description": "From cohort_create_in_workspace response"}, + "entityName": map[string]interface{}{"type": "string", "description": "Entity name (e.g., 'diagnoses', 'medications', 'person')"}, + }, + Required: []string{"studyId", "cohortId", "entityName"}, + }, + }, + { + Name: "data_sample_instances", + Description: "Sample actual data from an entity with optional filters", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "studyId": map[string]interface{}{"type": "string"}, + "cohortId": map[string]interface{}{"type": "string"}, + "entityName": map[string]interface{}{"type": "string"}, + "includeAttributes": 
map[string]interface{}{"type": "array", "items": map[string]interface{}{"type": "string"}}, + "filter": map[string]interface{}{"type": "object"}, + "limit": map[string]interface{}{"type": "integer", "default": 50}, + }, + Required: []string{"studyId", "cohortId", "entityName"}, + }, + }, + { + Name: "study_list", + Description: `List all Data Explorer studies. Use this to find studyId for existing cohorts. + +WHEN TO USE: +- When you need to find studyId/cohortId for an existing cohort +- When you want to see what studies exist in the workspace +- BEFORE calling data_query_hints or cohort_update_criteria on existing cohorts + +RESPONSE contains array of studies with: +- id: The studyId (UUID) needed for other API calls +- displayName: Usually "Workspace: " +- properties.externalId: The workspace UUID +- created, createdBy, lastModified, lastModifiedBy + +WORKFLOW to find existing cohort IDs: +1. Call study_list to get all studies +2. For each study, call study_list_cohorts(studyId) to list cohorts +3. Find cohort by displayName or underlayName +4. Extract studyId and cohortId for use in other tools`, + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "offset": map[string]interface{}{"type": "integer", "default": 0, "description": "Number of items to skip"}, + "limit": map[string]interface{}{"type": "integer", "default": 50, "description": "Maximum items to return"}, + }, + }, + }, + { + Name: "study_list_cohorts", + Description: `List all cohorts in a Data Explorer study. Use this to find cohortId and view actual criteria. 
+ +WHEN TO USE: +- After calling study_list to get a studyId +- When you want to see what cohorts exist in a study +- When you want to examine the actual criteriaGroupSections used in existing cohorts +- To learn correct selectionData formats by looking at working cohorts + +RESPONSE contains array of cohorts with: +- id: The cohortId (UUID) needed for data_query_hints, cohort_update_criteria +- underlayName: Which underlay this cohort uses +- displayName: Human-readable cohort name +- description: Cohort description +- criteriaGroupSections: The ACTUAL criteria used (great for learning correct formats!) +- created, createdBy, lastModified, lastModifiedBy + +LEARNING FROM EXISTING COHORTS: +The response shows the exact criteriaGroupSections that work. Look at: +- selectionData format for each plugin type +- How selectorOrModifierName is used +- How uiConfig is structured +This is the BEST way to learn correct formats - copy from working cohorts! + +WORKFLOW: +1. Call study_list to get studyId +2. Call THIS tool with studyId to list cohorts +3. Extract cohortId for the cohort you want to work with +4. Optionally: Study the criteriaGroupSections to learn correct formats`, + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "studyId": map[string]interface{}{"type": "string", "description": "Study ID from study_list"}, + "offset": map[string]interface{}{"type": "integer", "default": 0, "description": "Number of items to skip"}, + "limit": map[string]interface{}{"type": "integer", "default": 50, "description": "Maximum items to return"}, + }, + Required: []string{"studyId"}, + }, + }, + + { + Name: "cohort_create_in_workspace", + Description: `STEP 2 of cohort workflow: Create cohort in workspace. + +TWO MODES: +1. 
WITHOUT criteriaJson (RECOMMENDED for new underlays): Creates cohort with all participants + - Use this to create initial cohort for discovering entity codes + - Then use data_query_hints to get codes + - Then use cohort_update_criteria to apply filters + +2. WITH criteriaJson: Creates cohort with filters already applied + - Only use if you already know all selector names and entity codes + +RESPONSE contains studyId and cohortId at top level: +{ + "studyId": "abc-123", + "cohortId": "def-456", + "resourceId": "...", + ... +} + +Extract these for next steps: +- studyId: Needed for data_query_hints and cohort_update_criteria +- cohortId: Needed for data_query_hints and cohort_update_criteria + +RECOMMENDED WORKFLOW (for unknown underlay): +1. Call underlay_list_criteria_selectors → get selectors +2. Call THIS tool WITHOUT criteriaJson → creates "all participants" cohort +3. Extract studyId and cohortId from response +4. Call data_query_hints(studyId, cohortId, entityName) → get entity codes +5. Build criteriaJson with discovered selectors and codes +6. Call cohort_update_criteria(studyId, cohortId, criteriaJson) → apply filters + +criteriaJson structure (if providing): +{ + "criteriaGroupSections": [{ + "id": "section-id", + "displayName": "Section Name", + "disabled": false, + "operator": "AND", + "excluded": false, + "firstBlockReducingOperator": "ANY", + "secondBlockReducingOperator": "ANY", + "secondBlockCriteriaGroups": [], + "criteriaGroups": [{ + "id": "group-id", + "disabled": false, + "criteria": [{ + "id": "criteria-id", + "pluginName": "", + "selectorOrModifierName": "", + "selectionData": "", + "uiConfig": "", + "pluginVersion": 0, + "tags": {}, + "enabled": true + }] + }] + }] +} + +Each criterion in separate criteriaGroup. See underlay_list_criteria_selectors for selectionData formats. 
+Use study_list_cohorts to examine working cohorts and learn correct formats by example.`, + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "workspaceId": map[string]interface{}{"type": "string", "description": "User-facing workspace ID (e.g., 'test-1599')"}, + "underlayId": map[string]interface{}{"type": "string"}, + "underlayName": map[string]interface{}{"type": "string"}, + "name": map[string]interface{}{"type": "string"}, + "displayName": map[string]interface{}{"type": "string"}, + "description": map[string]interface{}{"type": "string"}, + "criteriaJson": map[string]interface{}{"type": "string", "description": "Complete criteriaGroupSections JSON (see tool description for required structure)"}, + "folderId": map[string]interface{}{"type": "string"}, + }, + Required: []string{"workspaceId", "underlayId", "underlayName", "name"}, + }, + }, + { + Name: "cohort_update_criteria", + Description: `STEP 6 of cohort workflow: Apply filter criteria to existing cohort. + +This is the final step after discovering selectors, creating initial cohort, and querying entity codes. + +INPUT: +- studyId, cohortId: From cohort_create_in_workspace response +- criteriaGroupSections: Array of criteria group sections (see structure below) + +BUILD criteriaGroupSections array: +[{ + "id": "section-1", + "displayName": "Filters", + "disabled": false, + "operator": "AND", + "excluded": false, + "firstBlockReducingOperator": "ANY", + "secondBlockReducingOperator": "ANY", + "secondBlockCriteriaGroups": [], + "criteriaGroups": [ + { + "id": "group-1", + "disabled": false, + "criteria": [{ + "id": "crit-1", + "pluginName": "", + "selectorOrModifierName": "", + "selectionData": "", + "uiConfig": "", + "pluginVersion": 0, + "tags": {}, + "enabled": true + }] + } + ] +}] + +BUILDING selectionData by plugin type: +1. 
"attribute" plugin (numeric attributes like age): + - Format: "{\"dataRanges\":[{\"min\":66,\"max\":92}]}" + - BOTH min and max REQUIRED as numbers (not strings) + - Get min/max from data_query_hints numericRangeHint response + - Escape as JSON string when putting in criteria + +2. "entityGroup" plugin (diagnoses, medications): + - Use codes from data_query_hints response + - Format: "{\"selected\":[{\"key\":{\"int64Key\":CODE},\"name\":\"NAME\",\"entityGroup\":\"GROUP_ID\"}]}" + - int64Key value must be NUMBER not string + - entityGroup ID from selector's pluginConfig classificationEntityGroups[0].id + +3. "multiAttribute" plugin: + - Format: "{\"selected\":[{\"attribute\":\"ATTR\",\"dataRanges\":[{\"min\":NUM,\"max\":NUM}]}]}" + - For categorical: "{\"selected\":[{\"attribute\":\"ATTR\",\"values\":[{\"value\":{\"stringVal\":\"VALUE\"}}]}]}" + +CRITICAL: +- Each criterion goes in its own criteriaGroup. Operator "AND" means all groups must match. +- Use study_list_cohorts to examine working cohorts and learn correct formats.`, + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "studyId": map[string]interface{}{"type": "string", "description": "From cohort_create_in_workspace response"}, + "cohortId": map[string]interface{}{"type": "string", "description": "From cohort_create_in_workspace response"}, + "criteriaGroupSections": map[string]interface{}{"type": "array", "description": "Array of criteria group sections", "items": map[string]interface{}{"type": "object"}}, + "displayName": map[string]interface{}{"type": "string", "description": "Optional: Update cohort display name"}, + "description": map[string]interface{}{"type": "string", "description": "Optional: Update cohort description"}, + }, + Required: []string{"studyId", "cohortId"}, + }, + }, + { + Name: "cohort_count_instances", + Description: "Count instances matching cohort criteria", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + 
"studyId": map[string]interface{}{"type": "string"}, + "cohortId": map[string]interface{}{"type": "string"}, + "entity": map[string]interface{}{"type": "string"}, + "groupByAttributes": map[string]interface{}{"type": "array", "items": map[string]interface{}{"type": "string"}}, + }, + Required: []string{"studyId", "cohortId"}, + }, + }, + + { + Name: "export_list_models", + Description: `List available export models for an underlay. + +Export models define how cohort data can be exported to different formats (CSV, IPYNB, etc.). + +RESPONSE contains array of export models with: +- name: Export model identifier (use in export_cohort) +- displayName: Human-readable name +- description: What this export model does +- numPrimaryEntityCap: Maximum number of entities that can be exported`, + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "underlayName": map[string]interface{}{"type": "string", "description": "Underlay name"}, + }, + Required: []string{"underlayName"}, + }, + }, + { + Name: "export_describe", + Description: `Describe what will be included in a cohort export. + +Shows which entities and attributes will be exported based on cohort variable set or all criteria. + +INPUT: +- studyId, cohortId: From cohort_create_in_workspace +- allCriteriaFromCohort: If true, exports all criteria; if false (default), exports variable set`, + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "studyId": map[string]interface{}{"type": "string", "description": "From cohort_create_in_workspace"}, + "cohortId": map[string]interface{}{"type": "string", "description": "From cohort_create_in_workspace"}, + "allCriteriaFromCohort": map[string]interface{}{"type": "boolean", "description": "Export all criteria (true) or variable set (false)"}, + }, + Required: []string{"studyId", "cohortId"}, + }, + }, + { + Name: "export_preview", + Description: `Preview what data will be exported before running the actual export. 
+ +Shows sample instances that will be included in the export. + +INPUT: +- studyId, cohortId: From cohort_create_in_workspace +- exportModel: Export model name from export_list_models +- entityName: Entity to preview (e.g., "person", "diagnoses") +- limit: Max instances to preview (default: 20, max: 20) +- inputs: Optional parameters required by export model`, + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "studyId": map[string]interface{}{"type": "string", "description": "From cohort_create_in_workspace"}, + "cohortId": map[string]interface{}{"type": "string", "description": "From cohort_create_in_workspace"}, + "exportModel": map[string]interface{}{"type": "string", "description": "Export model name from export_list_models"}, + "entityName": map[string]interface{}{"type": "string", "description": "Entity to preview"}, + "limit": map[string]interface{}{"type": "integer", "description": "Max instances (default: 20)", "maximum": 20}, + "inputs": map[string]interface{}{"type": "object", "description": "Export model input parameters"}, + }, + Required: []string{"studyId", "cohortId"}, + }, + }, + { + Name: "export_cohort", + Description: `Export cohort data using specified export model. + +Creates downloadable files (CSV, IPYNB, etc.) with cohort data. + +INPUT: +- studyId, cohortId: From cohort_create_in_workspace +- exportRequests: Array of export requests, each with: + - exportModel: Model name from export_list_models (REQUIRED) + - inputs: Model-specific parameters (optional) + - includeAnnotations: Include review annotations (default: true) + - compressFiles: Compress output files (default: true) + +RESPONSE contains array of export results with: +- status: "SUCCEEDED" or "FAILED" +- links: Download URLs for exported files +- error: Error message if failed + +WORKFLOW: +1. Call export_list_models to see available models +2. Call export_preview to preview what will be exported +3. Call THIS tool to create the export +4. 
Use links from response to download files`, + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "studyId": map[string]interface{}{"type": "string", "description": "From cohort_create_in_workspace"}, + "cohortId": map[string]interface{}{"type": "string", "description": "From cohort_create_in_workspace"}, + "exportRequests": map[string]interface{}{ + "type": "array", + "description": "Array of export requests", + "items": map[string]interface{}{ + "type": "object", + "properties": map[string]interface{}{ + "exportModel": map[string]interface{}{"type": "string", "description": "Export model name"}, + "inputs": map[string]interface{}{"type": "object", "description": "Model input parameters"}, + "includeAnnotations": map[string]interface{}{"type": "boolean", "default": true}, + "compressFiles": map[string]interface{}{"type": "boolean", "default": true}, + }, + "required": []string{"exportModel"}, + }, + }, + }, + Required: []string{"studyId", "cohortId", "exportRequests"}, + }, + }, + + { + Name: "filter_build_attribute", + Description: "Build attribute filter (e.g., age > 65). 
For cohort creation, use the criteriaGroupSections structure in cohort_create_in_workspace.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "attribute": map[string]interface{}{"type": "string"}, + "operator": map[string]interface{}{"type": "string", "enum": []string{"EQUALS", "NOT_EQUALS", "LESS_THAN", "GREATER_THAN", "LESS_THAN_OR_EQUAL", "GREATER_THAN_OR_EQUAL", "IN", "NOT_IN", "BETWEEN", "IS_NULL", "IS_NOT_NULL"}}, + "value": map[string]interface{}{}, + "values": map[string]interface{}{"type": "array", "items": map[string]interface{}{}}, + "dataType": map[string]interface{}{"type": "string", "enum": []string{"BOOLEAN", "INT64", "STRING", "DATE", "TIMESTAMP", "DOUBLE"}}, + }, + Required: []string{"attribute", "operator", "dataType"}, + }, + }, + { + Name: "filter_build_relationship", + Description: "Build relationship filter (e.g., persons with condition). For cohort creation, use the criteriaGroupSections structure in cohort_create_in_workspace.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "relatedEntity": map[string]interface{}{"type": "string"}, + "subfilter": map[string]interface{}{"type": "object"}, + }, + Required: []string{"relatedEntity"}, + }, + }, + { + Name: "filter_build_boolean_logic", + Description: "Combine filters with AND/OR/NOT. For cohort creation, use the criteriaGroupSections structure in cohort_create_in_workspace.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "operator": map[string]interface{}{"type": "string", "enum": []string{"AND", "OR", "NOT"}}, + "subfilters": map[string]interface{}{"type": "array", "items": map[string]interface{}{"type": "object"}}, + }, + Required: []string{"operator", "subfilters"}, + }, + }, + { + Name: "filter_build_hierarchy", + Description: "Build hierarchy filter (e.g., all descendants of concept). 
For cohort creation, use the criteriaGroupSections structure in cohort_create_in_workspace.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "hierarchy": map[string]interface{}{"type": "string"}, + "operator": map[string]interface{}{"type": "string", "enum": []string{"CHILD_OF", "DESCENDANT_OF_INCLUSIVE", "IS_ROOT", "IS_MEMBER", "IS_LEAF"}}, + "values": map[string]interface{}{"type": "array", "items": map[string]interface{}{}}, + }, + Required: []string{"hierarchy", "operator"}, + }, + }, +} + +func initializeConfig() error { + // Default to production Verily URLs + workspaceBaseURL = "https://workbench.verily.com/api/wsm" + dataExplorerURL = "https://workbench.verily.com/api/de" + + cmd := exec.Command("wb", "status", "--format=json") + output, err := cmd.CombinedOutput() + if err != nil { + fmt.Fprintf(os.Stderr, "Warning: wb status failed, using default URLs: %v\n", err) + } else { + var status map[string]interface{} + if err := json.Unmarshal(output, &status); err != nil { + fmt.Fprintf(os.Stderr, "Warning: failed to parse wb status JSON, using default URLs: %v\n", err) + } else { + // Extract server URLs + if server, ok := status["server"].(map[string]interface{}); ok { + if wsURL, ok := server["workspaceManagerUri"].(string); ok && wsURL != "" { + workspaceBaseURL = wsURL + dataExplorerURL = strings.Replace(wsURL, "/api/wsm", "/api/de", 1) + } + } else { + fmt.Fprintf(os.Stderr, "Warning: server info not found in wb status, using default URLs\n") + } + // Best-effort workspace UUID cache at startup. If this fails (e.g. auth not + // ready yet), getCurrentWorkspaceUUID() will retry lazily at call time. 
+ if _, startupErr := getCurrentWorkspaceUUID(); startupErr != nil { + fmt.Fprintf(os.Stderr, "Warning: could not resolve workspace UUID at startup (will retry on first use): %v\n", startupErr) + } + } + } + + // Final safety check - ensure URLs are never empty + if workspaceBaseURL == "" { + workspaceBaseURL = "https://workbench.verily.com/api/wsm" + } + if dataExplorerURL == "" { + dataExplorerURL = "https://workbench.verily.com/api/de" + } + + fmt.Fprintf(os.Stderr, "Initialized - Workspace: %s, DataExplorer: %s\n", workspaceBaseURL, dataExplorerURL) + return nil +} + +// resolveWorkspaceId resolves an arbitrary user-facing workspace ID to its UUID +// by searching the full workspace list. Used by tools that accept an explicit +// workspaceId parameter. For the CURRENT workspace, use getCurrentWorkspaceUUID(). +func resolveWorkspaceId(workspaceId string) (string, error) { + if isUUID(workspaceId) { + return workspaceId, nil // already a UUID + } + for _, limit := range []int{100, 5000} { + listUrl := fmt.Sprintf("%s/api/workspaces/v1?offset=0&limit=%d", workspaceBaseURL, limit) + listResp, apiErr := makeAPIRequest("GET", listUrl, nil) + if apiErr != nil { + continue + } + var listData map[string]interface{} + if json.Unmarshal(listResp, &listData) != nil { + continue + } + workspaces, _ := listData["workspaces"].([]interface{}) + for _, ws := range workspaces { + wsMap, ok := ws.(map[string]interface{}) + if !ok { + continue + } + ufid, _ := wsMap["userFacingId"].(string) + id, _ := wsMap["id"].(string) + if ufid == workspaceId || id == workspaceId { + return id, nil + } + } + } + return "", fmt.Errorf("workspace '%s' not found", workspaceId) +} + +// isUUID returns true if s looks like a UUID (8-4-4-4-12 hex format). 
+func isUUID(s string) bool { + if len(s) != 36 { + return false + } + for i, c := range s { + if i == 8 || i == 13 || i == 18 || i == 23 { + if c != '-' { + return false + } + } else if !((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')) { + return false + } + } + return true +} + +// getCurrentWorkspaceUUID returns the UUID of the currently active workspace. +// It uses a three-layer strategy so that temporary failures (auth not ready, +// server startup race) do not permanently break workspace-scoped tools: +// +// 1. Return the cached UUID if already resolved. +// 2. Call `wb workspace describe --format=json` — fast, no list traversal needed. +// If the response contains a `uuid` field, use it directly. +// If not, extract the `id` / `userFacingId` and proceed to layer 3. +// 3. Search the workspace list with a small page (100) first, then full (5000), +// using the userFacingId obtained from layer 2. +// +// The result is cached so subsequent calls within the same server session are instant. +func getCurrentWorkspaceUUID() (string, error) { + if cachedWorkspaceUUID != "" { + return cachedWorkspaceUUID, nil + } + + // Layer 1: wb workspace describe — most direct path. + userFacingId := "" + cmd := exec.Command("wb", "workspace", "describe", "--format=json") + if out, err := cmd.CombinedOutput(); err == nil { + var desc map[string]interface{} + if json.Unmarshal(out, &desc) == nil { + // Some Workbench versions return uuid directly. + if uuid, ok := desc["uuid"].(string); ok && isUUID(uuid) { + cachedWorkspaceUUID = uuid + fmt.Fprintf(os.Stderr, "Resolved workspace UUID from describe: %s\n", uuid) + return uuid, nil + } + // id may be the UUID on some versions, or userFacingId on others. 
+ if id, ok := desc["id"].(string); ok { + if isUUID(id) { + cachedWorkspaceUUID = id + fmt.Fprintf(os.Stderr, "Resolved workspace UUID from describe.id: %s\n", id) + return id, nil + } + userFacingId = id + } + // Explicit userFacingId field takes precedence if present. + if ufid, ok := desc["userFacingId"].(string); ok && ufid != "" { + userFacingId = ufid + } + } + } + + // Layer 2: fall back to wb status for userFacingId if describe didn't give it. + if userFacingId == "" { + cmd2 := exec.Command("wb", "status", "--format=json") + if out, err := cmd2.CombinedOutput(); err == nil { + var status map[string]interface{} + if json.Unmarshal(out, &status) == nil { + if ws, ok := status["workspace"].(map[string]interface{}); ok { + if ufid, ok := ws["userFacingId"].(string); ok && ufid != "" { + userFacingId = ufid + } else if id, ok := ws["id"].(string); ok { + if isUUID(id) { + cachedWorkspaceUUID = id + return id, nil + } + userFacingId = id + } + } + } + } + } + + if userFacingId == "" { + return "", fmt.Errorf("no active workspace found — run `wb workspace set --id=` first") + } + + // Layer 3: resolve userFacingId → UUID via workspace list. + // Try a small page first to avoid fetching 5,000 workspaces for common cases. + for _, limit := range []int{100, 5000} { + listUrl := fmt.Sprintf("%s/api/workspaces/v1?offset=0&limit=%d", workspaceBaseURL, limit) + listResp, apiErr := makeAPIRequest("GET", listUrl, nil) + if apiErr != nil { + continue + } + var listData map[string]interface{} + if json.Unmarshal(listResp, &listData) != nil { + continue + } + workspaces, ok := listData["workspaces"].([]interface{}) + if !ok { + continue + } + for _, w := range workspaces { + wsMap, ok := w.(map[string]interface{}) + if !ok { + continue + } + ufid, _ := wsMap["userFacingId"].(string) + id, _ := wsMap["id"].(string) + if ufid == userFacingId || id == userFacingId { + // id in the workspace list API is always the UUID. 
+ cachedWorkspaceUUID = id + fmt.Fprintf(os.Stderr, "Resolved workspace UUID from list (limit=%d): %s\n", limit, id) + return id, nil + } + } + } + + return "", fmt.Errorf("workspace '%s' not found in accessible workspaces", userFacingId) +} + +func getToken() (string, error) { + cmd := exec.Command("wb", "auth", "print-access-token") + output, err := cmd.CombinedOutput() + if err != nil { + return "", fmt.Errorf("failed to get access token: %v", err) + } + return strings.TrimSpace(string(output)), nil +} + + +func makeAPIRequest(method, url string, body interface{}) ([]byte, error) { + token, err := getToken() + if err != nil { + return nil, err + } + + var reqBody io.Reader + if body != nil { + jsonData, err := json.Marshal(body) + if err != nil { + return nil, fmt.Errorf("failed to marshal request: %v", err) + } + reqBody = bytes.NewBuffer(jsonData) + } + + req, err := http.NewRequest(method, url, reqBody) + if err != nil { + return nil, err + } + + req.Header.Set("Authorization", "Bearer "+token) + req.Header.Set("Content-Type", "application/json") + + resp, err := httpClient.Do(req) + if err != nil { + return nil, err + } + defer resp.Body.Close() + + respBody, err := io.ReadAll(resp.Body) + if err != nil { + return nil, err + } + + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + return nil, fmt.Errorf("API error (%d): %s", resp.StatusCode, string(respBody)) + } + + return respBody, nil +} + +func executeWbCommand(args []string) (string, error) { + cmd := exec.Command("wb", args...) 
+ output, err := cmd.CombinedOutput() + return string(output), err +} + +func requireString(args map[string]interface{}, key string) (string, error) { + val, ok := args[key] + if !ok || val == nil { + return "", fmt.Errorf("missing required parameter: %s", key) + } + s, ok := val.(string) + if !ok { + return "", fmt.Errorf("parameter %s must be a string, got %T", key, val) + } + return s, nil +} + +func requireStrings(args map[string]interface{}, keys ...string) ([]string, error) { + vals := make([]string, len(keys)) + for i, key := range keys { + v, err := requireString(args, key) + if err != nil { + return nil, err + } + vals[i] = v + } + return vals, nil +} + +func handleCallTool(params CallToolParams) CallToolResult { + var output string + var err error + + switch params.Name { + case "wb_status": + output, err = executeWbCommand([]string{"status"}) + case "wb_workspace_list": + args := []string{"workspace", "list"} + if format, ok := params.Arguments["format"].(string); ok && format == "json" { + args = append(args, "--format=json") + } + output, err = executeWbCommand(args) + case "wb_execute": + command, ok := params.Arguments["command"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'command' required"}}, IsError: true} + } + output, err = executeWbCommand(strings.Fields(command)) + + case "workspace_list_all": + limit, offset := 100, 0 + if l, ok := params.Arguments["limit"].(float64); ok { + limit = int(l) + } + if o, ok := params.Arguments["offset"].(float64); ok { + offset = int(o) + } + body := map[string]interface{}{"limit": limit, "offset": offset} + if props, ok := params.Arguments["properties"].(map[string]interface{}); ok { + // Convert properties from map to array of key-value objects + var propsArray []map[string]string + for key, val := range props { + if strVal, ok := val.(string); ok { + propsArray = append(propsArray, map[string]string{"key": key, "value": strVal}) + } + } + body["properties"] 
= propsArray + } + respBody, apiErr := makeAPIRequest("POST", workspaceBaseURL+"/api/workspaces/v2/filtered", body) + if apiErr != nil { + err = apiErr + } else { + output = string(respBody) + } + + case "platform_list_data_collections": + // Fetch all data collections accessible to the user across all workspaces. + // Data collections are workspaces with the property terra-type=data-collection. + limit := 100 + if l, ok := params.Arguments["limit"].(float64); ok { + limit = int(l) + } + query := "" + if q, ok := params.Arguments["query"].(string); ok { + query = strings.ToLower(strings.TrimSpace(q)) + } + + body := map[string]interface{}{ + "limit": limit, + "offset": 0, + "properties": []map[string]string{ + {"key": "terra-type", "value": "data-collection"}, + }, + } + respBody, apiErr := makeAPIRequest("POST", workspaceBaseURL+"/api/workspaces/v2/filtered", body) + if apiErr != nil { + err = apiErr + break + } + + var wsData map[string]interface{} + if jsonErr := json.Unmarshal(respBody, &wsData); jsonErr != nil { + err = fmt.Errorf("failed to parse response: %w", jsonErr) + break + } + + workspaces, _ := wsData["workspaces"].([]interface{}) + if workspaces == nil { + workspaces = []interface{}{} + } + + var collections []map[string]interface{} + for _, w := range workspaces { + ws, ok := w.(map[string]interface{}) + if !ok { + continue + } + + uuid, _ := ws["id"].(string) + userFacingId, _ := ws["userFacingId"].(string) + name, _ := ws["displayName"].(string) + if name == "" { + name = userFacingId + } + desc, _ := ws["description"].(string) + + // Derive the Workbench UI URL for this data collection + // workspaceBaseURL is e.g. 
https://workbench.verily.com/api/wsm + workbenchBaseUI := strings.Replace(workspaceBaseURL, "/api/wsm", "", 1) + collectionURL := fmt.Sprintf("%s/data-collections/%s", workbenchBaseUI, userFacingId) + + // Extract all terra-* workspace properties into a flat map + props := make(map[string]string) + if propsArray, ok := ws["properties"].([]interface{}); ok { + for _, p := range propsArray { + if prop, ok := p.(map[string]interface{}); ok { + k, _ := prop["key"].(string) + v, _ := prop["value"].(string) + props[k] = v + } + } + } + + // Apply optional keyword filter across name, description, short description, + // modality tags, and therapeutic tags so searches like "genomics" or "imaging" work. + // Props are extracted before the filter so tags are available for matching. + if query != "" { + searchTargets := strings.Join([]string{ + strings.ToLower(name), + strings.ToLower(desc), + strings.ToLower(props["terra-workspace-short-description"]), + strings.ToLower(props["terra-data-modality-tags"]), + strings.ToLower(props["terra-therapeutic-tags"]), + strings.ToLower(props["terra-dc-data-model"]), + }, " ") + if !strings.Contains(searchTargets, query) { + continue + } + } + + // Build structured result with all meaningful metadata fields + dc := map[string]interface{}{ + "id": userFacingId, + "uuid": uuid, + "name": name, + "workbenchUrl": collectionURL, + } + + // Overview + if v := props["terra-workspace-short-description"]; v != "" { + dc["shortDescription"] = v + } + if desc != "" { + dc["description"] = desc + } + if v := props["terra-organization-name"]; v != "" { + dc["organization"] = v + } + if v := props["terra-dc-availability"]; v != "" { + dc["availability"] = v + } + if v := props["terra-dc-is-free"]; v != "" { + dc["isFree"] = v == "true" + } + if v := props["terra-is-instantly-accessible"]; v != "" { + dc["isInstantlyAccessible"] = v == "true" + } + + // Data characteristics + if v := props["terra-dc-patient-count"]; v != "" { + dc["patientCount"] = v + 
} + if v := props["terra-dc-time-frame"]; v != "" { + dc["timeFrame"] = v + } + if v := props["terra-dc-geographic-coverage"]; v != "" { + dc["geographicCoverage"] = v + } + if v := props["terra-dc-data-model"]; v != "" { + dc["dataModel"] = v + } + if v := props["terra-data-modality-tags"]; v != "" { + dc["dataModalityTags"] = v + } + if v := props["terra-therapeutic-tags"]; v != "" { + dc["therapeuticTags"] = v + } + + // Schema / underlay + if v := props["terra-dx-underlay-name"]; v != "" { + dc["underlayName"] = v + } + + // Data dictionary + if v := props["terra-dc-data-dictionary"]; v != "" { + dc["dataDictionary"] = v + } + + // Usage examples (includes sample queries) + if v := props["terra-dc-usage-examples-sample-use-cases"]; v != "" { + dc["usageExamples"] = v + } + + // Access + if v := props["terra-access-group-name"]; v != "" { + dc["accessGroupName"] = v + } + if v := props["terra-support-email"]; v != "" { + dc["supportEmail"] = v + } + + // Publication / freshness + if v := props["terra-dc-data-published"]; v != "" { + dc["dataPublished"] = v + } + if v := props["terra-dc-metadata-last-updated"]; v != "" { + dc["metadataLastUpdated"] = v + } + + // External documentation + if v := props["terra-dc-external-documentation"]; v != "" { + dc["externalDocumentation"] = v + } + + collections = append(collections, dc) + } + + if collections == nil { + collections = []map[string]interface{}{} + } + + result := map[string]interface{}{ + "dataCollections": collections, + "total": len(collections), + "scope": "platform-wide (all data collections you have READ access to)", + "attachCommand": "wb workspace clone --id= # or ask your workspace admin to attach the collection", + } + resultBytes, marshalErr := json.MarshalIndent(result, "", " ") + if marshalErr != nil { + err = fmt.Errorf("failed to marshal result: %w", marshalErr) + } else { + output = string(resultBytes) + } + + case "workspace_get": + workspaceId, ok := params.Arguments["workspaceId"].(string) + 
if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'workspaceId' required"}}, IsError: true} + } + // Resolve user-facing ID to UUID + workspaceUuid, err := resolveWorkspaceId(workspaceId) + if err != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: err.Error()}}, IsError: true} + } + url := fmt.Sprintf("%s/api/workspaces/v1/%s", workspaceBaseURL, workspaceUuid) + respBody, apiErr := makeAPIRequest("GET", url, nil) + if apiErr != nil { + err = apiErr + } else { + output = string(respBody) + } + + case "workspace_list_resources": + workspaceId, ok := params.Arguments["workspaceId"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'workspaceId' required"}}, IsError: true} + } + offset := 0 + if val, ok := params.Arguments["offset"].(float64); ok { + offset = int(val) + } + limit := 100 + if val, ok := params.Arguments["limit"].(float64); ok { + limit = int(val) + } + // Resolve user-facing ID to UUID + workspaceUuid, err := resolveWorkspaceId(workspaceId) + if err != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: err.Error()}}, IsError: true} + } + url := fmt.Sprintf("%s/api/workspaces/v1/%s/resources?offset=%d&limit=%d", workspaceBaseURL, workspaceUuid, offset, limit) + respBody, apiErr := makeAPIRequest("GET", url, nil) + if apiErr != nil { + err = apiErr + } else { + output = string(respBody) + } + + case "underlay_list": + respBody, apiErr := makeAPIRequest("GET", dataExplorerURL+"/v2/underlays", nil) + if apiErr != nil { + err = apiErr + } else { + output = string(respBody) + } + + case "underlay_get_schema": + underlayName, ok := params.Arguments["underlayName"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'underlayName' required"}}, IsError: true} + } + url := fmt.Sprintf("%s/v2/underlays/%s", dataExplorerURL, underlayName) + respBody, apiErr := makeAPIRequest("GET", url, nil) + 
if apiErr != nil { + err = apiErr + } else { + output = string(respBody) + } + + case "underlay_list_entities": + underlayName, ok := params.Arguments["underlayName"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'underlayName' required"}}, IsError: true} + } + url := fmt.Sprintf("%s/v2/underlays/%s/entities", dataExplorerURL, underlayName) + respBody, apiErr := makeAPIRequest("GET", url, nil) + if apiErr != nil { + err = apiErr + } else { + output = string(respBody) + } + + case "underlay_get_entity": + underlayName, ok := params.Arguments["underlayName"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'underlayName' required"}}, IsError: true} + } + entityName, ok := params.Arguments["entityName"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'entityName' required"}}, IsError: true} + } + url := fmt.Sprintf("%s/v2/underlays/%s/entities/%s", dataExplorerURL, underlayName, entityName) + respBody, apiErr := makeAPIRequest("GET", url, nil) + if apiErr != nil { + err = apiErr + } else { + output = string(respBody) + } + + case "underlay_list_criteria_selectors": + underlayName, ok := params.Arguments["underlayName"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'underlayName' required"}}, IsError: true} + } + // Get the schema + url := fmt.Sprintf("%s/v2/underlays/%s", dataExplorerURL, underlayName) + respBody, apiErr := makeAPIRequest("GET", url, nil) + if apiErr != nil { + err = apiErr + break + } + + // Parse the schema + var schema map[string]interface{} + if err := json.Unmarshal(respBody, &schema); err != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: fmt.Sprintf("Error parsing schema: %v", err)}}, IsError: true} + } + + // Extract criteria selectors from serializedConfiguration + serializedConfig, ok := 
schema["serializedConfiguration"].(map[string]interface{}) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: serializedConfiguration not found"}}, IsError: true} + } + + criteriaSelectorsRaw, ok := serializedConfig["criteriaSelectors"].([]interface{}) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: criteriaSelectors not found"}}, IsError: true} + } + + // Parse each selector (they are JSON strings) + var selectors []map[string]interface{} + for _, selectorRaw := range criteriaSelectorsRaw { + selectorStr, ok := selectorRaw.(string) + if !ok { + continue + } + var selector map[string]interface{} + if err := json.Unmarshal([]byte(selectorStr), &selector); err != nil { + continue + } + + // Extract useful fields for agents + result := map[string]interface{}{ + "name": selector["name"], + "displayName": selector["displayName"], + "plugin": selector["plugin"], + } + + if pluginConfig, ok := selector["pluginConfig"].(string); ok { + result["pluginConfig"] = pluginConfig + } + + if display, ok := selector["display"].(map[string]interface{}); ok { + if category, ok := display["category"].(string); ok { + result["category"] = category + } + } + + selectors = append(selectors, result) + } + + outputBytes, _ := json.MarshalIndent(map[string]interface{}{"selectors": selectors}, "", " ") + output = string(outputBytes) + + case "data_query_hints": + studyId, ok := params.Arguments["studyId"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'studyId' required"}}, IsError: true} + } + cohortId, ok := params.Arguments["cohortId"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'cohortId' required"}}, IsError: true} + } + entityName, ok := params.Arguments["entityName"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'entityName' required"}}, IsError: true} + } + url 
:= fmt.Sprintf("%s/v2/studies/%s/cohorts/%s/entities/%s/hints", dataExplorerURL, studyId, cohortId, entityName) + respBody, apiErr := makeAPIRequest("POST", url, map[string]interface{}{}) + if apiErr != nil { + err = apiErr + } else { + output = string(respBody) + } + + case "data_sample_instances": + studyId, ok := params.Arguments["studyId"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'studyId' required"}}, IsError: true} + } + cohortId, ok := params.Arguments["cohortId"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'cohortId' required"}}, IsError: true} + } + entityName, ok := params.Arguments["entityName"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'entityName' required"}}, IsError: true} + } + body := map[string]interface{}{"limit": 50} + if attrs, ok := params.Arguments["includeAttributes"].([]interface{}); ok { + body["includeAttributes"] = attrs + } + if filter, ok := params.Arguments["filter"].(map[string]interface{}); ok { + body["filter"] = filter + } + if limit, ok := params.Arguments["limit"].(float64); ok { + body["limit"] = int(limit) + } + url := fmt.Sprintf("%s/v2/studies/%s/cohorts/%s/entities/%s/instances", dataExplorerURL, studyId, cohortId, entityName) + respBody, apiErr := makeAPIRequest("POST", url, body) + if apiErr != nil { + err = apiErr + } else { + output = string(respBody) + } + + case "study_list": + offset, limit := 0, 50 + if o, ok := params.Arguments["offset"].(float64); ok { + offset = int(o) + } + if l, ok := params.Arguments["limit"].(float64); ok { + limit = int(l) + } + url := fmt.Sprintf("%s/v2/studies?offset=%d&limit=%d", dataExplorerURL, offset, limit) + respBody, apiErr := makeAPIRequest("GET", url, nil) + if apiErr != nil { + err = apiErr + } else { + output = string(respBody) + } + + case "study_list_cohorts": + studyId, ok := params.Arguments["studyId"].(string) + if 
!ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'studyId' required"}}, IsError: true} + } + offset, limit := 0, 50 + if o, ok := params.Arguments["offset"].(float64); ok { + offset = int(o) + } + if l, ok := params.Arguments["limit"].(float64); ok { + limit = int(l) + } + url := fmt.Sprintf("%s/v2/studies/%s/cohorts?offset=%d&limit=%d", dataExplorerURL, studyId, offset, limit) + respBody, apiErr := makeAPIRequest("GET", url, nil) + if apiErr != nil { + err = apiErr + } else { + output = string(respBody) + } + + case "cohort_create_in_workspace": + workspaceId, ok := params.Arguments["workspaceId"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'workspaceId' required"}}, IsError: true} + } + _, ok = params.Arguments["underlayId"].(string) // underlayId kept for validation but not used + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'underlayId' required"}}, IsError: true} + } + underlayName, ok := params.Arguments["underlayName"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'underlayName' required"}}, IsError: true} + } + name, ok := params.Arguments["name"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'name' required"}}, IsError: true} + } + displayName := name + if dn, ok := params.Arguments["displayName"].(string); ok { + displayName = dn + } + description := "" + if desc, ok := params.Arguments["description"].(string); ok { + description = desc + } + + // Step 1: Create cohort in Data Explorer + createBody := map[string]interface{}{ + "studyCreateInfo": map[string]interface{}{ + "displayName": displayName + " Study", + }, + "cohortCreateInfo": map[string]interface{}{ + "underlayName": underlayName, + "displayName": displayName, + "description": description, + }, + } + createResp, apiErr := makeAPIRequest("POST", 
dataExplorerURL+"/v2/createCohortInStudy", createBody) + if apiErr != nil { + err = fmt.Errorf("Step 1 failed (create cohort): %w", apiErr) + break + } + + // Parse response to get studyId and cohortId + var createResult map[string]interface{} + if err := json.Unmarshal(createResp, &createResult); err != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: fmt.Sprintf("Error parsing create response: %v", err)}}, IsError: true} + } + study, _ := createResult["study"].(map[string]interface{}) + cohort, _ := createResult["cohort"].(map[string]interface{}) + studyId, _ := study["id"].(string) + cohortId, _ := cohort["id"].(string) + + // Step 2: Update criteria if provided + if criteriaJson, ok := params.Arguments["criteriaJson"].(string); ok && criteriaJson != "" { + var updateBody interface{} + if unmarshalErr := json.Unmarshal([]byte(criteriaJson), &updateBody); unmarshalErr != nil { + err = fmt.Errorf("Step 2 failed (parse criteria): %w", unmarshalErr) + break + } + _, apiErr = makeAPIRequest("PATCH", fmt.Sprintf("%s/v2/studies/%s/cohorts/%s", dataExplorerURL, studyId, cohortId), updateBody) + if apiErr != nil { + err = fmt.Errorf("Step 2 failed (update criteria): %w", apiErr) + break + } + } + + // Step 3: Save cohort to workspace + // Resolve user-facing ID to UUID + workspaceUuid, err := resolveWorkspaceId(workspaceId) + if err != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: fmt.Sprintf("Step 3 failed: %v", err)}}, IsError: true} + } + + saveBody := map[string]interface{}{ + "common": map[string]interface{}{ + "displayName": displayName, + "description": description, + "accessScope": "SHARED_ACCESS", + "managedBy": "USER", + "cloningInstructions": "COPY_RESOURCE", + }, + "dataExplorerCohort": map[string]interface{}{ + "studyId": studyId, + "cohortId": cohortId, + }, + } + if folderId, ok := params.Arguments["folderId"].(string); ok { + saveBody["common"].(map[string]interface{})["folderId"] = folderId + } + 
saveUrl := fmt.Sprintf("%s/api/workspaces/v1/%s/resources/controlled/data-explorer/cohort/save", workspaceBaseURL, workspaceUuid) + respBody, apiErr := makeAPIRequest("POST", saveUrl, saveBody) + if apiErr != nil { + err = fmt.Errorf("Step 3 failed (save to workspace): %w", apiErr) + } else { + // Parse workspace response and add studyId/cohortId at top level for easy extraction + var workspaceResp map[string]interface{} + if err := json.Unmarshal(respBody, &workspaceResp); err == nil { + workspaceResp["studyId"] = studyId + workspaceResp["cohortId"] = cohortId + if modifiedResp, err := json.Marshal(workspaceResp); err == nil { + output = string(modifiedResp) + } else { + output = string(respBody) + } + } else { + output = string(respBody) + } + } + + case "cohort_update_criteria": + studyId, ok := params.Arguments["studyId"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'studyId' required"}}, IsError: true} + } + cohortId, ok := params.Arguments["cohortId"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'cohortId' required"}}, IsError: true} + } + body := map[string]interface{}{} + if criteria, ok := params.Arguments["criteriaGroupSections"]; ok { + body["criteriaGroupSections"] = criteria + } + if displayName, ok := params.Arguments["displayName"].(string); ok { + body["displayName"] = displayName + } + if description, ok := params.Arguments["description"].(string); ok { + body["description"] = description + } + url := fmt.Sprintf("%s/v2/studies/%s/cohorts/%s", dataExplorerURL, studyId, cohortId) + respBody, apiErr := makeAPIRequest("PATCH", url, body) + if apiErr != nil { + err = apiErr + } else { + output = string(respBody) + } + + case "cohort_count_instances": + studyId, ok := params.Arguments["studyId"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'studyId' required"}}, IsError: true} + } + cohortId, ok := 
params.Arguments["cohortId"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'cohortId' required"}}, IsError: true} + } + body := map[string]interface{}{"groupByAttributes": []string{}} + if entity, ok := params.Arguments["entity"].(string); ok { + body["entity"] = entity + } + if attrs, ok := params.Arguments["groupByAttributes"].([]interface{}); ok { + body["groupByAttributes"] = attrs + } + url := fmt.Sprintf("%s/v2/studies/%s/cohorts/%s/counts", dataExplorerURL, studyId, cohortId) + respBody, apiErr := makeAPIRequest("POST", url, body) + if apiErr != nil { + err = apiErr + } else { + output = string(respBody) + } + + case "export_list_models": + underlayName, ok := params.Arguments["underlayName"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'underlayName' required"}}, IsError: true} + } + url := fmt.Sprintf("%s/v2/underlays/%s/exportModels", dataExplorerURL, underlayName) + respBody, apiErr := makeAPIRequest("GET", url, nil) + if apiErr != nil { + err = apiErr + } else { + output = string(respBody) + } + + case "export_describe": + studyId, ok := params.Arguments["studyId"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'studyId' required"}}, IsError: true} + } + cohortId, ok := params.Arguments["cohortId"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'cohortId' required"}}, IsError: true} + } + body := map[string]interface{}{} + if allCriteria, ok := params.Arguments["allCriteriaFromCohort"].(bool); ok { + body["allCriteriaFromCohort"] = allCriteria + } + url := fmt.Sprintf("%s/v2/studies/%s/cohorts/%s/describeExport", dataExplorerURL, studyId, cohortId) + respBody, apiErr := makeAPIRequest("POST", url, body) + if apiErr != nil { + err = apiErr + } else { + output = string(respBody) + } + + case "export_preview": + studyId, ok := 
params.Arguments["studyId"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'studyId' required"}}, IsError: true} + } + cohortId, ok := params.Arguments["cohortId"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'cohortId' required"}}, IsError: true} + } + body := map[string]interface{}{} + if exportModel, ok := params.Arguments["exportModel"].(string); ok { + body["exportModel"] = exportModel + } + if entityName, ok := params.Arguments["entityName"].(string); ok { + body["entityName"] = entityName + } + if limit, ok := params.Arguments["limit"].(float64); ok { + body["limit"] = int(limit) + } else { + body["limit"] = 20 + } + if inputs, ok := params.Arguments["inputs"].(map[string]interface{}); ok { + body["inputs"] = inputs + } + url := fmt.Sprintf("%s/v2/studies/%s/cohorts/%s/previewExport", dataExplorerURL, studyId, cohortId) + respBody, apiErr := makeAPIRequest("POST", url, body) + if apiErr != nil { + err = apiErr + } else { + output = string(respBody) + } + + case "export_cohort": + studyId, ok := params.Arguments["studyId"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'studyId' required"}}, IsError: true} + } + cohortId, ok := params.Arguments["cohortId"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'cohortId' required"}}, IsError: true} + } + exportRequests, ok := params.Arguments["exportRequests"].([]interface{}) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'exportRequests' required"}}, IsError: true} + } + body := map[string]interface{}{ + "exportRequests": exportRequests, + } + url := fmt.Sprintf("%s/v2/studies/%s/cohorts/%s/export", dataExplorerURL, studyId, cohortId) + respBody, apiErr := makeAPIRequest("POST", url, body) + if apiErr != nil { + err = apiErr + } else { + output = string(respBody) + } + + case 
"filter_build_attribute": + attribute, ok := params.Arguments["attribute"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'attribute' required"}}, IsError: true} + } + operator, ok := params.Arguments["operator"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'operator' required"}}, IsError: true} + } + dataType, ok := params.Arguments["dataType"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'dataType' required"}}, IsError: true} + } + filter := map[string]interface{}{ + "filterType": "ATTRIBUTE", + "filterUnion": map[string]interface{}{ + "attributeFilter": map[string]interface{}{ + "attribute": attribute, + "operator": operator, + }, + }, + } + if operator != "IS_NULL" && operator != "IS_NOT_NULL" { + values := []interface{}{} + if val, ok := params.Arguments["value"]; ok { + values = append(values, buildLiteral(dataType, val)) + } + if vals, ok := params.Arguments["values"].([]interface{}); ok { + for _, v := range vals { + values = append(values, buildLiteral(dataType, v)) + } + } + filter["filterUnion"].(map[string]interface{})["attributeFilter"].(map[string]interface{})["values"] = values + } + outputBytes, _ := json.MarshalIndent(filter, "", " ") + output = string(outputBytes) + + case "filter_build_relationship": + relatedEntity, ok := params.Arguments["relatedEntity"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'relatedEntity' required"}}, IsError: true} + } + filter := map[string]interface{}{ + "filterType": "RELATIONSHIP", + "filterUnion": map[string]interface{}{ + "relationshipFilter": map[string]interface{}{ + "entity": relatedEntity, + }, + }, + } + if subfilter, ok := params.Arguments["subfilter"].(map[string]interface{}); ok { + filter["filterUnion"].(map[string]interface{})["relationshipFilter"].(map[string]interface{})["subfilter"] = subfilter + } + 
outputBytes, _ := json.MarshalIndent(filter, "", " ") + output = string(outputBytes) + + case "filter_build_boolean_logic": + operator, ok := params.Arguments["operator"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'operator' required"}}, IsError: true} + } + subfilters, ok := params.Arguments["subfilters"].([]interface{}) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'subfilters' required"}}, IsError: true} + } + filter := map[string]interface{}{ + "filterType": "BOOLEAN_LOGIC", + "filterUnion": map[string]interface{}{ + "booleanLogicFilter": map[string]interface{}{ + "operator": operator, + "subfilters": subfilters, + }, + }, + } + outputBytes, _ := json.MarshalIndent(filter, "", " ") + output = string(outputBytes) + + case "filter_build_hierarchy": + hierarchy, ok := params.Arguments["hierarchy"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'hierarchy' required"}}, IsError: true} + } + operator, ok := params.Arguments["operator"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'operator' required"}}, IsError: true} + } + filter := map[string]interface{}{ + "filterType": "HIERARCHY", + "filterUnion": map[string]interface{}{ + "hierarchyFilter": map[string]interface{}{ + "hierarchy": hierarchy, + "operator": operator, + }, + }, + } + if values, ok := params.Arguments["values"].([]interface{}); ok { + filter["filterUnion"].(map[string]interface{})["hierarchyFilter"].(map[string]interface{})["values"] = values + } + outputBytes, _ := json.MarshalIndent(filter, "", " ") + output = string(outputBytes) + + case "workspace_create": + vals, reqErr := requireStrings(params.Arguments, "id", "podId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + id, podId := vals[0], vals[1] + args := 
[]string{"workspace", "create", "--id=" + id, "--pod=" + podId} + if name, ok := params.Arguments["name"].(string); ok { + args = append(args, "--name="+name) + } + if desc, ok := params.Arguments["description"].(string); ok { + args = append(args, "--description="+desc) + } + if orgId, ok := params.Arguments["organizationId"].(string); ok { + args = append(args, "--org="+orgId) + } + output, err = executeWbCommand(args) + + case "workspace_delete": + workspaceId, reqErr := requireString(params.Arguments, "workspaceId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"workspace", "delete", "--workspace=" + workspaceId}) + + case "workspace_update": + workspaceId, reqErr := requireString(params.Arguments, "workspaceId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + args := []string{"workspace", "update", "--workspace=" + workspaceId} + if name, ok := params.Arguments["name"].(string); ok { + args = append(args, "--name="+name) + } + if desc, ok := params.Arguments["description"].(string); ok { + args = append(args, "--description="+desc) + } + output, err = executeWbCommand(args) + + case "workspace_duplicate": + vals, reqErr := requireStrings(params.Arguments, "sourceWorkspaceId", "destWorkspaceId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + sourceId, destId := vals[0], vals[1] + args := []string{"workspace", "duplicate", "--source-workspace=" + sourceId, "--destination-workspace-id=" + destId} + if name, ok := params.Arguments["name"].(string); ok { + args = append(args, "--name="+name) + } + output, err = executeWbCommand(args) + + case "workspace_set_property": + vals, reqErr := requireStrings(params.Arguments, "workspaceId", "key", "value") + 
if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"workspace", "set-property", "--workspace=" + vals[0], "--key=" + vals[1], "--value=" + vals[2]}) + + case "workspace_delete_property": + vals, reqErr := requireStrings(params.Arguments, "workspaceId", "key") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"workspace", "delete-property", "--workspace=" + vals[0], "--key=" + vals[1]}) + + case "workspace_add_user": + vals, reqErr := requireStrings(params.Arguments, "workspaceId", "email", "role") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"workspace", "add-user", "--workspace=" + vals[0], "--email=" + vals[1], "--role=" + vals[2]}) + + case "workspace_remove_user": + vals, reqErr := requireStrings(params.Arguments, "workspaceId", "email") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"workspace", "remove-user", "--workspace=" + vals[0], "--email=" + vals[1]}) + + case "workspace_list_users": + workspaceId, reqErr := requireString(params.Arguments, "workspaceId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"workspace", "list-users", "--workspace=" + workspaceId}) + + case "resource_create_bucket": + vals, reqErr := requireStrings(params.Arguments, "resourceId", "bucketName") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + 
resourceId, bucketName := vals[0], vals[1] + args := []string{"resource", "create", "gcs-bucket", "--id=" + resourceId, "--bucket-name=" + bucketName} + if desc, ok := params.Arguments["description"].(string); ok { + args = append(args, "--description="+desc) + } + output, err = executeWbCommand(args) + + case "resource_create_bq_dataset": + vals, reqErr := requireStrings(params.Arguments, "resourceId", "datasetId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + resourceId, datasetId := vals[0], vals[1] + args := []string{"resource", "create", "bq-dataset", "--id=" + resourceId, "--dataset-id=" + datasetId} + if desc, ok := params.Arguments["description"].(string); ok { + args = append(args, "--description="+desc) + } + output, err = executeWbCommand(args) + + case "resource_delete": + resourceId, reqErr := requireString(params.Arguments, "resourceId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"resource", "delete", "--name=" + resourceId}) + + case "resource_update": + resourceId, reqErr := requireString(params.Arguments, "resourceId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + args := []string{"resource", "update", "--name=" + resourceId} + if name, ok := params.Arguments["name"].(string); ok { + args = append(args, "--new-name="+name) + } + if desc, ok := params.Arguments["description"].(string); ok { + args = append(args, "--description="+desc) + } + output, err = executeWbCommand(args) + + case "resource_add_reference": + vals, reqErr := requireStrings(params.Arguments, "resourceId", "resourceType", "path") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} 
+ } + resourceId, resourceType, path := vals[0], vals[1], vals[2] + args := []string{"resource", "add-ref", resourceType, "--name=" + resourceId, "--path=" + path} + if desc, ok := params.Arguments["description"].(string); ok { + args = append(args, "--description="+desc) + } + output, err = executeWbCommand(args) + + case "resource_check_access": + resourceId, reqErr := requireString(params.Arguments, "resourceId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"resource", "check-access", "--name=" + resourceId}) + + case "resource_move": + vals, reqErr := requireStrings(params.Arguments, "resourceId", "folderId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"resource", "move", "--name=" + vals[0], "--folder-id=" + vals[1]}) + + case "folder_create": + vals, reqErr := requireStrings(params.Arguments, "folderId", "displayName") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + folderId, displayName := vals[0], vals[1] + args := []string{"folder", "create", "--id=" + folderId, "--display-name=" + displayName} + if desc, ok := params.Arguments["description"].(string); ok { + args = append(args, "--description="+desc) + } + if parentId, ok := params.Arguments["parentId"].(string); ok { + args = append(args, "--parent-folder-id="+parentId) + } + output, err = executeWbCommand(args) + + case "folder_delete": + folderId, reqErr := requireString(params.Arguments, "folderId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"folder", "delete", "--id=" + folderId}) + + case "folder_update": + 
folderId, reqErr := requireString(params.Arguments, "folderId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + args := []string{"folder", "update", "--id=" + folderId} + if displayName, ok := params.Arguments["displayName"].(string); ok { + args = append(args, "--display-name="+displayName) + } + if desc, ok := params.Arguments["description"].(string); ok { + args = append(args, "--description="+desc) + } + output, err = executeWbCommand(args) + + case "folder_list_tree": + output, err = executeWbCommand([]string{"folder", "tree"}) + + case "workspace_list_data_collections": + var workspaceUuid string + var uuidErr error + workspaceUuid, uuidErr = getCurrentWorkspaceUUID() + if uuidErr != nil { + output = fmt.Sprintf("Could not determine active workspace: %v\n\nTo fix: run `wb workspace set --id=` in your terminal, then retry.", uuidErr) + break + } + + // List all resources (same API call as workspace_list_resources which works) + resourcesUrl := fmt.Sprintf("%s/api/workspaces/v1/%s/resources?offset=0&limit=1000", workspaceBaseURL, workspaceUuid) + resourcesResp, apiErr := makeAPIRequest("GET", resourcesUrl, nil) + if apiErr != nil { + err = fmt.Errorf("failed to list resources via API: %w", apiErr) + break + } + + // Parse resources list + var resourcesData map[string]interface{} + if jsonErr := json.Unmarshal(resourcesResp, &resourcesData); jsonErr != nil { + err = fmt.Errorf("failed to parse resources: %w", jsonErr) + break + } + resourcesList, ok := resourcesData["resources"].([]interface{}) + if !ok { + resourcesList = []interface{}{} + } + + // Build a UUID → display name map with a single batch API call. + // This avoids the N+1 sequential lookups (one per collection) that caused timeouts. 
+ collectionNames := make(map[string]string) // uuid → display name + batchBody := map[string]interface{}{ + "limit": 1000, + "offset": 0, + "properties": []map[string]string{ + {"key": "terra-type", "value": "data-collection"}, + }, + } + if batchResp, batchErr := makeAPIRequest("POST", workspaceBaseURL+"/api/workspaces/v2/filtered", batchBody); batchErr == nil { + var batchData map[string]interface{} + if json.Unmarshal(batchResp, &batchData) == nil { + if wsList, ok := batchData["workspaces"].([]interface{}); ok { + for _, w := range wsList { + ws, ok := w.(map[string]interface{}) + if !ok { + continue + } + uuid, _ := ws["id"].(string) + displayName, _ := ws["displayName"].(string) + if displayName == "" { + displayName, _ = ws["userFacingId"].(string) + } + if uuid != "" && displayName != "" { + collectionNames[uuid] = displayName + } + } + } + } + } + // Fall back gracefully: if the batch call fails, groups will be keyed by UUID + + // Group resources by data collection, using display name where available + dataCollections := make(map[string]map[string]interface{}) + localResources := []map[string]interface{}{} + + for _, r := range resourcesList { + resource, ok := r.(map[string]interface{}) + if !ok { + continue + } + metadata, _ := resource["metadata"].(map[string]interface{}) + if metadata == nil { + continue + } + + resourceInfo := map[string]interface{}{} + + // Extract name and type from metadata + if name, ok := metadata["name"].(string); ok { + resourceInfo["name"] = name + } + if resType, ok := metadata["resourceType"].(string); ok { + resourceInfo["type"] = resType + } + // GCS bucket path + if bucket, ok := metadata["bucketName"].(string); ok { + resourceInfo["path"] = "gs://" + bucket + } else if gcsBucket, ok := metadata["gcsBucketName"].(string); ok { + resourceInfo["path"] = "gs://" + gcsBucket + } + // BigQuery dataset path + if dataset, ok := metadata["datasetId"].(string); ok { + if project, ok := metadata["projectId"].(string); ok { + 
resourceInfo["path"] = project + ":" + dataset + } + } + + // Check resourceLineage array for source workspace ID + var sourceId string + if lineageArray, ok := metadata["resourceLineage"].([]interface{}); ok && len(lineageArray) > 0 { + if firstLineage, ok := lineageArray[0].(map[string]interface{}); ok { + if sid, ok := firstLineage["sourceWorkspaceId"].(string); ok && sid != "" { + sourceId = sid + } + } + } + + // Group by display name (falling back to UUID if name not resolved) + if sourceId != "" { + groupKey := collectionNames[sourceId] + if groupKey == "" { + groupKey = sourceId + } + if dataCollections[groupKey] == nil { + dataCollections[groupKey] = map[string]interface{}{ + "sourceWorkspaceId": sourceId, + "resources": []map[string]interface{}{}, + } + } + resList := dataCollections[groupKey]["resources"].([]map[string]interface{}) + dataCollections[groupKey]["resources"] = append(resList, resourceInfo) + } else { + localResources = append(localResources, resourceInfo) + } + } + + // Count resources in collections + resourcesInCollections := 0 + for _, dc := range dataCollections { + if res, ok := dc["resources"].([]map[string]interface{}); ok { + resourcesInCollections += len(res) + } + } + + // Build output + result := map[string]interface{}{ + "dataCollections": dataCollections, + "localResources": localResources, + "summary": map[string]interface{}{ + "totalDataCollections": len(dataCollections), + "totalResources": len(resourcesList), + "resourcesFromCollections": resourcesInCollections, + "resourcesCreatedLocally": len(localResources), + }, + } + + outputBytes, _ := json.MarshalIndent(result, "", " ") + output = string(outputBytes) + + case "group_create": + vals, reqErr := requireStrings(params.Arguments, "groupId", "name") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + groupId, name := vals[0], vals[1] + args := []string{"group", "create", "--id=" + 
groupId, "--name=" + name} + if desc, ok := params.Arguments["description"].(string); ok { + args = append(args, "--description="+desc) + } + output, err = executeWbCommand(args) + + case "group_delete": + groupId, reqErr := requireString(params.Arguments, "groupId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"group", "delete", "--id=" + groupId}) + + case "group_list": + output, err = executeWbCommand([]string{"group", "list"}) + + case "group_describe": + groupId, reqErr := requireString(params.Arguments, "groupId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"group", "describe", "--id=" + groupId}) + + case "group_add_user": + vals, reqErr := requireStrings(params.Arguments, "groupId", "email", "role") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"group", "member", "add", "--group-id=" + vals[0], "--email=" + vals[1], "--role=" + vals[2]}) + + case "group_remove_user": + vals, reqErr := requireStrings(params.Arguments, "groupId", "email") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"group", "member", "remove", "--group-id=" + vals[0], "--email=" + vals[1]}) + + case "app_create": + vals, reqErr := requireStrings(params.Arguments, "appId", "appConfig") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + appId, appConfig := vals[0], vals[1] + args := []string{"app", "create", "gcp", "--id=" + appId, "--config=" + appConfig} + if machineType, ok 
:= params.Arguments["machineType"].(string); ok { + args = append(args, "--machine-type="+machineType) + } + if description, ok := params.Arguments["description"].(string); ok { + args = append(args, "--description="+description) + } + if location, ok := params.Arguments["location"].(string); ok { + args = append(args, "--location="+location) + } + output, err = executeWbCommand(args) + + case "app_delete": + appId, reqErr := requireString(params.Arguments, "appId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"app", "delete", "--id=" + appId, "--quiet"}) + + case "app_list": + output, err = executeWbCommand([]string{"app", "list"}) + + case "app_start": + appId, reqErr := requireString(params.Arguments, "appId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"app", "start", "--id=" + appId}) + + case "app_stop": + appId, reqErr := requireString(params.Arguments, "appId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"app", "stop", "--id=" + appId}) + + case "app_get_url": + appId, reqErr := requireString(params.Arguments, "appId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"app", "launch", "--id=" + appId}) + + case "auth_status": + output, err = executeWbCommand([]string{"auth", "status"}) + + case "server_list": + output, err = executeWbCommand([]string{"server", "list"}) + + case "server_set": + serverName, reqErr := requireString(params.Arguments, "serverName") + if reqErr != nil { + return CallToolResult{Content: 
[]ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"server", "set", "--name=" + serverName}) + + case "server_status": + output, err = executeWbCommand([]string{"server", "status"}) + + case "server_list_regions": + cloudPlatform, reqErr := requireString(params.Arguments, "cloudPlatform") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"server", "list-regions", "--platform=" + cloudPlatform}) + + case "pod_list": + output, err = executeWbCommand([]string{"pod", "list"}) + + case "pod_describe": + podId, reqErr := requireString(params.Arguments, "podId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"pod", "describe", "--id=" + podId}) + + case "pod_role_list": + vals, reqErr := requireStrings(params.Arguments, "organizationId", "podId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"pod", "role", "list", "--organization=" + vals[0], "--pod=" + vals[1]}) + + case "pod_role_grant": + vals, reqErr := requireStrings(params.Arguments, "organizationId", "podId", "email", "role") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"pod", "role", "grant", "user", "--organization=" + vals[0], "--pod=" + vals[1], "--email=" + vals[2], "--role=" + vals[3]}) + + case "pod_role_revoke": + vals, reqErr := requireStrings(params.Arguments, "organizationId", "podId", "email", "role") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + 
reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"pod", "role", "revoke", "user", "--organization=" + vals[0], "--pod=" + vals[1], "--email=" + vals[2], "--role=" + vals[3]}) + + case "organization_list": + output, err = executeWbCommand([]string{"organization", "list"}) + + case "resource_credentials": + resourceId, reqErr := requireString(params.Arguments, "resourceId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + args := []string{"resource", "credentials", "--name=" + resourceId} + if duration, ok := params.Arguments["duration"].(float64); ok { + args = append(args, fmt.Sprintf("--duration=%d", int(duration))) + } + output, err = executeWbCommand(args) + + case "resource_open_console": + resourceId, reqErr := requireString(params.Arguments, "resourceId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"resource", "open-console", "--name=" + resourceId}) + + case "resource_list_tree": + output, err = executeWbCommand([]string{"resource", "list-tree"}) + + case "resource_mount": + output, err = executeWbCommand([]string{"resource", "mount"}) + + case "resource_unmount": + output, err = executeWbCommand([]string{"resource", "unmount"}) + + case "notebook_start": + notebookId, reqErr := requireString(params.Arguments, "notebookId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"notebook", "start", "--id=" + notebookId}) + + case "notebook_stop": + notebookId, reqErr := requireString(params.Arguments, "notebookId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = 
executeWbCommand([]string{"notebook", "stop", "--id=" + notebookId}) + + case "notebook_launch": + notebookId, reqErr := requireString(params.Arguments, "notebookId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"notebook", "launch", "--id=" + notebookId}) + + case "cluster_start": + clusterId, reqErr := requireString(params.Arguments, "clusterId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"cluster", "start", "--id=" + clusterId}) + + case "cluster_stop": + clusterId, reqErr := requireString(params.Arguments, "clusterId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"cluster", "stop", "--id=" + clusterId}) + + case "cluster_launch": + clusterId, reqErr := requireString(params.Arguments, "clusterId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"cluster", "launch", "--id=" + clusterId}) + + case "workflow_list": + workspaceId, reqErr := requireString(params.Arguments, "workspaceId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"workflow", "list", "--workspace=" + workspaceId}) + + case "workflow_create": + vals, reqErr := requireStrings(params.Arguments, "workspaceId", "workflowId", "bucketId", "path") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + workspaceId, workflowId, bucketId, path := vals[0], vals[1], 
vals[2], vals[3] + args := []string{"workflow", "create", "--workspace=" + workspaceId, "--workflow=" + workflowId, "--bucket-id=" + bucketId, "--path=" + path} + if displayName, ok := params.Arguments["displayName"].(string); ok { + args = append(args, "--display-name="+displayName) + } + if description, ok := params.Arguments["description"].(string); ok { + args = append(args, "--description="+description) + } + output, err = executeWbCommand(args) + + case "workflow_describe": + vals, reqErr := requireStrings(params.Arguments, "workspaceId", "workflowId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"workflow", "describe", "--workspace=" + vals[0], "--workflow=" + vals[1]}) + + case "workflow_job_list": + output, err = executeWbCommand([]string{"workflow", "job", "list"}) + + case "workflow_job_describe": + vals, reqErr := requireStrings(params.Arguments, "workspaceId", "jobId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"workflow", "job", "describe", "--workspace=" + vals[0], "--job-id=" + vals[1]}) + + case "workflow_job_run": + vals, reqErr := requireStrings(params.Arguments, "workspaceId", "workflowId", "outputBucketId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + workspaceId, workflowId, outputBucketId := vals[0], vals[1], vals[2] + args := []string{"workflow", "job", "run", "--workspace=" + workspaceId, "--workflow=" + workflowId, "--output-bucket-id=" + outputBucketId} + if jobId, ok := params.Arguments["jobId"].(string); ok { + args = append(args, "--job-id="+jobId) + } + if description, ok := params.Arguments["description"].(string); ok { + args = append(args, "--description="+description) + 
} + if outputPath, ok := params.Arguments["outputPath"].(string); ok { + args = append(args, "--output-path="+outputPath) + } + if inputs, ok := params.Arguments["inputs"].(map[string]interface{}); ok { + inputsJSON, _ := json.Marshal(inputs) + args = append(args, "--inputs="+string(inputsJSON)) + } + output, err = executeWbCommand(args) + + case "workflow_job_cancel": + vals, reqErr := requireStrings(params.Arguments, "workspaceId", "jobId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"workflow", "job", "cancel", "--workspace=" + vals[0], "--job-id=" + vals[1]}) + + case "cromwell_generate_config": + path, reqErr := requireString(params.Arguments, "path") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"cromwell", "generate-config", "--path=" + path}) + + case "workspace_configure_aws": + workspaceId, reqErr := requireString(params.Arguments, "workspaceId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"workspace", "configure-aws", "--workspace=" + workspaceId}) + + case "resolve": + resourceId, reqErr := requireString(params.Arguments, "resourceId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"resolve", "--name=" + resourceId}) + + case "version": + output, err = executeWbCommand([]string{"version"}) + + case "bq_execute": + command, reqErr := requireString(params.Arguments, "command") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err 
= executeWbCommand(append([]string{"bq"}, strings.Fields(command)...)) + + case "gcloud_execute": + command, reqErr := requireString(params.Arguments, "command") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand(append([]string{"gcloud"}, strings.Fields(command)...)) + + case "gsutil_execute": + command, reqErr := requireString(params.Arguments, "command") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand(append([]string{"gsutil"}, strings.Fields(command)...)) + + case "git_execute": + command, reqErr := requireString(params.Arguments, "command") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand(append([]string{"git"}, strings.Fields(command)...)) + + default: + return CallToolResult{Content: []ContentItem{{Type: "text", Text: fmt.Sprintf("Unknown tool: %s", params.Name)}}, IsError: true} + } + + if err != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: fmt.Sprintf("Error: %s", err.Error())}}, IsError: true} + } + return CallToolResult{Content: []ContentItem{{Type: "text", Text: output}}, IsError: false} +} + +func buildLiteral(dataType string, value interface{}) map[string]interface{} { + literal := map[string]interface{}{"dataType": dataType, "valueUnion": map[string]interface{}{}} + switch dataType { + case "BOOLEAN": + literal["valueUnion"].(map[string]interface{})["boolVal"] = value + case "INT64": + literal["valueUnion"].(map[string]interface{})["int64Val"] = fmt.Sprintf("%v", value) + case "STRING": + literal["valueUnion"].(map[string]interface{})["stringVal"] = fmt.Sprintf("%v", value) + case "DATE": + literal["valueUnion"].(map[string]interface{})["dateVal"] = 
fmt.Sprintf("%v", value) + case "TIMESTAMP": + literal["valueUnion"].(map[string]interface{})["timestampVal"] = fmt.Sprintf("%v", value) + case "DOUBLE": + literal["valueUnion"].(map[string]interface{})["doubleVal"] = value + } + return literal +} + +func handleRequest(req JSONRPCRequest) JSONRPCResponse { + switch req.Method { + case "initialize": + return JSONRPCResponse{ + JSONRPC: "2.0", + ID: req.ID, + Result: InitializeResult{ + ProtocolVersion: "2024-11-05", + Capabilities: map[string]interface{}{"tools": map[string]interface{}{}}, + ServerInfo: ServerInfo{Name: "wb-mcp-server", Version: "2.0.0"}, + }, + } + case "notifications/initialized": + // Client sends this notification after receiving initialize response + // No response needed for notifications + return JSONRPCResponse{} + case "tools/list": + return JSONRPCResponse{JSONRPC: "2.0", ID: req.ID, Result: ListToolsResult{Tools: wbTools}} + case "tools/call": + var params CallToolParams + if err := json.Unmarshal(req.Params, ¶ms); err != nil { + return JSONRPCResponse{JSONRPC: "2.0", ID: req.ID, Error: &RPCError{Code: -32602, Message: "Invalid params"}} + } + return JSONRPCResponse{JSONRPC: "2.0", ID: req.ID, Result: handleCallTool(params)} + default: + return JSONRPCResponse{JSONRPC: "2.0", ID: req.ID, Error: &RPCError{Code: -32601, Message: "Method not found"}} + } +} + +// HTTP handler for MCP requests +func handleHTTP(w http.ResponseWriter, r *http.Request) { + // Set CORS headers for local access + w.Header().Set("Access-Control-Allow-Origin", "http://127.0.0.1") + w.Header().Set("Access-Control-Allow-Methods", "POST, OPTIONS") + w.Header().Set("Access-Control-Allow-Headers", "Content-Type") + + // Handle preflight + if r.Method == http.MethodOptions { + w.WriteHeader(http.StatusOK) + return + } + + if r.Method != http.MethodPost { + http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) + return + } + + var req JSONRPCRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { 
+ http.Error(w, "Invalid JSON-RPC request", http.StatusBadRequest) + return + } + + response := handleRequest(req) + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(response) +} + +// Run server in HTTP mode +func runHTTPServer(port string) { + http.HandleFunc("/", handleHTTP) + + addr := "127.0.0.1:" + port + log.Printf("Starting HTTP MCP server on %s (port arg: %q)\n", addr, port) + log.Printf("Ready - %d tools available\n", len(wbTools)) + + log.Printf("About to call ListenAndServe with addr: %q\n", addr) + if err := http.ListenAndServe(addr, nil); err != nil { + log.Fatalf("HTTP server failed: %v", err) + } +} + +// Run server in stdio mode +func runStdioServer() { + log.Println("Starting stdio MCP server") + log.Printf("Ready - %d tools available\n", len(wbTools)) + + scanner := bufio.NewScanner(os.Stdin) + for scanner.Scan() { + line := scanner.Text() + if line == "" { + continue + } + + var req JSONRPCRequest + if err := json.Unmarshal([]byte(line), &req); err != nil { + continue + } + + response := handleRequest(req) + // Only send response if there's a result or error (skip empty responses for notifications) + if response.Result != nil || response.Error != nil { + responseBytes, _ := json.Marshal(response) + fmt.Println(string(responseBytes)) + } + } +} + +func main() { + var httpMode bool + var port string + + flag.BoolVar(&httpMode, "http", false, "Run in HTTP mode instead of stdio") + flag.StringVar(&port, "port", "9242", "Port for HTTP server") + flag.Parse() + + log.SetOutput(os.Stderr) + log.Println("Workbench MCP Server v2.0 starting...") + + if err := initializeConfig(); err != nil { + log.Fatalf("Error initializing: %v\n", err) + } + + if httpMode { + runHTTPServer(port) + } else { + runStdioServer() + } +} diff --git a/src/vscode-with-llm/.devcontainer.json b/src/vscode-with-llm/.devcontainer.json new file mode 100644 index 000000000..aac38e47f --- /dev/null +++ b/src/vscode-with-llm/.devcontainer.json @@ -0,0 
+1,71 @@ +{ + "name": "vscode with LLM tools", + "dockerComposeFile": "docker-compose.yaml", + "service": "app", + "shutdownAction": "none", + "workspaceFolder": "/workspace", + "postCreateCommand": + "./startupscript/post-startup.sh abc /config \"${templateOption:cloud}\" \"${templateOption:login}\"; ./sudo-passwordless.sh abc", + // re-mount bucket files on container start up, then generate LLM context + "postStartCommand": "./startupscript/remount-on-restart.sh abc /config \"${templateOption:cloud}\" \"${templateOption:login}\"; /opt/llm-context/run-context-generator.sh /config || true", + "features": { + "ghcr.io/devcontainers/features/java@sha256:9663ce0219ff85786e87901ce5f0a59f488edd5f99b46015192cda48468b233a": { + "version": "17" + }, + "ghcr.io/devcontainers/features/node@sha256:8c0de46939b61958041700ee89e3493f3b2e4131a06dc46b4d9423427d06e5f6": { + "version": "24.11.0" + }, + "ghcr.io/devcontainers/features/aws-cli@sha256:1f93c8315b7a6d76982ebb2269f8b0d50413fc0f965c032edf4aee0caceb73ef": {}, + "ghcr.io/dhoeric/features/google-cloud-cli@sha256:fa5d894718825c5ad8009ac8f2c9f0cea3d1661eb108a9d465cba9f3fc48965f": {}, + "ghcr.io/anthropics/devcontainer-features/claude-code@sha256:cfc2e7d3e9fd3b9b01f8d5cb158508a884c8c0ede2e23ed10f32dea5d4ffe69a": {}, + "./.devcontainer/features/gemini-cli": { "username": "abc" }, + "./.devcontainer/features/workbench-tools": { + "cloud": "${templateOption:cloud}", + "username": "abc", + "userHomeDir": "/config" + }, + "./.devcontainer/features/postgres-client": { + "version": "16" + }, + "./.devcontainer/features/wb-mcp-server": { + "username": "abc", + "userHomeDir": "/config" + }, + "./.devcontainer/features/llm-context": { + "username": "abc", + "userHomeDir": "/config" + } + }, + "remoteUser": "root", + "customizations": { + "workbench": { + "opens": { + "extensions": [ + // Source. 
+ ".c", + ".cjs", + ".cpp", + ".go", + ".java", + ".js", + ".mjs", + ".php", + ".scala", + ".sh", + ".ts", + // Documents + ".md", + ".html", + // Data + ".csv", + ".json", + ".jsonc", + ".tsv", + ".xml", + ".yml" + ], + "fileUrlSuffix": "?payload=[[\"openFile\",\"vscode-remote:///config/{path}\"]]" + } + } + } +} diff --git a/src/vscode-with-llm/Dockerfile b/src/vscode-with-llm/Dockerfile new file mode 100644 index 000000000..eed456d8a --- /dev/null +++ b/src/vscode-with-llm/Dockerfile @@ -0,0 +1,28 @@ +FROM lscr.io/linuxserver/code-server@sha256:7bd334657f13505abc1e20afeeee5670ad8f818e68853c810889184e597f3051 + +# Gemini: https://open-vsx.org/extension/Google/geminicodeassist +# Claude: https://open-vsx.org/extension/Anthropic/claude-code +RUN apt-get update \ + && apt-get install -y --no-install-recommends jq \ + && rm -rf /var/lib/apt/lists/* \ + && mkdir -p /config/extensions \ + && chown abc:abc /config /config/extensions + +USER abc +ENV HOME=/config + +RUN curl -fsSL 'https://open-vsx.org/api/Google/geminicodeassist/2.79.0' \ + | jq -r '.files.download' \ + | xargs curl -fL --compressed -o /tmp/geminicodeassist.vsix \ + && curl -fsSL 'https://open-vsx.org/api/Anthropic/claude-code/linux-x64/2.1.128' \ + | jq -r '.files.download' \ + | xargs curl -fL --compressed -o /tmp/claudecode.vsix \ + && /app/code-server/bin/code-server --extensions-dir /config/extensions --install-extension /tmp/geminicodeassist.vsix \ + && /app/code-server/bin/code-server --extensions-dir /config/extensions --install-extension /tmp/claudecode.vsix \ + && rm /tmp/geminicodeassist.vsix /tmp/claudecode.vsix \ + && mkdir -p /config/data/User \ + && echo '{"http.systemCertificatesNode":true}' > /config/data/User/settings.json + +USER root + +WORKDIR /config diff --git a/src/vscode-with-llm/README.md b/src/vscode-with-llm/README.md new file mode 100644 index 000000000..b8403896e --- /dev/null +++ b/src/vscode-with-llm/README.md @@ -0,0 +1,17 @@ + +# Vscode with LLM tools (vscode-with-llm) + +A Template to run
vscode with LLM tools on workbench + +## Options + +| Options Id | Description | Type | Default Value | +|-----|-----|-----|-----| +| cloud | VM cloud environment | string | gcp | +| login | Whether to log in to workbench CLI | string | false | + + + +--- + +_Note: This file was auto-generated from the [devcontainer-template.json](https://github.com/verily-src/workbench-app-devcontainers/blob/main/src/vscode-with-llm/devcontainer-template.json). Add additional notes to a `NOTES.md`._ diff --git a/src/vscode-with-llm/devcontainer-template.json b/src/vscode-with-llm/devcontainer-template.json new file mode 100644 index 000000000..233ce2c34 --- /dev/null +++ b/src/vscode-with-llm/devcontainer-template.json @@ -0,0 +1,23 @@ +{ + "id": "vscode-with-llm", + "version": "0.0.1", + "name": "Vscode with LLM tools", + "description": "A Template to run vscode with LLM tools on workbench", + "documentationURL": "https://github.com/verily-src/workbench-app-devcontainers/tree/master/src/vscode-with-llm", + "licenseURL": "https://github.com/verily-src/workbench-app-devcontainers/blob/master/LICENSE", + "options": { + "cloud": { + "type": "string", + "description": "VM cloud environment", + "proposals": ["gcp", "aws"], + "default": "gcp" + }, + "login": { + "type": "string", + "description": "Whether to log in to workbench CLI", + "proposals": ["true", "false"], + "default": "false" + } + }, + "platforms": ["Any"] +} diff --git a/src/vscode-with-llm/docker-compose.yaml b/src/vscode-with-llm/docker-compose.yaml new file mode 100644 index 000000000..01f6af644 --- /dev/null +++ b/src/vscode-with-llm/docker-compose.yaml @@ -0,0 +1,30 @@ +version: "2.4" +services: + app: + container_name: "application-server" + build: + context: .
#!/bin/bash

# Grants passwordless sudo to a user.
#
# Usage: sudo-passwordless.sh <username>
#
# The user name is taken from the first positional argument. The script must
# be run with root privileges: it edits /etc/sudoers and writes a drop-in file
# under /etc/sudoers.d. It is typically called from post-startup.sh.
#
# Exit status: 0 on success, 1 on bad usage, missing privileges, unknown user,
# or when the sudoers drop-in could not be validated/installed.

USER_NAME="${1}"

if [[ -z "${USER_NAME}" ]]; then
  echo "Usage: $0 <username>" >&2
  exit 1
fi

if [[ "${EUID}" -ne 0 ]]; then
  echo "$0 must be run as root." >&2
  exit 1
fi

sudoers_file="/etc/sudoers"
# sudo skips files in /etc/sudoers.d whose names contain a '.', so a drop-in
# named after a user such as "first.last" would be silently ignored. Replace
# dots in the file name only; the rule itself keeps the real user name.
sudoers_d_file="/etc/sudoers.d/${USER_NAME//./_}"

# Make sure user exists
if ! id "${USER_NAME}" &>/dev/null; then
  echo "User ${USER_NAME} does not exist." >&2
  exit 1
fi

# Check if there's an old rule in the main sudoers file that requires a password
if [[ -f "${sudoers_file}" ]] && grep -q "^${USER_NAME} ALL=(ALL:ALL) ALL" "${sudoers_file}"; then
  echo "Found password-requiring rule for ${USER_NAME} in /etc/sudoers. Commenting it out."

  # Comment out the old rule in /etc/sudoers
  sed -i "s/^${USER_NAME} ALL=(ALL:ALL) ALL/# ${USER_NAME} ALL=(ALL:ALL) ALL/" "${sudoers_file}"
fi

# Build the rule in a scratch file first: a syntactically broken file inside
# /etc/sudoers.d makes sudo refuse to run at all, so it is validated before it
# is moved into place.
tmp_rule="$(mktemp)" || exit 1
trap 'rm -f "${tmp_rule}"' EXIT

echo "${USER_NAME} ALL=(ALL) NOPASSWD:ALL" > "${tmp_rule}"

# visudo ships with sudo; skip the check only if it is genuinely unavailable.
if command -v visudo &>/dev/null && ! visudo -cf "${tmp_rule}" &>/dev/null; then
  echo "Generated sudoers rule for ${USER_NAME} is not valid; not installing it." >&2
  exit 1
fi

# install(1) copies the file and sets owner and mode in one step, so the
# drop-in never exists with looser permissions than 0440.
if ! install -o root -g root -m 440 "${tmp_rule}" "${sudoers_d_file}"; then
  echo "Failed to write ${sudoers_d_file}." >&2
  exit 1
fi

echo "User ${USER_NAME} has been given passwordless sudo access."
// Dev container definition for the "Workbench Jupyter with LLM tools" app:
// the `app` compose service plus Node, Claude Code, Gemini CLI, Workbench
// tools, the Postgres client, the Workbench MCP server and the LLM context
// generator. ${templateOption:...} placeholders are filled in from the options
// declared in devcontainer-template.json.
{
  "name": "Workbench Jupyter with LLM tools",
  "dockerComposeFile": ["docker-compose.yaml", "../jupyter-common/jupyter-common-compose.yaml"],
  "service": "app",
  "runServices": ["app"],
  "shutdownAction": "none",
  "workspaceFolder": "/workspace",
  "postCreateCommand": [
    "./startupscript/post-startup.sh",
    "jupyter",
    "/home/jupyter",
    "${templateOption:cloud}",
    "${templateOption:login}"
  ],
  // Re-mount bucket files on container start up, then generate LLM context.
  // Context generation is best-effort (`|| true`) and runs even when the
  // remount fails, but the command exits with the remount script's status so
  // that a failed remount is still reported instead of being masked.
  "postStartCommand": "./startupscript/remount-on-restart.sh jupyter /home/jupyter \"${templateOption:cloud}\" \"${templateOption:login}\"; rc=$?; /opt/llm-context/run-context-generator.sh /home/jupyter || true; exit $rc",
  "features": {
    // Third-party features are pinned by digest.
    "ghcr.io/devcontainers/features/node@sha256:8c0de46939b61958041700ee89e3493f3b2e4131a06dc46b4d9423427d06e5f6": {
      "version": "24.11.0"
    },
    "ghcr.io/anthropics/devcontainer-features/claude-code@sha256:cfc2e7d3e9fd3b9b01f8d5cb158508a884c8c0ede2e23ed10f32dea5d4ffe69a": {},
    "./.devcontainer/features/gemini-cli": { "username": "jupyter" },
    "./.devcontainer/features/workbench-tools": {
      "libEnv": "/opt/conda/envs/jupyter", // Use the jupyter conda environment
      "cloud": "${templateOption:cloud}",
      "username": "jupyter",
      "userHomeDir": "/home/jupyter"
    },
    "./.devcontainer/features/postgres-client": {
      "version": "16"
    },
    "./.devcontainer/features/wb-mcp-server": {
      "username": "jupyter",
      "userHomeDir": "/home/jupyter"
    },
    "./.devcontainer/features/llm-context": {
      "username": "jupyter",
      "userHomeDir": "/home/jupyter"
    }
  },
  // Lifecycle commands run as root; the features above are configured for the
  // jupyter user via their username/userHomeDir options.
  "remoteUser": "root",
  "customizations": {
    "workbench": {
      "opens": {
        "extensions": [
          // Source
          ".ipynb",
          ".R",
          ".py",
          // Documents
          ".md",
          ".html",
          ".latex",
          ".pdf",
          // Images
          ".bmp",
          ".gif",
          ".jpeg",
          ".jpg",
          ".png",
          ".svg",
          // Data
          ".csv",
          ".tsv",
          ".json",
          ".vl"
        ],
        "fileUrlSuffix": "/lab/tree/{path}",
        "folderUrlSuffix": "/lab/tree/{path}"
      }
    }
  }
}
+ +## Options + +| Options Id | Description | Type | Default Value | +|-----|-----|-----|-----| +| cloud | VM cloud environment | string | gcp | +| login | Whether to log in to workbench CLI | string | false | + + + +## Features + +This template includes the following integrated features: + +- **Workbench Tools** - Common bioinformatics and genomics tools +- **Gemini CLI** - Google Gemini AI assistant with MCP support +- **Claude CLI** - Anthropic Claude AI assistant (from ghcr.io/anthropics/devcontainer-features/claude-code:1.0) +- **WB MCP Server** - Workbench Model Context Protocol server for AI tool integration with workspace context +- **LLM Context Generator** - Devcontainer feature that auto-generates `~/CLAUDE.md` with workspace context for Claude Code + +All AI assistants are pre-configured to work with the Workbench MCP server for enhanced workspace awareness. + +## LLM Context + +On startup, the app automatically generates a `~/CLAUDE.md` file that provides Claude Code with: + +- Current workspace information (name, ID, cloud platform, your role) +- Resource paths and environment variables +- Data exploration commands and best practices +- Links to skill files for detailed guidance + +Claude Code automatically discovers `~/CLAUDE.md` on startup, giving it immediate context about your Workbench environment. + +### Refreshing Context + +If you add or remove resources, refresh the context: + +```bash +refresh-context +# or +generate-llm-context +``` + +--- + +_Note: This file was auto-generated from the [devcontainer-template.json](devcontainer-template.json). 
---
# Compose project for the "Workbench Jupyter with LLM tools" app.

# Shared Jupyter definitions. The jupyter-common-extension-builder service
# referenced under `additional_contexts` is expected to come from this file.
include:
  - '../jupyter-common/jupyter-common-compose.yaml'

services:
  app:
    container_name: 'application-server'
    build:
      context: '.'
      # Makes the builder service's image available to the Dockerfile as the
      # build context `jupyter-extension-builder` (the Dockerfile bind-mounts
      # /dist from it to install the Jupyter extensions).
      additional_contexts:
        jupyter-extension-builder: 'service:jupyter-common-extension-builder'
    user: 'jupyter'
    restart: always
    volumes:
      - '.:/workspace:cached'
    ports:
      - '8888:8888'
    networks:
      - 'app-network'
    # NOTE(review): SYS_ADMIN, /dev/fuse and the unconfined AppArmor profile
    # are presumably what the FUSE-based bucket mounts need — confirm before
    # tightening any of them.
    cap_add:
      - 'SYS_ADMIN'
    devices:
      - '/dev/fuse'
    security_opt:
      - 'apparmor:unconfined'

networks:
  # External network: it must already exist, Compose will not create it.
  app-network:
    external: true