From 4c8b6dacd0fe1cf1e32bc941ef1890555a03a8a7 Mon Sep 17 00:00:00 2001 From: Navid Zolghadr Date: Fri, 19 Dec 2025 11:23:08 -0500 Subject: [PATCH 01/86] Test MCP server --- features/src/wb-mcp-server/README.md | 170 ++++++++ .../wb-mcp-server/devcontainer-feature.json | 27 ++ features/src/wb-mcp-server/go.mod | 3 + features/src/wb-mcp-server/install.sh | 166 ++++++++ features/src/wb-mcp-server/main.go | 391 ++++++++++++++++++ 5 files changed, 757 insertions(+) create mode 100644 features/src/wb-mcp-server/README.md create mode 100644 features/src/wb-mcp-server/devcontainer-feature.json create mode 100644 features/src/wb-mcp-server/go.mod create mode 100755 features/src/wb-mcp-server/install.sh create mode 100644 features/src/wb-mcp-server/main.go diff --git a/features/src/wb-mcp-server/README.md b/features/src/wb-mcp-server/README.md new file mode 100644 index 000000000..db437a6ab --- /dev/null +++ b/features/src/wb-mcp-server/README.md @@ -0,0 +1,170 @@ +# Workbench MCP Server Feature + +This dev container feature installs a local MCP (Model Context Protocol) server that wraps the Workbench CLI (`wb`), enabling AI assistants like Claude and Gemini to interact with your Workbench environment. + +## What is MCP? + +The Model Context Protocol (MCP) is an open standard developed by Anthropic that allows AI assistants to securely connect to external tools and data sources. This feature creates a local MCP server that exposes Workbench CLI functionality to AI assistants. 
+ +## Features + +- **Zero Authentication**: The MCP server runs locally and doesn't require authentication (assumes `wb` is already authenticated) +- **Full wb CLI Access**: Exposes common Workbench operations as MCP tools +- **Easy Integration**: Works with Claude CLI, Gemini CLI, and any other MCP-compatible client +- **Standalone Binary**: Compiled Go binary with no runtime dependencies + +## Installation + +Add this feature to your `.devcontainer.json`: + +```json +{ + "features": { + "ghcr.io/verily-src/workbench-app-devcontainers/wb-mcp-server:1": { + "username": "vscode", + "userHomeDir": "/home/vscode" + } + } +} +``` + +Or use it locally: + +```json +{ + "features": { + "./features/src/wb-mcp-server": { + "username": "vscode", + "userHomeDir": "/home/vscode" + } + } +} +``` + +## Available Tools + +The MCP server exposes the following tools: + +1. **wb_status** - Get current workspace and server status +2. **wb_workspace_list** - List all Workbench workspaces +3. **wb_resource_list** - List resources in the current workspace +4. **wb_resource_describe** - Describe a specific resource +5. **wb_folder_tree** - Display folder structure as a tree +6. **wb_app_list** - List all applications in the workspace +7. **wb_app_describe** - Describe a specific application +8. **wb_execute** - Execute any custom wb command + +## Usage with Claude CLI + +1. After the feature is installed, the MCP server binary is available at `/opt/wb-mcp-server/wb-mcp-server` + +2. Configure Claude CLI by editing `~/.config/claude/config.json`: + +```json +{ + "mcpServers": { + "wb": { + "command": "/opt/wb-mcp-server/wb-mcp-server" + } + } +} +``` + +3. Start Claude CLI, and it will automatically connect to the MCP server: + +```bash +claude +``` + +4. Claude can now interact with your Workbench environment: + +``` +You: List all my workspaces +Claude: [calls wb_workspace_list tool] Here are your workspaces... + +You: What resources are in my current workspace? 
+Claude: [calls wb_resource_list tool] Here are the resources... +``` + +## Usage with Gemini CLI + +Configure the Gemini CLI similarly by adding the MCP server to its configuration file. + +## Usage with Other MCP Clients + +The server implements the standard MCP protocol over stdio. Any MCP-compatible client can connect by running: + +```bash +/opt/wb-mcp-server/wb-mcp-server +``` + +The server reads JSON-RPC requests from stdin and writes responses to stdout. + +## Manual Testing + +You can test the server manually: + +```bash +echo '{"jsonrpc":"2.0","id":1,"method":"initialize","params":{"protocolVersion":"2024-11-05","capabilities":{},"clientInfo":{"name":"test","version":"1.0"}}}' | /opt/wb-mcp-server/wb-mcp-server +``` + +## Environment Variables + +After installation, these environment variables are available: + +- `WB_MCP_SERVER_BIN`: Path to the MCP server binary +- `WB_MCP_CONFIG`: Path to the example MCP configuration file + +## Prerequisites + +- The `wb` CLI must be installed and authenticated in the container +- The user running the MCP server must have access to `wb` commands + +## Options + +- **username** (default: `root`): Container user that will run the MCP server +- **userHomeDir** (default: `/root`): Home directory of the container user +- **port** (default: `3000`): Reserved for future use (currently stdio-based) + +## Security Notes + +- This MCP server is designed for **local use only** +- It does not implement authentication (relies on local `wb` authentication) +- Do not expose this server to untrusted networks +- The server runs with the same permissions as the user who starts it + +## Troubleshooting + +### Server not found + +Make sure the feature is installed and the binary exists: + +```bash +ls -l /opt/wb-mcp-server/wb-mcp-server +``` + +### wb command not found + +Ensure the Workbench CLI is installed and authenticated: + +```bash +wb status +``` + +### Permission denied + +Check that the binary is executable: + +```bash +chmod +x 
/opt/wb-mcp-server/wb-mcp-server +``` + +## Development + +The MCP server is written in Go and built during feature installation. Source code: + +- `main.go`: Server implementation +- `go.mod`: Go module definition +- `install.sh`: Installation script + +To modify the server, edit `main.go` and rebuild your dev container. diff --git a/features/src/wb-mcp-server/devcontainer-feature.json b/features/src/wb-mcp-server/devcontainer-feature.json new file mode 100644 index 000000000..ea17f8721 --- /dev/null +++ b/features/src/wb-mcp-server/devcontainer-feature.json @@ -0,0 +1,27 @@ +{ + "id": "wb-mcp-server", + "version": "1.0.0", + "name": "Workbench MCP Server", + "description": "Installs a local MCP (Model Context Protocol) server that wraps the wb CLI, enabling AI assistants to interact with Workbench. The server runs locally without authentication and can be used with Claude CLI, Gemini CLI, or other MCP clients.", + "options": { + "username": { + "type": "string", + "default": "root", + "description": "Username of the container user." + }, + "userHomeDir": { + "type": "string", + "default": "/root", + "description": "Home directory of the container user." + }, + "port": { + "type": "string", + "default": "3000", + "description": "Port for the MCP server to listen on." + } + }, + "installsAfter": [ + "ghcr.io/devcontainers/features/common-utils", + "ghcr.io/devcontainers/features/go" + ] +} diff --git a/features/src/wb-mcp-server/go.mod b/features/src/wb-mcp-server/go.mod new file mode 100644 index 000000000..62d3c9cbb --- /dev/null +++ b/features/src/wb-mcp-server/go.mod @@ -0,0 +1,3 @@ +module github.com/verily-src/wb-mcp-server + +go 1.21 diff --git a/features/src/wb-mcp-server/install.sh b/features/src/wb-mcp-server/install.sh new file mode 100755 index 000000000..46a059e2f --- /dev/null +++ b/features/src/wb-mcp-server/install.sh @@ -0,0 +1,166 @@ +#!/usr/bin/env bash + +# install.sh installs the Workbench MCP server in the devcontainer. 
+ +set -o errexit +set -o nounset +set -o pipefail +set -o xtrace + +readonly USERNAME="${USERNAME:-"root"}" +USER_HOME_DIR="${USERHOMEDIR:-"/home/${USERNAME}"}" +if [[ "${USER_HOME_DIR}" == "/home/root" ]]; then + USER_HOME_DIR="/root" +fi +readonly USER_HOME_DIR + +readonly PORT="${PORT:-"3000"}" + +export DEBIAN_FRONTEND=noninteractive +export TZ=Etc/UTC + +WORKDIR="$(mktemp -d)" +readonly WORKDIR + +readonly WB_MCP_DIR="/opt/wb-mcp-server" +readonly WB_MCP_BIN="${WB_MCP_DIR}/wb-mcp-server" + +function cleanup() { + rm -rf "${WORKDIR:?}" + rm -rf /var/lib/apt/lists/* +} + +trap 'cleanup' EXIT + +function apt_get_update() { + if [ "$(find /var/lib/apt/lists/* | wc -l)" = "0" ]; then + echo "Running apt-get update..." + apt-get update -y + fi +} + +# Checks if packages are installed and installs them if not +function check_packages() { + if ! dpkg -s "$@" > /dev/null 2>&1; then + apt_get_update + apt-get -y install --no-install-recommends "$@" + fi +} + +echo "Starting wb-mcp-server installation..." + +# Save the directory where the feature files are located +FEATURE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +readonly FEATURE_DIR + +if ! type apt-get &>/dev/null; then + echo "Error: unable to find a supported package manager." + exit 1 +fi + +# Install required packages +check_packages \ + ca-certificates \ + curl \ + git + +# Check if Go is installed +if ! command -v go &> /dev/null; then + echo "Go is not installed. Installing Go 1.21..." 
+ GOLANG_VERSION="1.21.6" + GOLANG_ARCH="amd64" + + cd "${WORKDIR}" + curl -fsSL "https://go.dev/dl/go${GOLANG_VERSION}.linux-${GOLANG_ARCH}.tar.gz" -o go.tar.gz + tar -C /usr/local -xzf go.tar.gz + export PATH="/usr/local/go/bin:${PATH}" +fi + +# Create installation directory +mkdir -p "${WB_MCP_DIR}" + +# Copy source files to temporary build directory +BUILD_DIR="${WORKDIR}/wb-mcp-server" +mkdir -p "${BUILD_DIR}" +cp "${FEATURE_DIR}/main.go" "${BUILD_DIR}/" +cp "${FEATURE_DIR}/go.mod" "${BUILD_DIR}/" + +# Build the Go binary +cd "${BUILD_DIR}" +go build -o "${WB_MCP_BIN}" main.go + +# Make it executable +chmod +x "${WB_MCP_BIN}" + +# Create systemd service file for optional automatic startup +cat > "${WB_MCP_DIR}/wb-mcp-server.service" < "${WB_MCP_DIR}/start-server.sh" <<'EOF' +#!/bin/bash +# Helper script to start the wb-mcp-server +exec /opt/wb-mcp-server/wb-mcp-server +EOF + +chmod +x "${WB_MCP_DIR}/start-server.sh" + +# Create MCP configuration file for easy client setup +cat > "${WB_MCP_DIR}/mcp-config.json" <> "${USER_HOME_DIR}/.bashrc" + +# Make sure the login user is the owner of their .bashrc +chown "${USERNAME}:" "${USER_HOME_DIR}/.bashrc" + +echo "" +echo "==========================================" +echo "wb-mcp-server installation complete!" +echo "==========================================" +echo "" +echo "The MCP server binary is installed at: ${WB_MCP_BIN}" +echo "Configuration file: ${WB_MCP_DIR}/mcp-config.json" +echo "" +echo "To use with Claude CLI, add this to your Claude config:" +echo " \"wb\": {" +echo " \"command\": \"${WB_MCP_BIN}\"" +echo " }" +echo "" +echo "To start the server manually: ${WB_MCP_DIR}/start-server.sh" +echo "==========================================" +echo "" + +echo "Done!" 
diff --git a/features/src/wb-mcp-server/main.go b/features/src/wb-mcp-server/main.go new file mode 100644 index 000000000..8080bee4b --- /dev/null +++ b/features/src/wb-mcp-server/main.go @@ -0,0 +1,391 @@ +package main + +import ( + "bufio" + "encoding/json" + "fmt" + "os" + "os/exec" + "strings" +) + +// MCP Protocol structures +type JSONRPCRequest struct { + JSONRPC string `json:"jsonrpc"` + ID interface{} `json:"id,omitempty"` + Method string `json:"method"` + Params json.RawMessage `json:"params,omitempty"` +} + +type JSONRPCResponse struct { + JSONRPC string `json:"jsonrpc"` + ID interface{} `json:"id,omitempty"` + Result interface{} `json:"result,omitempty"` + Error *RPCError `json:"error,omitempty"` +} + +type RPCError struct { + Code int `json:"code"` + Message string `json:"message"` +} + +type InitializeParams struct { + ProtocolVersion string `json:"protocolVersion"` + Capabilities map[string]interface{} `json:"capabilities"` + ClientInfo ClientInfo `json:"clientInfo"` +} + +type ClientInfo struct { + Name string `json:"name"` + Version string `json:"version"` +} + +type ServerInfo struct { + Name string `json:"name"` + Version string `json:"version"` +} + +type InitializeResult struct { + ProtocolVersion string `json:"protocolVersion"` + Capabilities map[string]interface{} `json:"capabilities"` + ServerInfo ServerInfo `json:"serverInfo"` +} + +type ListToolsResult struct { + Tools []Tool `json:"tools"` +} + +type Tool struct { + Name string `json:"name"` + Description string `json:"description"` + InputSchema InputSchema `json:"inputSchema"` +} + +type InputSchema struct { + Type string `json:"type"` + Properties map[string]interface{} `json:"properties"` + Required []string `json:"required,omitempty"` +} + +type CallToolParams struct { + Name string `json:"name"` + Arguments map[string]interface{} `json:"arguments,omitempty"` +} + +type CallToolResult struct { + Content []ContentItem `json:"content"` + IsError bool `json:"isError,omitempty"` +} + 
+type ContentItem struct { + Type string `json:"type"` + Text string `json:"text"` +} + +var wbTools = []Tool{ + { + Name: "wb_status", + Description: "Get the current workspace and server status", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{}, + }, + }, + { + Name: "wb_workspace_list", + Description: "List all Workbench workspaces", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "format": map[string]interface{}{ + "type": "string", + "description": "Output format (json, text)", + "enum": []string{"json", "text"}, + }, + }, + }, + }, + { + Name: "wb_resource_list", + Description: "List resources in the current workspace", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "type": map[string]interface{}{ + "type": "string", + "description": "Resource type to filter by", + }, + "format": map[string]interface{}{ + "type": "string", + "description": "Output format (json, text)", + "enum": []string{"json", "text"}, + }, + }, + }, + }, + { + Name: "wb_resource_describe", + Description: "Describe a specific resource in the workspace", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "name": map[string]interface{}{ + "type": "string", + "description": "Name or ID of the resource", + }, + "format": map[string]interface{}{ + "type": "string", + "description": "Output format (json, text)", + "enum": []string{"json", "text"}, + }, + }, + Required: []string{"name"}, + }, + }, + { + Name: "wb_folder_tree", + Description: "Display folder structure as a tree", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{}, + }, + }, + { + Name: "wb_app_list", + Description: "List all applications in the workspace", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "format": map[string]interface{}{ + "type": "string", + "description": "Output format (json, text)", + "enum": 
[]string{"json", "text"}, + }, + }, + }, + }, + { + Name: "wb_app_describe", + Description: "Describe a specific application", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "name": map[string]interface{}{ + "type": "string", + "description": "Name or ID of the application", + }, + }, + Required: []string{"name"}, + }, + }, + { + Name: "wb_execute", + Description: "Execute a custom wb command. Use this for commands not covered by other tools. Provide the full command without 'wb' prefix (e.g., 'workspace describe --id=123' not 'wb workspace describe --id=123')", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "command": map[string]interface{}{ + "type": "string", + "description": "The wb command to execute (without 'wb' prefix)", + }, + }, + Required: []string{"command"}, + }, + }, +} + +func executeWbCommand(args []string) (string, error) { + cmd := exec.Command("wb", args...) + output, err := cmd.CombinedOutput() + return string(output), err +} + +func handleCallTool(params CallToolParams) CallToolResult { + var args []string + var output string + var err error + + switch params.Name { + case "wb_status": + args = []string{"status"} + output, err = executeWbCommand(args) + + case "wb_workspace_list": + args = []string{"workspace", "list"} + if format, ok := params.Arguments["format"].(string); ok && format == "json" { + args = append(args, "--format=json") + } + output, err = executeWbCommand(args) + + case "wb_resource_list": + args = []string{"resource", "list"} + if resourceType, ok := params.Arguments["type"].(string); ok && resourceType != "" { + args = append(args, "--type="+resourceType) + } + if format, ok := params.Arguments["format"].(string); ok && format == "json" { + args = append(args, "--format=json") + } + output, err = executeWbCommand(args) + + case "wb_resource_describe": + name, ok := params.Arguments["name"].(string) + if !ok { + return CallToolResult{ + Content: 
[]ContentItem{{Type: "text", Text: "Error: 'name' parameter is required"}}, + IsError: true, + } + } + args = []string{"resource", "describe", "--name=" + name} + if format, ok := params.Arguments["format"].(string); ok && format == "json" { + args = append(args, "--format=json") + } + output, err = executeWbCommand(args) + + case "wb_folder_tree": + args = []string{"folder", "tree"} + output, err = executeWbCommand(args) + + case "wb_app_list": + args = []string{"app", "list"} + if format, ok := params.Arguments["format"].(string); ok && format == "json" { + args = append(args, "--format=json") + } + output, err = executeWbCommand(args) + + case "wb_app_describe": + name, ok := params.Arguments["name"].(string) + if !ok { + return CallToolResult{ + Content: []ContentItem{{Type: "text", Text: "Error: 'name' parameter is required"}}, + IsError: true, + } + } + args = []string{"app", "describe", "--name=" + name} + output, err = executeWbCommand(args) + + case "wb_execute": + command, ok := params.Arguments["command"].(string) + if !ok { + return CallToolResult{ + Content: []ContentItem{{Type: "text", Text: "Error: 'command' parameter is required"}}, + IsError: true, + } + } + args = strings.Fields(command) + output, err = executeWbCommand(args) + + default: + return CallToolResult{ + Content: []ContentItem{{Type: "text", Text: fmt.Sprintf("Unknown tool: %s", params.Name)}}, + IsError: true, + } + } + + if err != nil { + return CallToolResult{ + Content: []ContentItem{{Type: "text", Text: fmt.Sprintf("Command failed: %s\nOutput: %s", err.Error(), output)}}, + IsError: true, + } + } + + return CallToolResult{ + Content: []ContentItem{{Type: "text", Text: output}}, + IsError: false, + } +} + +func handleRequest(req JSONRPCRequest) JSONRPCResponse { + switch req.Method { + case "initialize": + var params InitializeParams + if req.Params != nil { + json.Unmarshal(req.Params, &params) + } + + return JSONRPCResponse{ + JSONRPC: "2.0", + ID: req.ID, + Result: InitializeResult{ 
+ ProtocolVersion: "2024-11-05", + Capabilities: map[string]interface{}{ + "tools": map[string]interface{}{}, + }, + ServerInfo: ServerInfo{ + Name: "wb-mcp-server", + Version: "1.0.0", + }, + }, + } + + case "tools/list": + return JSONRPCResponse{ + JSONRPC: "2.0", + ID: req.ID, + Result: ListToolsResult{ + Tools: wbTools, + }, + } + + case "tools/call": + var params CallToolParams + if err := json.Unmarshal(req.Params, &params); err != nil { + return JSONRPCResponse{ + JSONRPC: "2.0", + ID: req.ID, + Error: &RPCError{ + Code: -32602, + Message: "Invalid params: " + err.Error(), + }, + } + } + + result := handleCallTool(params) + return JSONRPCResponse{ + JSONRPC: "2.0", + ID: req.ID, + Result: result, + } + + default: + return JSONRPCResponse{ + JSONRPC: "2.0", + ID: req.ID, + Error: &RPCError{ + Code: -32601, + Message: "Method not found: " + req.Method, + }, + } + } +} + +func main() { + fmt.Fprintln(os.Stderr, "Workbench MCP Server v1.0.0 starting...") + fmt.Fprintln(os.Stderr, "Reading from stdin, writing to stdout") + + scanner := bufio.NewScanner(os.Stdin) + for scanner.Scan() { + line := scanner.Text() + if line == "" { + continue + } + + var req JSONRPCRequest + if err := json.Unmarshal([]byte(line), &req); err != nil { + fmt.Fprintln(os.Stderr, "Error parsing request:", err) + continue + } + + response := handleRequest(req) + responseBytes, err := json.Marshal(response) + if err != nil { + fmt.Fprintln(os.Stderr, "Error marshaling response:", err) + continue + } + + fmt.Println(string(responseBytes)) + } + + if err := scanner.Err(); err != nil { + fmt.Fprintln(os.Stderr, "Error reading input:", err) + } +} From 57751765d2502cd8ff50e236191f461e7970e9ad Mon Sep 17 00:00:00 2001 From: Navid Zolghadr Date: Tue, 6 Jan 2026 17:44:18 -0500 Subject: [PATCH 02/86] Update wb mcp docs --- features/src/wb-mcp-server/README.md | 51 +++++++++++++++++++++++++--- 1 file changed, 46 insertions(+), 5 deletions(-) diff --git a/features/src/wb-mcp-server/README.md 
b/features/src/wb-mcp-server/README.md index db437a6ab..23cc9a769 100644 --- a/features/src/wb-mcp-server/README.md +++ b/features/src/wb-mcp-server/README.md @@ -58,25 +58,32 @@ The MCP server exposes the following tools: 1. After the feature is installed, the MCP server binary is available at `/opt/wb-mcp-server/wb-mcp-server` -2. Configure Claude CLI by editing `~/.config/claude/config.json`: +2. Add the MCP server using the Claude CLI command: + +```bash +claude mcp add --transport stdio wb -- /opt/wb-mcp-server/wb-mcp-server +``` + +Alternatively, manually configure by editing `~/.claude.json` or `.mcp.json`: ```json { "mcpServers": { "wb": { + "type": "stdio", "command": "/opt/wb-mcp-server/wb-mcp-server" } } } ``` -3. Start Claude CLI, and it will automatically connect to the MCP server: +3. Verify the server is configured: ```bash -claude +claude mcp list ``` -4. Claude can now interact with your Workbench environment: +4. Start Claude, and it will automatically connect to the MCP server. Claude can now interact with your Workbench environment: ``` You: List all my workspaces @@ -88,7 +95,41 @@ Claude: [calls wb_resource_list tool] Here are the resources... ## Usage with Gemini CLI -Configure the Gemini CLI similarly by adding the MCP server to its configuration file. +1. After the feature is installed, the MCP server binary is available at `/opt/wb-mcp-server/wb-mcp-server` + +2. Add the MCP server using the Gemini CLI command: + +```bash +gemini mcp add --scope user wb /opt/wb-mcp-server/wb-mcp-server +``` + +Alternatively, manually configure by editing `~/.gemini/settings.json` or `.gemini/settings.json`: + +```json +{ + "mcpServers": { + "wb": { + "command": "/opt/wb-mcp-server/wb-mcp-server" + } + } +} +``` + +3. Verify the server is configured: + +```bash +gemini mcp list +``` + +4. Start Gemini, and it will automatically connect to the MCP server. 
Gemini can now interact with your Workbench environment: + +``` +You: List all my workspaces +Gemini: [calls wb_workspace_list tool] Here are your workspaces... + +You: What resources are in my current workspace? +Gemini: [calls wb_resource_list tool] Here are the resources... +``` ## Usage with Other MCP Clients From ee6600222c716881f6d6bf41ebe2158003dad23f Mon Sep 17 00:00:00 2001 From: Navid Zolghadr Date: Mon, 12 Jan 2026 18:05:02 -0500 Subject: [PATCH 03/86] Add more MCP tools --- features/src/wb-mcp-server/README.md | 218 +- features/src/wb-mcp-server/main.go | 2739 ++++++++++++++++++++++++-- 2 files changed, 2608 insertions(+), 349 deletions(-) diff --git a/features/src/wb-mcp-server/README.md b/features/src/wb-mcp-server/README.md index 23cc9a769..410226b04 100644 --- a/features/src/wb-mcp-server/README.md +++ b/features/src/wb-mcp-server/README.md @@ -1,211 +1,131 @@ -# Workbench MCP Server Feature +# Workbench MCP Server -This dev container feature installs a local MCP (Model Context Protocol) server that wraps the Workbench CLI (`wb`), enabling AI assistants like Claude and Gemini to interact with your Workbench environment. - -## What is MCP? - -The Model Context Protocol (MCP) is an open standard developed by Anthropic that allows AI assistants to securely connect to external tools and data sources. This feature creates a local MCP server that exposes Workbench CLI functionality to AI assistants. - -## Features - -- **Zero Authentication**: The MCP server runs locally and doesn't require authentication (assumes `wb` is already authenticated) -- **Full wb CLI Access**: Exposes common Workbench operations as MCP tools -- **Easy Integration**: Works with Claude CLI, Gemini CLI, and any other MCP-compatible client -- **Standalone Binary**: Compiled Go binary with no runtime dependencies +MCP server that exposes Workbench APIs for AI agents to discover data, explore schemas, and build cohorts programmatically. 
## Installation -Add this feature to your `.devcontainer.json`: +Add to your `devcontainer.json`: ```json { "features": { - "ghcr.io/verily-src/workbench-app-devcontainers/wb-mcp-server:1": { - "username": "vscode", - "userHomeDir": "/home/vscode" - } + "ghcr.io/verily-src/workbench-app-devcontainers/wb-mcp-server:latest": {} } } ``` -Or use it locally: - -```json -{ - "features": { - "./features/src/wb-mcp-server": { - "username": "vscode", - "userHomeDir": "/home/vscode" - } - } -} -``` - -## Available Tools - -The MCP server exposes the following tools: - -1. **wb_status** - Get current workspace and server status -2. **wb_workspace_list** - List all Workbench workspaces -3. **wb_resource_list** - List resources in the current workspace -4. **wb_resource_describe** - Describe a specific resource -5. **wb_folder_tree** - Display folder structure as a tree -6. **wb_app_list** - List all applications in the workspace -7. **wb_app_describe** - Describe a specific application -8. **wb_execute** - Execute any custom wb command - -## Usage with Claude CLI +Rebuild your devcontainer. The server installs at `/opt/wb-mcp-server/wb-mcp-server`. -1. After the feature is installed, the MCP server binary is available at `/opt/wb-mcp-server/wb-mcp-server` +## Setup -2. Add the MCP server using the Claude CLI command: +### With Claude CLI ```bash claude mcp add --transport stdio wb -- /opt/wb-mcp-server/wb-mcp-server ``` -Alternatively, manually configure by editing `~/.claude.json` or `.mcp.json`: - -```json -{ - "mcpServers": { - "wb": { - "type": "stdio", - "command": "/opt/wb-mcp-server/wb-mcp-server" - } - } -} -``` - -3. Verify the server is configured: +### With Gemini CLI ```bash -claude mcp list +gemini mcp add --scope user wb /opt/wb-mcp-server/wb-mcp-server ``` -4. Start Claude, and it will automatically connect to the MCP server. 
Claude can now interact with your Workbench environment: +## Quick Examples -``` -You: List all my workspaces -Claude: [calls wb_workspace_list tool] Here are your workspaces... +### Find Available Data -You: What resources are in my current workspace? -Claude: [calls wb_resource_list tool] Here are the resources... +``` +"List all data collections I can access" ``` -## Usage with Gemini CLI - -1. After the feature is installed, the MCP server binary is available at `/opt/wb-mcp-server/wb-mcp-server` +Uses `workspace_list_data_collections` to find data collection workspaces. -2. Add the MCP server using the Gemini CLI command: +### Explore Schema -```bash -gemini mcp add --scope user wb /opt/wb-mcp-server/wb-mcp-server ``` - -Alternatively, manually configure by editing `~/.gemini/settings.json` or `.gemini/settings.json`: - -```json -{ - "mcpServers": { - "wb": { - "command": "/opt/wb-mcp-server/wb-mcp-server" - } - } -} +"What entities are in the AoU_2024 underlay? Show me the person entity attributes" ``` -3. Verify the server is configured: +Uses `underlay_list_entities` and `underlay_get_entity`. -```bash -gemini mcp list -``` - -4. Start Gemini, and it will automatically connect to the MCP server. Gemini can now interact with your Workbench environment: +### Create Simple Cohort ``` -You: List all my workspaces -Gemini: [calls wb_workspace_list tool] Here are your workspaces... - -You: What resources are in my current workspace? -Gemini: [calls wb_resource_list tool] Here are the resources... +"Create a cohort called 'seniors' with patients over 65 from the AoU_2024 data collection (workspace ID: abc-123) in my workspace (xyz-456)" ``` -## Usage with Other MCP Clients +Uses `filter_build_attribute` and `cohort_create_in_workspace`. -The server implements the standard MCP protocol over stdio. 
Any MCP-compatible client can connect by running: +### Create Complex Cohort -```bash -/opt/wb-mcp-server/wb-mcp-server ``` - -The server reads JSON-RPC requests from stdin and writes responses to stdout. - -## Manual Testing - -You can test the server manually: - -```bash -echo '{"jsonrpc":"2.0","id":1,"method":"initialize","params":{"protocolVersion":"2024-11-05","capabilities":{},"clientInfo":{"name":"test","version":"1.0"}}}' | /opt/wb-mcp-server/wb-mcp-server +"Create a cohort of diabetic seniors: patients over 65 with Type 2 Diabetes (concept 201826) from AoU_2024. Data collection: abc-123, target workspace: xyz-456, name: 'diabetic-seniors'" ``` -## Environment Variables +Uses `filter_build_attribute`, `filter_build_relationship`, `filter_build_boolean_logic`, and `cohort_create_in_workspace`. -After installation, these environment variables are available: +## How It Works -- `WB_MCP_SERVER_BIN`: Path to the MCP server binary -- `WB_MCP_CONFIG`: Path to the example MCP configuration file +### Authentication +- Auto-fetches bearer token from `wb auth print-access-token` +- Refreshes every 55 minutes +- Gets API URLs from `wb status` -## Prerequisites +### Data Collections +Data collection workspaces contain underlays (data models): +- Data collection workspace ID = underlay ID +- Property `"terra-type": "data-collection"` +- Property `"terra-dx-underlay-name"` = underlay name (e.g., "AoU_2024") -- The `wb` CLI must be installed and authenticated in the container -- The user running the MCP server must have access to `wb` commands +### Cohort Creation Flow +1. User has READ access to data collection workspace +2. User has WRITER access to target workspace +3. 
Server creates: + - Study in Data Explorer (if doesn't exist) + - Cohort in that study + - Controlled resource in workspace -## Options +### Filter Structure +Filters use Data Explorer's filter format: +- **Attribute**: `age > 65`, `gender = 'male'` +- **Relationship**: `persons who have condition = diabetes` +- **Boolean Logic**: Combine with AND/OR/NOT +- **Hierarchy**: All descendants of concept -- **username** (default: `root`): Container user that will run the MCP server -- **userHomeDir** (default: `/root`): Home directory of the container user -- **port** (default: `3000`): Reserved for future use (currently stdio-based) - -## Security Notes - -- This MCP server is designed for **local use only** -- It does not implement authentication (relies on local `wb` authentication) -- Do not expose this server to untrusted networks -- The server runs with the same permissions as the user who starts it +Filter builders output correct JSON for you. ## Troubleshooting -### Server not found - -Make sure the feature is installed and the binary exists: - +### "Error: failed to get access token" ```bash -ls -l /opt/wb-mcp-server/wb-mcp-server +wb auth login ``` -### wb command not found - -Ensure the Workbench CLI is installed and authenticated: - +### "API error (403)" +Check permissions: ```bash -wb status +wb workspace describe ``` +Need READER on data collections, WRITER on target workspace. -### Permission denied - -Check that the binary is executable: +### "Error: underlayName parameter is required" +First find underlay names: +``` +"List my data collections and show their underlay names" +``` +### Server not responding +Test directly: ```bash -chmod +x /opt/wb-mcp-server/wb-mcp-server +/opt/wb-mcp-server/wb-mcp-server +``` +Then send: +```json +{"jsonrpc":"2.0","id":1,"method":"tools/list"} ``` -## Development - -The MCP server is written in Go and built during feature installation. 
Source code: - -- `main.go`: Server implementation -- `go.mod`: Go module definition -- `install.sh`: Installation script +## Requirements -To modify the server, edit `main.go` and rebuild your dev container. +- Workbench CLI (`wb`) installed +- Authenticated (`wb auth login`) +- Access to data collections and workspaces diff --git a/features/src/wb-mcp-server/main.go b/features/src/wb-mcp-server/main.go index 8080bee4b..6ed36c5cd 100644 --- a/features/src/wb-mcp-server/main.go +++ b/features/src/wb-mcp-server/main.go @@ -2,11 +2,15 @@ package main import ( "bufio" + "bytes" "encoding/json" "fmt" + "io" + "net/http" "os" "os/exec" "strings" + "time" ) // MCP Protocol structures @@ -82,285 +86,2630 @@ type ContentItem struct { Text string `json:"text"` } +// Global variables +var ( + workspaceBaseURL string + dataExplorerURL string + httpClient = &http.Client{Timeout: 60 * time.Second} +) + +// Tool definitions var wbTools = []Tool{ { Name: "wb_status", - Description: "Get the current workspace and server status", + Description: "Get workspace and server status using wb CLI", + InputSchema: InputSchema{Type: "object", Properties: map[string]interface{}{}}, + }, + { + Name: "wb_workspace_list", + Description: "List all workspaces using wb CLI", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "format": map[string]interface{}{ + "type": "string", + "enum": []string{"json", "text"}, + }, + }, + }, + }, + { + Name: "wb_execute", + Description: "Execute any wb command (without 'wb' prefix)", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "command": map[string]interface{}{"type": "string"}, + }, + Required: []string{"command"}, + }, + }, + + { + Name: "workspace_create", + Description: "Create a new workspace. Use this when user wants to create a new workspace for their research or project. Creates both the workspace metadata and backing cloud resources (e.g., Google Cloud project). 
Returns the new workspace ID.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "id": map[string]interface{}{"type": "string", "description": "User-facing workspace ID (must be unique)"}, + "podId": map[string]interface{}{"type": "string", "description": "Pod ID (required) - get from pod_list"}, + "name": map[string]interface{}{"type": "string", "description": "Display name for the workspace"}, + "description": map[string]interface{}{"type": "string", "description": "Workspace description"}, + "organizationId": map[string]interface{}{"type": "string", "description": "Organization ID (optional)"}, + }, + Required: []string{"id", "podId"}, + }, + }, + { + Name: "workspace_delete", + Description: "Delete a workspace. Use this when user wants to permanently remove a workspace. WARNING: This deletes all resources in the workspace. Requires OWNER role. User should confirm before executing.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "workspaceId": map[string]interface{}{"type": "string", "description": "User-facing workspace ID to delete"}, + }, + Required: []string{"workspaceId"}, + }, + }, + { + Name: "workspace_update", + Description: "Update workspace metadata (name, description). Use this when user wants to change workspace display name or description without modifying resources.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "workspaceId": map[string]interface{}{"type": "string", "description": "User-facing workspace ID"}, + "name": map[string]interface{}{"type": "string", "description": "New display name"}, + "description": map[string]interface{}{"type": "string", "description": "New description"}, + }, + Required: []string{"workspaceId"}, + }, + }, + { + Name: "workspace_duplicate", + Description: "Duplicate an existing workspace. 
Use this when user wants to copy a workspace structure (including resources and folder organization) to a new workspace. Useful for creating similar workspaces or templates.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "sourceWorkspaceId": map[string]interface{}{"type": "string", "description": "Workspace ID to duplicate from"}, + "destWorkspaceId": map[string]interface{}{"type": "string", "description": "New workspace ID"}, + "name": map[string]interface{}{"type": "string", "description": "Name for new workspace"}, + }, + Required: []string{"sourceWorkspaceId", "destWorkspaceId"}, + }, + }, + { + Name: "workspace_set_property", + Description: "Set custom properties on a workspace. Use this for adding metadata tags or configuration values. Properties are key-value pairs used for organization, categorization, or workspace configuration.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "workspaceId": map[string]interface{}{"type": "string", "description": "User-facing workspace ID"}, + "key": map[string]interface{}{"type": "string", "description": "Property key"}, + "value": map[string]interface{}{"type": "string", "description": "Property value"}, + }, + Required: []string{"workspaceId", "key", "value"}, + }, + }, + { + Name: "workspace_delete_property", + Description: "Delete a custom property from a workspace. Use this to remove previously set metadata or configuration.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "workspaceId": map[string]interface{}{"type": "string", "description": "User-facing workspace ID"}, + "key": map[string]interface{}{"type": "string", "description": "Property key to delete"}, + }, + Required: []string{"workspaceId", "key"}, + }, + }, + { + Name: "workspace_add_user", + Description: "Grant a user access to a workspace. Use this when sharing a workspace with collaborators. 
Specify role (READER, WRITER, or OWNER) to control access level. READER can view, WRITER can modify, OWNER can manage users and delete.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "workspaceId": map[string]interface{}{"type": "string", "description": "User-facing workspace ID"}, + "email": map[string]interface{}{"type": "string", "description": "User email address"}, + "role": map[string]interface{}{"type": "string", "enum": []string{"READER", "WRITER", "OWNER"}, "description": "Access role"}, + }, + Required: []string{"workspaceId", "email", "role"}, + }, + }, + { + Name: "workspace_remove_user", + Description: "Revoke a user's access to a workspace. Use this to remove collaborators or revoke access. Requires OWNER role.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "workspaceId": map[string]interface{}{"type": "string", "description": "User-facing workspace ID"}, + "email": map[string]interface{}{"type": "string", "description": "User email to remove"}, + }, + Required: []string{"workspaceId", "email"}, + }, + }, + { + Name: "workspace_list_users", + Description: "List all users with access to a workspace and their roles. Use this to see who has access and what level of permissions they have.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "workspaceId": map[string]interface{}{"type": "string", "description": "User-facing workspace ID"}, + }, + Required: []string{"workspaceId"}, + }, + }, + + { + Name: "resource_create_bucket", + Description: "Create a cloud storage bucket in the workspace. Use this when user needs file storage for data, results, or shared files. 
Creates a managed bucket that workspace users can access based on their roles.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "resourceId": map[string]interface{}{"type": "string", "description": "Resource ID (used to reference in workspace)"}, + "bucketName": map[string]interface{}{"type": "string", "description": "Cloud bucket name (globally unique)"}, + "description": map[string]interface{}{"type": "string", "description": "Resource description"}, + }, + Required: []string{"resourceId", "bucketName"}, + }, + }, + { + Name: "resource_create_bq_dataset", + Description: "Create a BigQuery dataset in the workspace. Use this when user needs a database for structured data analysis, SQL queries, or data warehousing.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "resourceId": map[string]interface{}{"type": "string", "description": "Resource ID"}, + "datasetId": map[string]interface{}{"type": "string", "description": "BigQuery dataset ID"}, + "description": map[string]interface{}{"type": "string", "description": "Resource description"}, + }, + Required: []string{"resourceId", "datasetId"}, + }, + }, + { + Name: "resource_delete", + Description: "Delete a resource from the workspace. Use this to remove buckets, datasets, or other resources. For controlled resources, this deletes the actual cloud resource. For references, only removes the reference.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "resourceId": map[string]interface{}{"type": "string", "description": "Resource ID to delete"}, + }, + Required: []string{"resourceId"}, + }, + }, + { + Name: "resource_update", + Description: "Update resource metadata (name, description). 
Use this to change how a resource is displayed or documented without modifying the underlying cloud resource.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "resourceId": map[string]interface{}{"type": "string", "description": "Resource ID"}, + "name": map[string]interface{}{"type": "string", "description": "New display name"}, + "description": map[string]interface{}{"type": "string", "description": "New description"}, + }, + Required: []string{"resourceId"}, + }, + }, + { + Name: "resource_add_reference", + Description: "Add a reference to an external cloud resource. Use this when user wants to reference data/resources from outside the workspace (e.g., a bucket in another project, a shared dataset). Creates a pointer without managing the resource.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "resourceId": map[string]interface{}{"type": "string", "description": "Resource ID for the reference"}, + "resourceType": map[string]interface{}{"type": "string", "enum": []string{"gcs-bucket", "bq-dataset", "bq-table"}, "description": "Type of resource"}, + "path": map[string]interface{}{"type": "string", "description": "Cloud path (e.g., gs://bucket-name)"}, + "description": map[string]interface{}{"type": "string", "description": "Reference description"}, + }, + Required: []string{"resourceId", "resourceType", "path"}, + }, + }, + { + Name: "resource_check_access", + Description: "Check if current user has access to a resource. Use this to verify permissions before attempting operations. Useful for debugging access issues or validating setup.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "resourceId": map[string]interface{}{"type": "string", "description": "Resource ID to check"}, + }, + Required: []string{"resourceId"}, + }, + }, + { + Name: "resource_move", + Description: "Move a resource to a different folder. 
Use this for organizing resources into logical groups within a workspace.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "resourceId": map[string]interface{}{"type": "string", "description": "Resource ID to move"}, + "folderId": map[string]interface{}{"type": "string", "description": "Destination folder ID"}, + }, + Required: []string{"resourceId", "folderId"}, + }, + }, + + { + Name: "folder_create", + Description: "Create a folder in the workspace. Use this to organize resources into logical groups (e.g., 'data', 'results', 'notebooks'). Folders help maintain clean workspace organization.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "folderId": map[string]interface{}{"type": "string", "description": "Folder ID (must be unique in workspace)"}, + "displayName": map[string]interface{}{"type": "string", "description": "Display name for folder"}, + "description": map[string]interface{}{"type": "string", "description": "Folder description"}, + "parentId": map[string]interface{}{"type": "string", "description": "Parent folder ID (for nested folders)"}, + }, + Required: []string{"folderId", "displayName"}, + }, + }, + { + Name: "folder_delete", + Description: "Delete a folder. Use this to remove folders no longer needed. NOTE: Folder must be empty (move or delete resources first).", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "folderId": map[string]interface{}{"type": "string", "description": "Folder ID to delete"}, + }, + Required: []string{"folderId"}, + }, + }, + { + Name: "folder_update", + Description: "Update folder metadata (name, description). 
Use this to rename folders or update descriptions for better organization.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "folderId": map[string]interface{}{"type": "string", "description": "Folder ID"}, + "displayName": map[string]interface{}{"type": "string", "description": "New display name"}, + "description": map[string]interface{}{"type": "string", "description": "New description"}, + }, + Required: []string{"folderId"}, + }, + }, + { + Name: "folder_list_tree", + Description: "Show folder hierarchy as a tree. Use this to visualize workspace organization and understand the folder structure.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{}, + }, + }, + + { + Name: "group_create", + Description: "Create a user group. Use this when managing multiple users with same access needs. Groups simplify permission management - grant access to group instead of individual users.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "groupId": map[string]interface{}{"type": "string", "description": "Unique group ID"}, + "name": map[string]interface{}{"type": "string", "description": "Group display name"}, + "description": map[string]interface{}{"type": "string", "description": "Group description"}, + }, + Required: []string{"groupId", "name"}, + }, + }, + { + Name: "group_delete", + Description: "Delete a user group. Use this to remove groups no longer needed. Users in the group lose group-based permissions.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "groupId": map[string]interface{}{"type": "string", "description": "Group ID to delete"}, + }, + Required: []string{"groupId"}, + }, + }, + { + Name: "group_list", + Description: "List all groups the current user has a role on. 
Use this to see available groups for permission management.", InputSchema: InputSchema{ Type: "object", Properties: map[string]interface{}{}, }, }, { - Name: "wb_workspace_list", - Description: "List all Workbench workspaces", + Name: "group_describe", + Description: "Get detailed information about a group (members, roles, metadata). Use this to see who belongs to a group and their access levels.", InputSchema: InputSchema{ Type: "object", Properties: map[string]interface{}{ - "format": map[string]interface{}{ - "type": "string", - "description": "Output format (json, text)", - "enum": []string{"json", "text"}, - }, + "groupId": map[string]interface{}{"type": "string", "description": "Group ID"}, }, + Required: []string{"groupId"}, }, }, { - Name: "wb_resource_list", - Description: "List resources in the current workspace", + Name: "group_add_user", + Description: "Add a user to a group. Use this when adding collaborators to a group for shared access management.", InputSchema: InputSchema{ Type: "object", Properties: map[string]interface{}{ - "type": map[string]interface{}{ - "type": "string", - "description": "Resource type to filter by", - }, - "format": map[string]interface{}{ - "type": "string", - "description": "Output format (json, text)", - "enum": []string{"json", "text"}, - }, + "groupId": map[string]interface{}{"type": "string", "description": "Group ID"}, + "email": map[string]interface{}{"type": "string", "description": "User email to add"}, + "role": map[string]interface{}{"type": "string", "enum": []string{"MEMBER", "ADMIN"}, "description": "Role in group"}, }, + Required: []string{"groupId", "email", "role"}, }, }, { - Name: "wb_resource_describe", - Description: "Describe a specific resource in the workspace", + Name: "group_remove_user", + Description: "Remove a user from a group. 
Use this to revoke group membership and associated permissions.", InputSchema: InputSchema{ Type: "object", Properties: map[string]interface{}{ - "name": map[string]interface{}{ - "type": "string", - "description": "Name or ID of the resource", - }, - "format": map[string]interface{}{ - "type": "string", - "description": "Output format (json, text)", - "enum": []string{"json", "text"}, + "groupId": map[string]interface{}{"type": "string", "description": "Group ID"}, + "email": map[string]interface{}{"type": "string", "description": "User email to remove"}, + }, + Required: []string{"groupId", "email"}, + }, + }, + + { + Name: "app_create", + Description: "Create a GCP Compute Engine application in the workspace. Use this to launch analysis environments like JupyterLab, RStudio, or VSCode. Applications provide interactive compute environments.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "appId": map[string]interface{}{"type": "string", "description": "Application ID"}, + "appConfig": map[string]interface{}{"type": "string", "description": "App config name. Valid values: jupyter-lab, r-analysis, visual-studio-code"}, + "machineType": map[string]interface{}{"type": "string", "description": "Machine type (e.g., 'n1-standard-4')"}, + "description": map[string]interface{}{"type": "string", "description": "Description of the app"}, + "location": map[string]interface{}{"type": "string", "description": "GCP location/zone"}, + }, + Required: []string{"appId", "appConfig"}, + }, + }, + { + Name: "app_delete", + Description: "Delete an application. Use this to remove applications no longer needed. 
Stops the application and deletes associated resources.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "appId": map[string]interface{}{"type": "string", "description": "Application ID to delete"}, + }, + Required: []string{"appId"}, + }, + }, + { + Name: "app_list", + Description: "List all applications in the workspace. Use this to see available applications, their status, and configuration.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{}, + }, + }, + { + Name: "app_start", + Description: "Start a stopped application. Use this to resume an application that was stopped to save costs. Takes a few minutes to become ready.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "appId": map[string]interface{}{"type": "string", "description": "Application ID to start"}, + }, + Required: []string{"appId"}, + }, + }, + { + Name: "app_stop", + Description: "Stop a running application. Use this to pause an application to save compute costs. Data and state are preserved. Can be restarted later.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "appId": map[string]interface{}{"type": "string", "description": "Application ID to stop"}, + }, + Required: []string{"appId"}, + }, + }, + { + Name: "app_get_url", + Description: "Get the launch URL for an application. Use this to get the web address to access a running application (e.g., Jupyter notebook URL).", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "appId": map[string]interface{}{"type": "string", "description": "Application ID"}, + }, + Required: []string{"appId"}, + }, + }, + + { + Name: "auth_status", + Description: "Get current authentication status. 
Use this to check if user is logged in and see which account is active.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{}, + }, + }, + + { + Name: "server_list", + Description: "List all available servers. Use this to see which server environments are available (dev, staging, production).", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{}, + }, + }, + { + Name: "server_set", + Description: "Set which server to connect to. Use this to switch between different environments (e.g., from production to staging for testing).", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "serverName": map[string]interface{}{"type": "string", "description": "Server name to connect to"}, + }, + Required: []string{"serverName"}, + }, + }, + { + Name: "server_status", + Description: "Get server status and details. Use this to check server health and configuration information.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{}, + }, + }, + { + Name: "server_list_regions", + Description: "List valid cloud regions for a platform. Use this when creating resources to see available regions.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "cloudPlatform": map[string]interface{}{"type": "string", "description": "Cloud platform (e.g., 'gcp', 'azure')"}, + }, + Required: []string{"cloudPlatform"}, + }, + }, + + { + Name: "pod_list", + Description: "List all pods. Use this to see available pods (environments/tenants) and their details.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{}, + }, + }, + { + Name: "pod_describe", + Description: "Get detailed information about a pod. 
Use this to see pod configuration, users, and settings.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "podId": map[string]interface{}{"type": "string", "description": "Pod ID"}, + }, + Required: []string{"podId"}, + }, + }, + { + Name: "pod_role_list", + Description: "List all user roles in a pod. Use this to see who has access to a pod and their permission levels.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "organizationId": map[string]interface{}{"type": "string", "description": "Organization ID"}, + "podId": map[string]interface{}{"type": "string", "description": "Pod ID"}, + }, + Required: []string{"organizationId", "podId"}, + }, + }, + { + Name: "pod_role_grant", + Description: "Grant a user a role in a pod. Use this when adding users to a pod with specific permissions.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "organizationId": map[string]interface{}{"type": "string", "description": "Organization ID"}, + "podId": map[string]interface{}{"type": "string", "description": "Pod ID"}, + "email": map[string]interface{}{"type": "string", "description": "User email"}, + "role": map[string]interface{}{"type": "string", "description": "Role to grant (ADMIN, USER, SUPPORT)"}, + }, + Required: []string{"organizationId", "podId", "email", "role"}, + }, + }, + { + Name: "pod_role_revoke", + Description: "Revoke a user's role in a pod. 
Use this to remove a user's access to a pod.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "organizationId": map[string]interface{}{"type": "string", "description": "Organization ID"}, + "podId": map[string]interface{}{"type": "string", "description": "Pod ID"}, + "email": map[string]interface{}{"type": "string", "description": "User email"}, + "role": map[string]interface{}{"type": "string", "description": "Role to revoke (ADMIN, USER, SUPPORT)"}, + }, + Required: []string{"organizationId", "podId", "email", "role"}, + }, + }, + + { + Name: "organization_list", + Description: "List all organizations. Use this to see available organizations and their details.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{}, + }, + }, + + { + Name: "resource_credentials", + Description: "Get temporary credentials for accessing a cloud resource. Use this when you need programmatic access credentials (e.g., for scripts, external tools).", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "resourceId": map[string]interface{}{"type": "string", "description": "Resource ID"}, + "duration": map[string]interface{}{"type": "integer", "description": "Credential duration in seconds"}, + }, + Required: []string{"resourceId"}, + }, + }, + { + Name: "resource_open_console", + Description: "Get cloud console link for a resource. Use this to provide users with a web link to view/manage the resource in the cloud provider's console.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "resourceId": map[string]interface{}{"type": "string", "description": "Resource ID"}, + }, + Required: []string{"resourceId"}, + }, + }, + { + Name: "resource_list_tree", + Description: "List resources in tree view showing folder hierarchy. 
Use this to visualize workspace organization with resources grouped by folders.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{}, + }, + }, + { + Name: "resource_mount", + Description: "Mount workspace bucket resources to local filesystem. Use this when user needs to access bucket contents as if they were local files.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{}, + }, + }, + { + Name: "resource_unmount", + Description: "Unmount workspace bucket resources. Use this to disconnect previously mounted buckets from local filesystem.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{}, + }, + }, + + { + Name: "notebook_start", + Description: "Start a stopped notebook instance. Use this to resume a notebook that was stopped to save costs. Convenience wrapper for app start.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "notebookId": map[string]interface{}{"type": "string", "description": "Notebook instance ID"}, + }, + Required: []string{"notebookId"}, + }, + }, + { + Name: "notebook_stop", + Description: "Stop a running notebook instance. Use this to pause a notebook to save compute costs. Convenience wrapper for app stop.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "notebookId": map[string]interface{}{"type": "string", "description": "Notebook instance ID"}, + }, + Required: []string{"notebookId"}, + }, + }, + { + Name: "notebook_launch", + Description: "Launch a running notebook instance. Use this to get the URL and open a notebook. 
Convenience wrapper for app launch.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "notebookId": map[string]interface{}{"type": "string", "description": "Notebook instance ID"}, + }, + Required: []string{"notebookId"}, + }, + }, + + { + Name: "cluster_start", + Description: "Start a stopped Dataproc cluster. Use this to resume a Spark cluster that was stopped to save costs.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "clusterId": map[string]interface{}{"type": "string", "description": "Cluster ID"}, + }, + Required: []string{"clusterId"}, + }, + }, + { + Name: "cluster_stop", + Description: "Stop a running Dataproc cluster. Use this to pause a Spark cluster to save compute costs.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "clusterId": map[string]interface{}{"type": "string", "description": "Cluster ID"}, + }, + Required: []string{"clusterId"}, + }, + }, + { + Name: "cluster_launch", + Description: "Launch Dataproc cluster proxy view. Use this to get the URL for accessing cluster monitoring and Spark UI.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "clusterId": map[string]interface{}{"type": "string", "description": "Cluster ID"}, + }, + Required: []string{"clusterId"}, + }, + }, + + { + Name: "workflow_list", + Description: "List all workflows. Use this to see available workflows in the workspace.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "workspaceId": map[string]interface{}{"type": "string", "description": "Workspace ID"}, + }, + Required: []string{"workspaceId"}, + }, + }, + { + Name: "workflow_create", + Description: "Create a new workflow. 
Use this when user wants to set up a workflow for data processing or analysis pipelines.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "workspaceId": map[string]interface{}{"type": "string", "description": "Workspace ID"}, + "workflowId": map[string]interface{}{"type": "string", "description": "Workflow ID"}, + "bucketId": map[string]interface{}{"type": "string", "description": "BUCKET NAME (not UUID) - e.g., 'cohort_exports'. Get from workspace_list_resources metadata.name field."}, + "path": map[string]interface{}{"type": "string", "description": "Path to workflow definition file in bucket (e.g., 'workflows/myworkflow.wdl')"}, + "displayName": map[string]interface{}{"type": "string", "description": "Workflow display name"}, + "description": map[string]interface{}{"type": "string", "description": "Description of the workflow"}, + }, + Required: []string{"workspaceId", "workflowId", "bucketId", "path"}, + }, + }, + { + Name: "workflow_describe", + Description: "Get detailed information about a workflow. Use this to see workflow configuration and status.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "workspaceId": map[string]interface{}{"type": "string", "description": "Workspace ID"}, + "workflowId": map[string]interface{}{"type": "string", "description": "Workflow ID"}, + }, + Required: []string{"workspaceId", "workflowId"}, + }, + }, + { + Name: "workflow_job_list", + Description: "List all workflow jobs. Use this to see job history, status, and details.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{}, + }, + }, + { + Name: "workflow_job_describe", + Description: "Get detailed information about a workflow job. 
Use this to see job configuration, status, inputs, and outputs.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "workspaceId": map[string]interface{}{"type": "string", "description": "Workspace ID"}, + "jobId": map[string]interface{}{"type": "string", "description": "Job ID"}, + }, + Required: []string{"workspaceId", "jobId"}, + }, + }, + { + Name: "workflow_job_run", + Description: "Start a workflow job. Use this to execute a workflow with specific inputs.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "workspaceId": map[string]interface{}{"type": "string", "description": "Workspace ID"}, + "workflowId": map[string]interface{}{"type": "string", "description": "Workflow ID"}, + "outputBucketId": map[string]interface{}{"type": "string", "description": "BUCKET NAME (not UUID) for outputs - e.g., 'cohort_exports'"}, + "jobId": map[string]interface{}{"type": "string", "description": "Optional job ID"}, + "description": map[string]interface{}{"type": "string", "description": "Job description"}, + "outputPath": map[string]interface{}{"type": "string", "description": "Output path in bucket"}, + "inputs": map[string]interface{}{"type": "object", "description": "Job inputs as key-value pairs"}, + }, + Required: []string{"workspaceId", "workflowId", "outputBucketId"}, + }, + }, + { + Name: "workflow_job_cancel", + Description: "Cancel a running workflow job. Use this to stop a job that is in progress.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "workspaceId": map[string]interface{}{"type": "string", "description": "Workspace ID"}, + "jobId": map[string]interface{}{"type": "string", "description": "Job ID"}, + }, + Required: []string{"workspaceId", "jobId"}, + }, + }, + + { + Name: "cromwell_generate_config", + Description: "Generate Cromwell configuration file. 
Use this when setting up Cromwell workflows to create the required config file.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "path": map[string]interface{}{"type": "string", "description": "Output path for cromwell.conf"}, + }, + Required: []string{"path"}, + }, + }, + { + Name: "workspace_configure_aws", + Description: "Generate AWS configuration file for workspace. Use this when workspace needs to access AWS resources.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "workspaceId": map[string]interface{}{"type": "string", "description": "Workspace ID"}, + }, + Required: []string{"workspaceId"}, + }, + }, + { + Name: "resolve", + Description: "Resolve a resource to its cloud ID or path. Use this to get the actual cloud identifier (bucket name, dataset ID, etc.) for a workspace resource.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "resourceId": map[string]interface{}{"type": "string", "description": "Resource ID to resolve"}, + }, + Required: []string{"resourceId"}, + }, + }, + { + Name: "version", + Description: "Get the installed wb CLI version. Use this to check which version is installed or for troubleshooting.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{}, + }, + }, + + { + Name: "bq_execute", + Description: "Execute BigQuery command in workspace context. Use this to run bq CLI commands with workspace's BigQuery access.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "command": map[string]interface{}{"type": "string", "description": "BigQuery command (without 'bq' prefix)"}, + }, + Required: []string{"command"}, + }, + }, + { + Name: "gcloud_execute", + Description: "Execute gcloud command in workspace context. 
Use this to run gcloud CLI commands with workspace's GCP project.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "command": map[string]interface{}{"type": "string", "description": "gcloud command (without 'gcloud' prefix)"}, + }, + Required: []string{"command"}, + }, + }, + { + Name: "gsutil_execute", + Description: "Execute gsutil command in workspace context. Use this to run gsutil CLI commands for GCS operations.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "command": map[string]interface{}{"type": "string", "description": "gsutil command (without 'gsutil' prefix)"}, + }, + Required: []string{"command"}, + }, + }, + { + Name: "git_execute", + Description: "Execute git command in workspace context. Use this for git operations within workspace.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "command": map[string]interface{}{"type": "string", "description": "git command (without 'git' prefix)"}, + }, + Required: []string{"command"}, + }, + }, + + { + Name: "workspace_list_all", + Description: "List all workspaces with optional property filters. Use properties={'terra-type': 'data-collection'} to find data collections with underlays, properties={'terra-dx-underlay-name': ''} to filter by underlay", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "properties": map[string]interface{}{"type": "object"}, + "limit": map[string]interface{}{"type": "integer", "default": 100}, + "offset": map[string]interface{}{"type": "integer", "default": 0}, + }, + }, + }, + { + Name: "workspace_get", + Description: "Get workspace details by ID. 
workspaceId is the user-facing ID (e.g., 'test-1599'), not the UUID.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "workspaceId": map[string]interface{}{"type": "string", "description": "User-facing workspace ID (e.g., 'test-1599')"}, + }, + Required: []string{"workspaceId"}, + }, + }, + { + Name: "workspace_list_resources", + Description: "List all resources in a workspace including cohorts, buckets, datasets, etc. workspaceId is the user-facing ID (e.g., 'test-1599'), not the UUID.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "workspaceId": map[string]interface{}{"type": "string", "description": "User-facing workspace ID (e.g., 'test-1599')"}, + "offset": map[string]interface{}{"type": "integer", "default": 0}, + "limit": map[string]interface{}{"type": "integer", "default": 100}, + }, + Required: []string{"workspaceId"}, + }, + }, + + { + Name: "underlay_list", + Description: "List all available underlays", + InputSchema: InputSchema{Type: "object", Properties: map[string]interface{}{}}, + }, + { + Name: "underlay_get_schema", + Description: "Get complete underlay schema with entities and attributes. This returns the raw schema. 
For cohort building, use underlay_list_criteria_selectors instead to get available criteria selectors.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "underlayName": map[string]interface{}{"type": "string"}, + }, + Required: []string{"underlayName"}, + }, + }, + { + Name: "underlay_list_entities", + Description: "List all entities in an underlay (e.g., Person, Condition)", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "underlayName": map[string]interface{}{"type": "string"}, + }, + Required: []string{"underlayName"}, + }, + }, + { + Name: "underlay_get_entity", + Description: "Get entity details including attributes and relationships", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "underlayName": map[string]interface{}{"type": "string"}, + "entityName": map[string]interface{}{"type": "string"}, + }, + Required: []string{"underlayName", "entityName"}, + }, + }, + { + Name: "underlay_list_criteria_selectors", + Description: `STEP 1 of cohort creation: Discover available criteria selectors for an underlay. + +Returns array of selectors, each with: +- name: Selector name (use in selectorOrModifierName) +- plugin: Plugin type (use in pluginName) +- pluginConfig: JSON string (copy to uiConfig when building criteria) +- category: Display category +- displayName: Human-readable name + +EXTRACT from each selector: +1. selector.name → save for selectorOrModifierName +2. selector.plugin → save for pluginName +3. 
selector.pluginConfig → save as uiConfig (keep as JSON string) + +For "entityGroup" plugin selectors: +- Parse pluginConfig to extract classificationEntityGroups[0].id (e.g., "currentDiagnosesPerson") +- This is the entityGroup value needed in selectionData +- Parse columns to find entity's ID field name for data_query_hints + +COMPLETE COHORT WORKFLOW: +STEP 1: Call underlay_list_criteria_selectors(underlayName) → get selectors +STEP 2: Call cohort_create_in_workspace(workspaceId, underlayId, underlayName, name) WITHOUT criteriaJson → creates cohort with all participants +STEP 3: Extract studyId and cohortId from response +STEP 4: Call data_query_hints(studyId, cohortId, entityName) → get entity codes/values AND numeric ranges +STEP 5: Build criteriaJson using selector info + codes/ranges from hints +STEP 6: Call cohort_update_criteria(studyId, cohortId, criteriaJson) → apply filters + +LEARNING CORRECT FORMATS: +Use study_list_cohorts to examine existing cohorts and see their actual criteriaGroupSections. +This is the BEST way to learn correct selectionData formats for each plugin type. + +selectionData format by plugin type (see proto definitions in data-explorer repo): +- "attribute": {"dataRanges":[{"min":,"max":}]} - BOTH min and max required as numbers +- "entityGroup": {"selected": [{"key": {"int64Key": }, "name": "", "entityGroup": ""}]} +- "multiAttribute": {"selected": [{"attribute": "", "dataRanges": [{"min":,"max":}]}]}`, + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "underlayName": map[string]interface{}{"type": "string", "description": "Underlay name"}, + }, + Required: []string{"underlayName"}, + }, + }, + + { + Name: "data_query_hints", + Description: `STEP 4 of cohort workflow: Discover entity codes, value distributions, and numeric ranges. + +Use this to find: +1. Entity codes for entityGroup filters (diagnosis IDs, medication IDs, etc.) +2. Enum values for categorical attributes +3. 
Numeric ranges (min/max) for numeric attributes like age + +INPUT: +- studyId, cohortId: From cohort_create_in_workspace response +- entityName: Entity to query (e.g., "person", "diagnoses", "medications") + +RESPONSE STRUCTURE - displayHints array with elements containing: +{ + "attribute": {"name": "", "dataType": "INT64|STRING|..."}, + "displayHint": { + "numericRangeHint": {"min": , "max": } // For numeric attributes + OR + "enumHint": {"enumHintValues": [...]} // For categorical attributes + } +} + +CRITICAL: For numeric attributes (like age): +- Response includes "numericRangeHint" with actual data min/max values +- Use these EXACT min/max values in your selectionData dataRanges +- BOTH min and max are REQUIRED in dataRanges (see DataRange proto) +- Adjust min or max to create your filter (e.g., if max=92, use min=66,max=92 for "over 65") + +For entityGroup attributes: +- Look for instances with ID fields +- Extract ID value for int64Key and name for display + +After getting hints, proceed to STEP 5: Build criteriaJson, then STEP 6: cohort_update_criteria.`, + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "studyId": map[string]interface{}{"type": "string", "description": "From cohort_create_in_workspace response"}, + "cohortId": map[string]interface{}{"type": "string", "description": "From cohort_create_in_workspace response"}, + "entityName": map[string]interface{}{"type": "string", "description": "Entity name (e.g., 'diagnoses', 'medications', 'person')"}, + }, + Required: []string{"studyId", "cohortId", "entityName"}, + }, + }, + { + Name: "data_sample_instances", + Description: "Sample actual data from an entity with optional filters", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "studyId": map[string]interface{}{"type": "string"}, + "cohortId": map[string]interface{}{"type": "string"}, + "entityName": map[string]interface{}{"type": "string"}, + "includeAttributes": 
map[string]interface{}{"type": "array", "items": map[string]interface{}{"type": "string"}}, + "filter": map[string]interface{}{"type": "object"}, + "limit": map[string]interface{}{"type": "integer", "default": 50}, + }, + Required: []string{"studyId", "cohortId", "entityName"}, + }, + }, + { + Name: "study_list", + Description: `List all Data Explorer studies. Use this to find studyId for existing cohorts. + +WHEN TO USE: +- When you need to find studyId/cohortId for an existing cohort +- When you want to see what studies exist in the workspace +- BEFORE calling data_query_hints or cohort_update_criteria on existing cohorts + +RESPONSE contains array of studies with: +- id: The studyId (UUID) needed for other API calls +- displayName: Usually "Workspace: " +- properties.externalId: The workspace UUID +- created, createdBy, lastModified, lastModifiedBy + +WORKFLOW to find existing cohort IDs: +1. Call study_list to get all studies +2. For each study, call study_list_cohorts(studyId) to list cohorts +3. Find cohort by displayName or underlayName +4. Extract studyId and cohortId for use in other tools`, + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "offset": map[string]interface{}{"type": "integer", "default": 0, "description": "Number of items to skip"}, + "limit": map[string]interface{}{"type": "integer", "default": 50, "description": "Maximum items to return"}, + }, + }, + }, + { + Name: "study_list_cohorts", + Description: `List all cohorts in a Data Explorer study. Use this to find cohortId and view actual criteria. 
+ +WHEN TO USE: +- After calling study_list to get a studyId +- When you want to see what cohorts exist in a study +- When you want to examine the actual criteriaGroupSections used in existing cohorts +- To learn correct selectionData formats by looking at working cohorts + +RESPONSE contains array of cohorts with: +- id: The cohortId (UUID) needed for data_query_hints, cohort_update_criteria +- underlayName: Which underlay this cohort uses +- displayName: Human-readable cohort name +- description: Cohort description +- criteriaGroupSections: The ACTUAL criteria used (great for learning correct formats!) +- created, createdBy, lastModified, lastModifiedBy + +LEARNING FROM EXISTING COHORTS: +The response shows the exact criteriaGroupSections that work. Look at: +- selectionData format for each plugin type +- How selectorOrModifierName is used +- How uiConfig is structured +This is the BEST way to learn correct formats - copy from working cohorts! + +WORKFLOW: +1. Call study_list to get studyId +2. Call THIS tool with studyId to list cohorts +3. Extract cohortId for the cohort you want to work with +4. Optionally: Study the criteriaGroupSections to learn correct formats`, + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "studyId": map[string]interface{}{"type": "string", "description": "Study ID from study_list"}, + "offset": map[string]interface{}{"type": "integer", "default": 0, "description": "Number of items to skip"}, + "limit": map[string]interface{}{"type": "integer", "default": 50, "description": "Maximum items to return"}, + }, + Required: []string{"studyId"}, + }, + }, + + { + Name: "cohort_create_in_workspace", + Description: `STEP 2 of cohort workflow: Create cohort in workspace. + +TWO MODES: +1. 
WITHOUT criteriaJson (RECOMMENDED for new underlays): Creates cohort with all participants + - Use this to create initial cohort for discovering entity codes + - Then use data_query_hints to get codes + - Then use cohort_update_criteria to apply filters + +2. WITH criteriaJson: Creates cohort with filters already applied + - Only use if you already know all selector names and entity codes + +RESPONSE contains studyId and cohortId at top level: +{ + "studyId": "abc-123", + "cohortId": "def-456", + "resourceId": "...", + ... +} + +Extract these for next steps: +- studyId: Needed for data_query_hints and cohort_update_criteria +- cohortId: Needed for data_query_hints and cohort_update_criteria + +RECOMMENDED WORKFLOW (for unknown underlay): +1. Call underlay_list_criteria_selectors → get selectors +2. Call THIS tool WITHOUT criteriaJson → creates "all participants" cohort +3. Extract studyId and cohortId from response +4. Call data_query_hints(studyId, cohortId, entityName) → get entity codes +5. Build criteriaJson with discovered selectors and codes +6. Call cohort_update_criteria(studyId, cohortId, criteriaJson) → apply filters + +criteriaJson structure (if providing): +{ + "criteriaGroupSections": [{ + "id": "section-id", + "displayName": "Section Name", + "disabled": false, + "operator": "AND", + "excluded": false, + "firstBlockReducingOperator": "ANY", + "secondBlockReducingOperator": "ANY", + "secondBlockCriteriaGroups": [], + "criteriaGroups": [{ + "id": "group-id", + "disabled": false, + "criteria": [{ + "id": "criteria-id", + "pluginName": "", + "selectorOrModifierName": "", + "selectionData": "", + "uiConfig": "", + "pluginVersion": 0, + "tags": {}, + "enabled": true + }] + }] + }] +} + +Each criterion in separate criteriaGroup. See underlay_list_criteria_selectors for selectionData formats. 
+Use study_list_cohorts to examine working cohorts and learn correct formats by example.`, + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "workspaceId": map[string]interface{}{"type": "string", "description": "User-facing workspace ID (e.g., 'test-1599')"}, + "underlayId": map[string]interface{}{"type": "string"}, + "underlayName": map[string]interface{}{"type": "string"}, + "name": map[string]interface{}{"type": "string"}, + "displayName": map[string]interface{}{"type": "string"}, + "description": map[string]interface{}{"type": "string"}, + "criteriaJson": map[string]interface{}{"type": "string", "description": "Complete criteriaGroupSections JSON (see tool description for required structure)"}, + "folderId": map[string]interface{}{"type": "string"}, + }, + Required: []string{"workspaceId", "underlayId", "underlayName", "name"}, + }, + }, + { + Name: "cohort_update_criteria", + Description: `STEP 6 of cohort workflow: Apply filter criteria to existing cohort. + +This is the final step after discovering selectors, creating initial cohort, and querying entity codes. + +INPUT: +- studyId, cohortId: From cohort_create_in_workspace response +- criteriaGroupSections: Array of criteria group sections (see structure below) + +BUILD criteriaGroupSections array: +[{ + "id": "section-1", + "displayName": "Filters", + "disabled": false, + "operator": "AND", + "excluded": false, + "firstBlockReducingOperator": "ANY", + "secondBlockReducingOperator": "ANY", + "secondBlockCriteriaGroups": [], + "criteriaGroups": [ + { + "id": "group-1", + "disabled": false, + "criteria": [{ + "id": "crit-1", + "pluginName": "", + "selectorOrModifierName": "", + "selectionData": "", + "uiConfig": "", + "pluginVersion": 0, + "tags": {}, + "enabled": true + }] + } + ] +}] + +BUILDING selectionData by plugin type: +1. 
"attribute" plugin (numeric attributes like age): + - Format: "{\"dataRanges\":[{\"min\":66,\"max\":92}]}" + - BOTH min and max REQUIRED as numbers (not strings) + - Get min/max from data_query_hints numericRangeHint response + - Escape as JSON string when putting in criteria + +2. "entityGroup" plugin (diagnoses, medications): + - Use codes from data_query_hints response + - Format: "{\"selected\":[{\"key\":{\"int64Key\":CODE},\"name\":\"NAME\",\"entityGroup\":\"GROUP_ID\"}]}" + - int64Key value must be NUMBER not string + - entityGroup ID from selector's pluginConfig classificationEntityGroups[0].id + +3. "multiAttribute" plugin: + - Format: "{\"selected\":[{\"attribute\":\"ATTR\",\"dataRanges\":[{\"min\":NUM,\"max\":NUM}]}]}" + - For categorical: "{\"selected\":[{\"attribute\":\"ATTR\",\"values\":[{\"value\":{\"stringVal\":\"VALUE\"}}]}]}" + +CRITICAL: +- Each criterion goes in its own criteriaGroup. Operator "AND" means all groups must match. +- Use study_list_cohorts to examine working cohorts and learn correct formats.`, + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "studyId": map[string]interface{}{"type": "string", "description": "From cohort_create_in_workspace response"}, + "cohortId": map[string]interface{}{"type": "string", "description": "From cohort_create_in_workspace response"}, + "criteriaGroupSections": map[string]interface{}{"type": "array", "description": "Array of criteria group sections"}, + "displayName": map[string]interface{}{"type": "string", "description": "Optional: Update cohort display name"}, + "description": map[string]interface{}{"type": "string", "description": "Optional: Update cohort description"}, + }, + Required: []string{"studyId", "cohortId"}, + }, + }, + { + Name: "cohort_count_instances", + Description: "Count instances matching cohort criteria", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "studyId": map[string]interface{}{"type": "string"}, + 
"cohortId": map[string]interface{}{"type": "string"}, + "entity": map[string]interface{}{"type": "string"}, + "groupByAttributes": map[string]interface{}{"type": "array", "items": map[string]interface{}{"type": "string"}}, + }, + Required: []string{"studyId", "cohortId"}, + }, + }, + + { + Name: "export_list_models", + Description: `List available export models for an underlay. + +Export models define how cohort data can be exported to different formats (CSV, IPYNB, etc.). + +RESPONSE contains array of export models with: +- name: Export model identifier (use in export_cohort) +- displayName: Human-readable name +- description: What this export model does +- numPrimaryEntityCap: Maximum number of entities that can be exported`, + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "underlayName": map[string]interface{}{"type": "string", "description": "Underlay name"}, + }, + Required: []string{"underlayName"}, + }, + }, + { + Name: "export_describe", + Description: `Describe what will be included in a cohort export. + +Shows which entities and attributes will be exported based on cohort variable set or all criteria. + +INPUT: +- studyId, cohortId: From cohort_create_in_workspace +- allCriteriaFromCohort: If true, exports all criteria; if false (default), exports variable set`, + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "studyId": map[string]interface{}{"type": "string", "description": "From cohort_create_in_workspace"}, + "cohortId": map[string]interface{}{"type": "string", "description": "From cohort_create_in_workspace"}, + "allCriteriaFromCohort": map[string]interface{}{"type": "boolean", "description": "Export all criteria (true) or variable set (false)"}, + }, + Required: []string{"studyId", "cohortId"}, + }, + }, + { + Name: "export_preview", + Description: `Preview what data will be exported before running the actual export. 
+ +Shows sample instances that will be included in the export. + +INPUT: +- studyId, cohortId: From cohort_create_in_workspace +- exportModel: Export model name from export_list_models +- entityName: Entity to preview (e.g., "person", "diagnoses") +- limit: Max instances to preview (default: 20, max: 20) +- inputs: Optional parameters required by export model`, + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "studyId": map[string]interface{}{"type": "string", "description": "From cohort_create_in_workspace"}, + "cohortId": map[string]interface{}{"type": "string", "description": "From cohort_create_in_workspace"}, + "exportModel": map[string]interface{}{"type": "string", "description": "Export model name from export_list_models"}, + "entityName": map[string]interface{}{"type": "string", "description": "Entity to preview"}, + "limit": map[string]interface{}{"type": "integer", "description": "Max instances (default: 20)", "maximum": 20}, + "inputs": map[string]interface{}{"type": "object", "description": "Export model input parameters"}, + }, + Required: []string{"studyId", "cohortId"}, + }, + }, + { + Name: "export_cohort", + Description: `Export cohort data using specified export model. + +Creates downloadable files (CSV, IPYNB, etc.) with cohort data. + +INPUT: +- studyId, cohortId: From cohort_create_in_workspace +- exportRequests: Array of export requests, each with: + - exportModel: Model name from export_list_models (REQUIRED) + - inputs: Model-specific parameters (optional) + - includeAnnotations: Include review annotations (default: true) + - compressFiles: Compress output files (default: true) + +RESPONSE contains array of export results with: +- status: "SUCCEEDED" or "FAILED" +- links: Download URLs for exported files +- error: Error message if failed + +WORKFLOW: +1. Call export_list_models to see available models +2. Call export_preview to preview what will be exported +3. Call THIS tool to create the export +4. 
Use links from response to download files`, + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "studyId": map[string]interface{}{"type": "string", "description": "From cohort_create_in_workspace"}, + "cohortId": map[string]interface{}{"type": "string", "description": "From cohort_create_in_workspace"}, + "exportRequests": map[string]interface{}{ + "type": "array", + "description": "Array of export requests", + "items": map[string]interface{}{ + "type": "object", + "properties": map[string]interface{}{ + "exportModel": map[string]interface{}{"type": "string", "description": "Export model name"}, + "inputs": map[string]interface{}{"type": "object", "description": "Model input parameters"}, + "includeAnnotations": map[string]interface{}{"type": "boolean", "default": true}, + "compressFiles": map[string]interface{}{"type": "boolean", "default": true}, + }, + "required": []string{"exportModel"}, + }, + }, + }, + Required: []string{"studyId", "cohortId", "exportRequests"}, + }, + }, + + { + Name: "filter_build_attribute", + Description: "Build attribute filter (e.g., age > 65). For cohort creation, use the criteriaGroupSections structure in cohort_create_in_workspace.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "attribute": map[string]interface{}{"type": "string"}, + "operator": map[string]interface{}{"type": "string", "enum": []string{"EQUALS", "NOT_EQUALS", "LESS_THAN", "GREATER_THAN", "LESS_THAN_OR_EQUAL", "GREATER_THAN_OR_EQUAL", "IN", "NOT_IN", "BETWEEN", "IS_NULL", "IS_NOT_NULL"}}, + "value": map[string]interface{}{}, + "values": map[string]interface{}{"type": "array"}, + "dataType": map[string]interface{}{"type": "string", "enum": []string{"BOOLEAN", "INT64", "STRING", "DATE", "TIMESTAMP", "DOUBLE"}}, + }, + Required: []string{"attribute", "operator", "dataType"}, + }, + }, + { + Name: "filter_build_relationship", + Description: "Build relationship filter (e.g., persons with condition). 
For cohort creation, use the criteriaGroupSections structure in cohort_create_in_workspace.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "relatedEntity": map[string]interface{}{"type": "string"}, + "subfilter": map[string]interface{}{"type": "object"}, + }, + Required: []string{"relatedEntity"}, + }, + }, + { + Name: "filter_build_boolean_logic", + Description: "Combine filters with AND/OR/NOT. For cohort creation, use the criteriaGroupSections structure in cohort_create_in_workspace.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "operator": map[string]interface{}{"type": "string", "enum": []string{"AND", "OR", "NOT"}}, + "subfilters": map[string]interface{}{"type": "array"}, + }, + Required: []string{"operator", "subfilters"}, + }, + }, + { + Name: "filter_build_hierarchy", + Description: "Build hierarchy filter (e.g., all descendants of concept). For cohort creation, use the criteriaGroupSections structure in cohort_create_in_workspace.", + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "hierarchy": map[string]interface{}{"type": "string"}, + "operator": map[string]interface{}{"type": "string", "enum": []string{"CHILD_OF", "DESCENDANT_OF_INCLUSIVE", "IS_ROOT", "IS_MEMBER", "IS_LEAF"}}, + "values": map[string]interface{}{"type": "array"}, + }, + Required: []string{"hierarchy", "operator"}, + }, + }, +}
+
+// initializeConfig sets the package-level workspaceBaseURL and dataExplorerURL.
+//
+// It runs `wb status --format=json` and reads server.workspaceManagerUri from
+// the result; the Data Explorer URL is derived from it by replacing the first
+// "/api/wsm" with "/api/de". If the command fails, its output is not valid
+// JSON, or the URI is missing or empty, both variables are set to the
+// production Verily endpoints so that they are never left unassigned.
+//
+// The chosen URLs are logged to stderr. The function always returns nil.
+func initializeConfig() error {
+	const (
+		fallbackWorkspaceURL    = "https://workbench.verily.com/api/wsm"
+		fallbackDataExplorerURL = "https://workbench.verily.com/api/de"
+	)
+
+	// Start from the fallback and only override on a fully successful lookup,
+	// so every failure path ends up with usable URLs.
+	wsBase, deBase := fallbackWorkspaceURL, fallbackDataExplorerURL
+
+	// Output (stdout only) rather than CombinedOutput: anything wb prints on
+	// stderr would otherwise be mixed into the JSON and make it unparseable.
+	output, err := exec.Command("wb", "status", "--format=json").Output()
+	if err == nil {
+		var status map[string]interface{}
+		if jsonErr := json.Unmarshal(output, &status); jsonErr == nil {
+			if server, ok := status["server"].(map[string]interface{}); ok {
+				if wsURL, ok := server["workspaceManagerUri"].(string); ok && wsURL != "" {
+					wsBase = wsURL
+					// Derive dataExplorerUri from workspaceManagerUri.
+					// Pattern: replace /api/wsm with /api/de
+					deBase = strings.Replace(wsURL, "/api/wsm", "/api/de", 1)
+				}
+			}
+		}
+	}
+
+	workspaceBaseURL = wsBase
+	dataExplorerURL = deBase
+
+	fmt.Fprintf(os.Stderr, "Initialized - Workspace: %s, DataExplorer: %s\n", workspaceBaseURL, dataExplorerURL)
+	return nil
+}
+
+// getToken returns an access token obtained from `wb auth print-access-token`.
+//
+// Only the command's stdout is used as the token; stderr is captured
+// separately and included in the returned error when the command fails, so
+// diagnostics printed by wb can never end up inside the Authorization header.
+// An empty token is reported as an error.
+func getToken() (string, error) {
+	var stderr bytes.Buffer
+	cmd := exec.Command("wb", "auth", "print-access-token")
+	cmd.Stderr = &stderr
+
+	output, err := cmd.Output()
+	if err != nil {
+		if detail := strings.TrimSpace(stderr.String()); detail != "" {
+			return "", fmt.Errorf("failed to get access token: %w: %s", err, detail)
+		}
+		return "", fmt.Errorf("failed to get access token: %w", err)
+	}
+
+	token := strings.TrimSpace(string(output))
+	if token == "" {
+		return "", fmt.Errorf("failed to get access token: wb returned an empty token")
+	}
+	return token, nil
+}
+
+// resolveWorkspaceId maps a workspace identifier to the workspace's "id"
+// field as returned by the workspace list endpoint.
+//
+// workspaceId may be either the user-facing ID or the "id" value itself. The
+// lookup fetches a single page of up to 5000 workspaces and scans it; entries
+// that are not JSON objects or that have no string "id" are ignored. An error
+// is returned if the request fails, the response cannot be parsed, or no
+// entry matches.
+func resolveWorkspaceId(workspaceId string) (string, error) {
+	listURL := fmt.Sprintf("%s/api/workspaces/v1?offset=0&limit=5000", workspaceBaseURL)
+	listResp, apiErr := makeAPIRequest("GET", listURL, nil)
+	if apiErr != nil {
+		return "", fmt.Errorf("failed to list workspaces: %w", apiErr)
+	}
+
+	var listData map[string]interface{}
+	if err := json.Unmarshal(listResp, &listData); err != nil {
+		return "", fmt.Errorf("error parsing workspace list: %v", err)
+	}
+	workspaces, ok := listData["workspaces"].([]interface{})
+	if !ok {
+		return "", fmt.Errorf("workspaces not found in list response")
+	}
+
+	for _, ws := range workspaces {
+		wsMap, ok := ws.(map[string]interface{})
+		if !ok {
+			continue
+		}
+		// Checked assertions: a missing, null or non-string field must not
+		// panic the whole server, it just means this entry cannot match.
+		id, _ := wsMap["id"].(string)
+		if id == "" {
+			continue
+		}
+		userFacingId, _ := wsMap["userFacingId"].(string)
+		if userFacingId == workspaceId || id == workspaceId {
+			return id, nil
+		}
+	}
+	return "", fmt.Errorf("workspace '%s' not found", workspaceId)
+}
+
+// makeAPIRequest performs an authenticated HTTP request and returns the raw
+// response body.
+//
+// A token is fetched through getToken on every call and sent as a Bearer
+// Authorization header. When body is non-nil it is JSON-encoded and used as
+// the request payload. Any status code outside 200-299 is returned as an
+// error that contains the status code and the response body text.
+func makeAPIRequest(method, url string, body interface{}) ([]byte, error) {
+	token, err := getToken()
+	if err != nil {
+		return nil, err
+	}
+
+	// reqBody stays a nil io.Reader when there is no payload, which is what
+	// http.NewRequest expects for body-less requests.
+	var reqBody io.Reader
+	if body != nil {
+		jsonData, err := json.Marshal(body)
+		if err != nil {
+			return nil, fmt.Errorf("failed to marshal request: %w", err)
+		}
+		reqBody = bytes.NewBuffer(jsonData)
+	}
+
+	req, err := http.NewRequest(method, url, reqBody)
+	if err != nil {
+		return nil, err
+	}
+
+	req.Header.Set("Authorization", "Bearer "+token)
+	req.Header.Set("Content-Type", "application/json")
+
+	resp, err := httpClient.Do(req)
+	if err != nil {
+		return nil, err
+	}
+	defer resp.Body.Close()
+
+	// Read the body before checking the status so that error responses can
+	// be reported with the server's own message.
+	respBody, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return nil, err
+	}
+
+	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
+		return nil, fmt.Errorf("API error (%d): %s", resp.StatusCode, string(respBody))
+	}
+
+	return respBody, nil
+}
+
+// executeWbCommand runs the wb CLI with the given arguments and returns its
+// combined stdout and stderr as a string, together with the error (if any)
+// from running the command. The output is returned even when err is non-nil.
+func executeWbCommand(args []string) (string, error) {
+	cmd := exec.Command("wb", args...)
+	output, err := cmd.CombinedOutput()
+	return string(output), err
+}
+
+func handleCallTool(params CallToolParams) CallToolResult { + var output string + var err error + + switch params.Name { + case "wb_status": + output, err = executeWbCommand([]string{"status"}) + case "wb_workspace_list": + args := []string{"workspace", "list"} + if format, ok := params.Arguments["format"].(string); ok && format == "json" { + args = append(args, "--format=json") + } + output, err = executeWbCommand(args) + case "wb_execute": + command, ok := params.Arguments["command"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'command' required"}}, IsError: true} + } + output, err = executeWbCommand(strings.Fields(command)) + + case "workspace_list_all": + limit, offset := 100, 0 + if l, ok := params.Arguments["limit"].(float64); ok { + limit = int(l) + } + if o, ok := params.Arguments["offset"].(float64); ok { + offset = int(o) + } + body := map[string]interface{}{"limit": limit, "offset": offset} + if props, ok := params.Arguments["properties"].(map[string]interface{}); ok { + // Convert properties from map to array of key-value objects + var propsArray []map[string]string + for key, val := range props { +
if strVal, ok := val.(string); ok { + propsArray = append(propsArray, map[string]string{"key": key, "value": strVal}) + } + } + body["properties"] = propsArray + } + respBody, apiErr := makeAPIRequest("POST", workspaceBaseURL+"/api/workspaces/v2/filtered", body) + if apiErr != nil { + err = apiErr + } else { + output = string(respBody) + } + + case "workspace_get": + workspaceId, ok := params.Arguments["workspaceId"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'workspaceId' required"}}, IsError: true} + } + // Resolve user-facing ID to UUID + workspaceUuid, err := resolveWorkspaceId(workspaceId) + if err != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: err.Error()}}, IsError: true} + } + url := fmt.Sprintf("%s/api/workspaces/v1/%s", workspaceBaseURL, workspaceUuid) + respBody, apiErr := makeAPIRequest("GET", url, nil) + if apiErr != nil { + err = apiErr + } else { + output = string(respBody) + } + + case "workspace_list_resources": + workspaceId, ok := params.Arguments["workspaceId"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'workspaceId' required"}}, IsError: true} + } + offset := 0 + if val, ok := params.Arguments["offset"].(float64); ok { + offset = int(val) + } + limit := 100 + if val, ok := params.Arguments["limit"].(float64); ok { + limit = int(val) + } + // Resolve user-facing ID to UUID + workspaceUuid, err := resolveWorkspaceId(workspaceId) + if err != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: err.Error()}}, IsError: true} + } + url := fmt.Sprintf("%s/api/workspaces/v1/%s/resources?offset=%d&limit=%d", workspaceBaseURL, workspaceUuid, offset, limit) + respBody, apiErr := makeAPIRequest("GET", url, nil) + if apiErr != nil { + err = apiErr + } else { + output = string(respBody) + } + + case "underlay_list": + respBody, apiErr := makeAPIRequest("GET", dataExplorerURL+"/v2/underlays", nil) + if 
apiErr != nil { + err = apiErr + } else { + output = string(respBody) + } + + case "underlay_get_schema": + underlayName, ok := params.Arguments["underlayName"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'underlayName' required"}}, IsError: true} + } + url := fmt.Sprintf("%s/v2/underlays/%s", dataExplorerURL, underlayName) + respBody, apiErr := makeAPIRequest("GET", url, nil) + if apiErr != nil { + err = apiErr + } else { + output = string(respBody) + } + + case "underlay_list_entities": + underlayName, ok := params.Arguments["underlayName"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'underlayName' required"}}, IsError: true} + } + url := fmt.Sprintf("%s/v2/underlays/%s/entities", dataExplorerURL, underlayName) + respBody, apiErr := makeAPIRequest("GET", url, nil) + if apiErr != nil { + err = apiErr + } else { + output = string(respBody) + } + + case "underlay_get_entity": + underlayName, ok := params.Arguments["underlayName"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'underlayName' required"}}, IsError: true} + } + entityName, ok := params.Arguments["entityName"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'entityName' required"}}, IsError: true} + } + url := fmt.Sprintf("%s/v2/underlays/%s/entities/%s", dataExplorerURL, underlayName, entityName) + respBody, apiErr := makeAPIRequest("GET", url, nil) + if apiErr != nil { + err = apiErr + } else { + output = string(respBody) + } + + case "underlay_list_criteria_selectors": + underlayName, ok := params.Arguments["underlayName"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'underlayName' required"}}, IsError: true} + } + // Get the schema + url := fmt.Sprintf("%s/v2/underlays/%s", dataExplorerURL, underlayName) + respBody, apiErr := makeAPIRequest("GET", url, 
nil) + if apiErr != nil { + err = apiErr + break + } + + // Parse the schema + var schema map[string]interface{} + if err := json.Unmarshal(respBody, &schema); err != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: fmt.Sprintf("Error parsing schema: %v", err)}}, IsError: true} + } + + // Extract criteria selectors from serializedConfiguration + serializedConfig, ok := schema["serializedConfiguration"].(map[string]interface{}) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: serializedConfiguration not found"}}, IsError: true} + } + + criteriaSelectorsRaw, ok := serializedConfig["criteriaSelectors"].([]interface{}) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: criteriaSelectors not found"}}, IsError: true} + } + + // Parse each selector (they are JSON strings) + var selectors []map[string]interface{} + for _, selectorRaw := range criteriaSelectorsRaw { + selectorStr, ok := selectorRaw.(string) + if !ok { + continue + } + var selector map[string]interface{} + if err := json.Unmarshal([]byte(selectorStr), &selector); err != nil { + continue + } + + // Extract useful fields for agents + result := map[string]interface{}{ + "name": selector["name"], + "displayName": selector["displayName"], + "plugin": selector["plugin"], + } + + if pluginConfig, ok := selector["pluginConfig"].(string); ok { + result["pluginConfig"] = pluginConfig + } + + if display, ok := selector["display"].(map[string]interface{}); ok { + if category, ok := display["category"].(string); ok { + result["category"] = category + } + } + + selectors = append(selectors, result) + } + + outputBytes, _ := json.MarshalIndent(map[string]interface{}{"selectors": selectors}, "", " ") + output = string(outputBytes) + + case "data_query_hints": + studyId, ok := params.Arguments["studyId"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'studyId' required"}}, 
IsError: true} + } + cohortId, ok := params.Arguments["cohortId"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'cohortId' required"}}, IsError: true} + } + entityName, ok := params.Arguments["entityName"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'entityName' required"}}, IsError: true} + } + url := fmt.Sprintf("%s/v2/studies/%s/cohorts/%s/entities/%s/hints", dataExplorerURL, studyId, cohortId, entityName) + respBody, apiErr := makeAPIRequest("POST", url, map[string]interface{}{}) + if apiErr != nil { + err = apiErr + } else { + output = string(respBody) + } + + case "data_sample_instances": + studyId, ok := params.Arguments["studyId"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'studyId' required"}}, IsError: true} + } + cohortId, ok := params.Arguments["cohortId"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'cohortId' required"}}, IsError: true} + } + entityName, ok := params.Arguments["entityName"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'entityName' required"}}, IsError: true} + } + body := map[string]interface{}{"limit": 50} + if attrs, ok := params.Arguments["includeAttributes"].([]interface{}); ok { + body["includeAttributes"] = attrs + } + if filter, ok := params.Arguments["filter"].(map[string]interface{}); ok { + body["filter"] = filter + } + if limit, ok := params.Arguments["limit"].(float64); ok { + body["limit"] = int(limit) + } + url := fmt.Sprintf("%s/v2/studies/%s/cohorts/%s/entities/%s/instances", dataExplorerURL, studyId, cohortId, entityName) + respBody, apiErr := makeAPIRequest("POST", url, body) + if apiErr != nil { + err = apiErr + } else { + output = string(respBody) + } + + case "study_list": + offset, limit := 0, 50 + if o, ok := params.Arguments["offset"].(float64); ok { + offset 
= int(o) + } + if l, ok := params.Arguments["limit"].(float64); ok { + limit = int(l) + } + url := fmt.Sprintf("%s/v2/studies?offset=%d&limit=%d", dataExplorerURL, offset, limit) + respBody, apiErr := makeAPIRequest("GET", url, nil) + if apiErr != nil { + err = apiErr + } else { + output = string(respBody) + } + + case "study_list_cohorts": + studyId, ok := params.Arguments["studyId"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'studyId' required"}}, IsError: true} + } + offset, limit := 0, 50 + if o, ok := params.Arguments["offset"].(float64); ok { + offset = int(o) + } + if l, ok := params.Arguments["limit"].(float64); ok { + limit = int(l) + } + url := fmt.Sprintf("%s/v2/studies/%s/cohorts?offset=%d&limit=%d", dataExplorerURL, studyId, offset, limit) + respBody, apiErr := makeAPIRequest("GET", url, nil) + if apiErr != nil { + err = apiErr + } else { + output = string(respBody) + } + + case "cohort_create_in_workspace": + workspaceId, ok := params.Arguments["workspaceId"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'workspaceId' required"}}, IsError: true} + } + _, ok = params.Arguments["underlayId"].(string) // underlayId kept for validation but not used + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'underlayId' required"}}, IsError: true} + } + underlayName, ok := params.Arguments["underlayName"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'underlayName' required"}}, IsError: true} + } + name, ok := params.Arguments["name"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'name' required"}}, IsError: true} + } + displayName := name + if dn, ok := params.Arguments["displayName"].(string); ok { + displayName = dn + } + description := "" + if desc, ok := params.Arguments["description"].(string); ok { + description = desc + } 
+ + // Step 1: Create cohort in Data Explorer + createBody := map[string]interface{}{ + "studyCreateInfo": map[string]interface{}{ + "displayName": displayName + " Study", + }, + "cohortCreateInfo": map[string]interface{}{ + "underlayName": underlayName, + "displayName": displayName, + "description": description, + }, + } + createResp, apiErr := makeAPIRequest("POST", dataExplorerURL+"/v2/createCohortInStudy", createBody) + if apiErr != nil { + err = fmt.Errorf("Step 1 failed (create cohort): %w", apiErr) + break + } + + // Parse response to get studyId and cohortId + var createResult map[string]interface{} + if err := json.Unmarshal(createResp, &createResult); err != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: fmt.Sprintf("Error parsing create response: %v", err)}}, IsError: true} + } + study, _ := createResult["study"].(map[string]interface{}) + cohort, _ := createResult["cohort"].(map[string]interface{}) + studyId, _ := study["id"].(string) + cohortId, _ := cohort["id"].(string) + + // Step 2: Update criteria if provided + if criteriaJson, ok := params.Arguments["criteriaJson"].(string); ok && criteriaJson != "" { + var updateBody interface{} + if unmarshalErr := json.Unmarshal([]byte(criteriaJson), &updateBody); unmarshalErr != nil { + err = fmt.Errorf("Step 2 failed (parse criteria): %w", unmarshalErr) + break + } + _, apiErr = makeAPIRequest("PATCH", fmt.Sprintf("%s/v2/studies/%s/cohorts/%s", dataExplorerURL, studyId, cohortId), updateBody) + if apiErr != nil { + err = fmt.Errorf("Step 2 failed (update criteria): %w", apiErr) + break + } + } + + // Step 3: Save cohort to workspace + // Resolve user-facing ID to UUID + workspaceUuid, err := resolveWorkspaceId(workspaceId) + if err != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: fmt.Sprintf("Step 3 failed: %v", err)}}, IsError: true} + } + + saveBody := map[string]interface{}{ + "common": map[string]interface{}{ + "displayName": displayName, + 
"description": description, + "accessScope": "SHARED_ACCESS", + "managedBy": "USER", + "cloningInstructions": "COPY_RESOURCE", + }, + "dataExplorerCohort": map[string]interface{}{ + "studyId": studyId, + "cohortId": cohortId, + }, + } + if folderId, ok := params.Arguments["folderId"].(string); ok { + saveBody["common"].(map[string]interface{})["folderId"] = folderId + } + saveUrl := fmt.Sprintf("%s/api/workspaces/v1/%s/resources/controlled/data-explorer/cohort/save", workspaceBaseURL, workspaceUuid) + respBody, apiErr := makeAPIRequest("POST", saveUrl, saveBody) + if apiErr != nil { + err = fmt.Errorf("Step 3 failed (save to workspace): %w", apiErr) + } else { + // Parse workspace response and add studyId/cohortId at top level for easy extraction + var workspaceResp map[string]interface{} + if err := json.Unmarshal(respBody, &workspaceResp); err == nil { + workspaceResp["studyId"] = studyId + workspaceResp["cohortId"] = cohortId + if modifiedResp, err := json.Marshal(workspaceResp); err == nil { + output = string(modifiedResp) + } else { + output = string(respBody) + } + } else { + output = string(respBody) + } + } + + case "cohort_update_criteria": + studyId, ok := params.Arguments["studyId"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'studyId' required"}}, IsError: true} + } + cohortId, ok := params.Arguments["cohortId"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'cohortId' required"}}, IsError: true} + } + body := map[string]interface{}{} + if criteria, ok := params.Arguments["criteriaGroupSections"]; ok { + body["criteriaGroupSections"] = criteria + } + if displayName, ok := params.Arguments["displayName"].(string); ok { + body["displayName"] = displayName + } + if description, ok := params.Arguments["description"].(string); ok { + body["description"] = description + } + url := fmt.Sprintf("%s/v2/studies/%s/cohorts/%s", dataExplorerURL, studyId, cohortId) + 
respBody, apiErr := makeAPIRequest("PATCH", url, body) + if apiErr != nil { + err = apiErr + } else { + output = string(respBody) + } + + case "cohort_count_instances": + studyId, ok := params.Arguments["studyId"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'studyId' required"}}, IsError: true} + } + cohortId, ok := params.Arguments["cohortId"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'cohortId' required"}}, IsError: true} + } + body := map[string]interface{}{"groupByAttributes": []string{}} + if entity, ok := params.Arguments["entity"].(string); ok { + body["entity"] = entity + } + if attrs, ok := params.Arguments["groupByAttributes"].([]interface{}); ok { + body["groupByAttributes"] = attrs + } + url := fmt.Sprintf("%s/v2/studies/%s/cohorts/%s/counts", dataExplorerURL, studyId, cohortId) + respBody, apiErr := makeAPIRequest("POST", url, body) + if apiErr != nil { + err = apiErr + } else { + output = string(respBody) + } + + case "export_list_models": + underlayName, ok := params.Arguments["underlayName"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'underlayName' required"}}, IsError: true} + } + url := fmt.Sprintf("%s/v2/underlays/%s/exportModels", dataExplorerURL, underlayName) + respBody, apiErr := makeAPIRequest("GET", url, nil) + if apiErr != nil { + err = apiErr + } else { + output = string(respBody) + } + + case "export_describe": + studyId, ok := params.Arguments["studyId"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'studyId' required"}}, IsError: true} + } + cohortId, ok := params.Arguments["cohortId"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'cohortId' required"}}, IsError: true} + } + body := map[string]interface{}{} + if allCriteria, ok := params.Arguments["allCriteriaFromCohort"].(bool); 
ok { + body["allCriteriaFromCohort"] = allCriteria + } + url := fmt.Sprintf("%s/v2/studies/%s/cohorts/%s/describeExport", dataExplorerURL, studyId, cohortId) + respBody, apiErr := makeAPIRequest("POST", url, body) + if apiErr != nil { + err = apiErr + } else { + output = string(respBody) + } + + case "export_preview": + studyId, ok := params.Arguments["studyId"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'studyId' required"}}, IsError: true} + } + cohortId, ok := params.Arguments["cohortId"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'cohortId' required"}}, IsError: true} + } + body := map[string]interface{}{} + if exportModel, ok := params.Arguments["exportModel"].(string); ok { + body["exportModel"] = exportModel + } + if entityName, ok := params.Arguments["entityName"].(string); ok { + body["entityName"] = entityName + } + if limit, ok := params.Arguments["limit"].(float64); ok { + body["limit"] = int(limit) + } else { + body["limit"] = 20 + } + if inputs, ok := params.Arguments["inputs"].(map[string]interface{}); ok { + body["inputs"] = inputs + } + url := fmt.Sprintf("%s/v2/studies/%s/cohorts/%s/previewExport", dataExplorerURL, studyId, cohortId) + respBody, apiErr := makeAPIRequest("POST", url, body) + if apiErr != nil { + err = apiErr + } else { + output = string(respBody) + } + + case "export_cohort": + studyId, ok := params.Arguments["studyId"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'studyId' required"}}, IsError: true} + } + cohortId, ok := params.Arguments["cohortId"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'cohortId' required"}}, IsError: true} + } + exportRequests, ok := params.Arguments["exportRequests"].([]interface{}) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'exportRequests' required"}}, 
IsError: true} + } + body := map[string]interface{}{ + "exportRequests": exportRequests, + } + url := fmt.Sprintf("%s/v2/studies/%s/cohorts/%s/export", dataExplorerURL, studyId, cohortId) + respBody, apiErr := makeAPIRequest("POST", url, body) + if apiErr != nil { + err = apiErr + } else { + output = string(respBody) + } + + case "filter_build_attribute": + attribute, ok := params.Arguments["attribute"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'attribute' required"}}, IsError: true} + } + operator, ok := params.Arguments["operator"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'operator' required"}}, IsError: true} + } + dataType, ok := params.Arguments["dataType"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'dataType' required"}}, IsError: true} + } + filter := map[string]interface{}{ + "filterType": "ATTRIBUTE", + "filterUnion": map[string]interface{}{ + "attributeFilter": map[string]interface{}{ + "attribute": attribute, + "operator": operator, }, }, - Required: []string{"name"}, - }, - }, - { - Name: "wb_folder_tree", - Description: "Display folder structure as a tree", - InputSchema: InputSchema{ - Type: "object", - Properties: map[string]interface{}{}, - }, - }, - { - Name: "wb_app_list", - Description: "List all applications in the workspace", - InputSchema: InputSchema{ - Type: "object", - Properties: map[string]interface{}{ - "format": map[string]interface{}{ - "type": "string", - "description": "Output format (json, text)", - "enum": []string{"json", "text"}, + } + if operator != "IS_NULL" && operator != "IS_NOT_NULL" { + values := []interface{}{} + if val, ok := params.Arguments["value"]; ok { + values = append(values, buildLiteral(dataType, val)) + } + if vals, ok := params.Arguments["values"].([]interface{}); ok { + for _, v := range vals { + values = append(values, buildLiteral(dataType, v)) + } 
+ } + filter["filterUnion"].(map[string]interface{})["attributeFilter"].(map[string]interface{})["values"] = values + } + outputBytes, _ := json.MarshalIndent(filter, "", " ") + output = string(outputBytes) + + case "filter_build_relationship": + relatedEntity, ok := params.Arguments["relatedEntity"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'relatedEntity' required"}}, IsError: true} + } + filter := map[string]interface{}{ + "filterType": "RELATIONSHIP", + "filterUnion": map[string]interface{}{ + "relationshipFilter": map[string]interface{}{ + "entity": relatedEntity, }, }, - }, - }, - { - Name: "wb_app_describe", - Description: "Describe a specific application", - InputSchema: InputSchema{ - Type: "object", - Properties: map[string]interface{}{ - "name": map[string]interface{}{ - "type": "string", - "description": "Name or ID of the application", + } + if subfilter, ok := params.Arguments["subfilter"].(map[string]interface{}); ok { + filter["filterUnion"].(map[string]interface{})["relationshipFilter"].(map[string]interface{})["subfilter"] = subfilter + } + outputBytes, _ := json.MarshalIndent(filter, "", " ") + output = string(outputBytes) + + case "filter_build_boolean_logic": + operator, ok := params.Arguments["operator"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'operator' required"}}, IsError: true} + } + subfilters, ok := params.Arguments["subfilters"].([]interface{}) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'subfilters' required"}}, IsError: true} + } + filter := map[string]interface{}{ + "filterType": "BOOLEAN_LOGIC", + "filterUnion": map[string]interface{}{ + "booleanLogicFilter": map[string]interface{}{ + "operator": operator, + "subfilters": subfilters, }, }, - Required: []string{"name"}, - }, - }, - { - Name: "wb_execute", - Description: "Execute a custom wb command. 
Use this for commands not covered by other tools. Provide the full command without 'wb' prefix (e.g., 'workspace describe --id=123' not 'wb workspace describe --id=123')", - InputSchema: InputSchema{ - Type: "object", - Properties: map[string]interface{}{ - "command": map[string]interface{}{ - "type": "string", - "description": "The wb command to execute (without 'wb' prefix)", + } + outputBytes, _ := json.MarshalIndent(filter, "", " ") + output = string(outputBytes) + + case "filter_build_hierarchy": + hierarchy, ok := params.Arguments["hierarchy"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'hierarchy' required"}}, IsError: true} + } + operator, ok := params.Arguments["operator"].(string) + if !ok { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: 'operator' required"}}, IsError: true} + } + filter := map[string]interface{}{ + "filterType": "HIERARCHY", + "filterUnion": map[string]interface{}{ + "hierarchyFilter": map[string]interface{}{ + "hierarchy": hierarchy, + "operator": operator, }, }, - Required: []string{"command"}, - }, - }, -} + } + if values, ok := params.Arguments["values"].([]interface{}); ok { + filter["filterUnion"].(map[string]interface{})["hierarchyFilter"].(map[string]interface{})["values"] = values + } + outputBytes, _ := json.MarshalIndent(filter, "", " ") + output = string(outputBytes) -func executeWbCommand(args []string) (string, error) { - cmd := exec.Command("wb", args...) 
- output, err := cmd.CombinedOutput() - return string(output), err -} + case "workspace_create": + id := params.Arguments["id"].(string) + podId := params.Arguments["podId"].(string) + args := []string{"workspace", "create", "--id=" + id, "--pod=" + podId} + if name, ok := params.Arguments["name"].(string); ok { + args = append(args, "--name="+name) + } + if desc, ok := params.Arguments["description"].(string); ok { + args = append(args, "--description="+desc) + } + if orgId, ok := params.Arguments["organizationId"].(string); ok { + args = append(args, "--org="+orgId) + } + output, err = executeWbCommand(args) -func handleCallTool(params CallToolParams) CallToolResult { - var args []string - var output string - var err error + case "workspace_delete": + workspaceId := params.Arguments["workspaceId"].(string) + output, err = executeWbCommand([]string{"workspace", "delete", "--workspace=" + workspaceId}) - switch params.Name { - case "wb_status": - args = []string{"status"} + case "workspace_update": + workspaceId := params.Arguments["workspaceId"].(string) + args := []string{"workspace", "update", "--workspace=" + workspaceId} + if name, ok := params.Arguments["name"].(string); ok { + args = append(args, "--name="+name) + } + if desc, ok := params.Arguments["description"].(string); ok { + args = append(args, "--description="+desc) + } output, err = executeWbCommand(args) - case "wb_workspace_list": - args = []string{"workspace", "list"} - if format, ok := params.Arguments["format"].(string); ok && format == "json" { - args = append(args, "--format=json") + case "workspace_duplicate": + sourceId := params.Arguments["sourceWorkspaceId"].(string) + destId := params.Arguments["destWorkspaceId"].(string) + args := []string{"workspace", "duplicate", "--source-workspace=" + sourceId, "--destination-workspace-id=" + destId} + if name, ok := params.Arguments["name"].(string); ok { + args = append(args, "--name="+name) + } + output, err = executeWbCommand(args) + + case 
"workspace_set_property": + workspaceId := params.Arguments["workspaceId"].(string) + key := params.Arguments["key"].(string) + value := params.Arguments["value"].(string) + output, err = executeWbCommand([]string{"workspace", "set-property", "--workspace=" + workspaceId, "--key=" + key, "--value=" + value}) + + case "workspace_delete_property": + workspaceId := params.Arguments["workspaceId"].(string) + key := params.Arguments["key"].(string) + output, err = executeWbCommand([]string{"workspace", "delete-property", "--workspace=" + workspaceId, "--key=" + key}) + + case "workspace_add_user": + workspaceId := params.Arguments["workspaceId"].(string) + email := params.Arguments["email"].(string) + role := params.Arguments["role"].(string) + output, err = executeWbCommand([]string{"workspace", "add-user", "--workspace=" + workspaceId, "--email=" + email, "--role=" + role}) + + case "workspace_remove_user": + workspaceId := params.Arguments["workspaceId"].(string) + email := params.Arguments["email"].(string) + output, err = executeWbCommand([]string{"workspace", "remove-user", "--workspace=" + workspaceId, "--email=" + email}) + + case "workspace_list_users": + workspaceId := params.Arguments["workspaceId"].(string) + output, err = executeWbCommand([]string{"workspace", "list-users", "--workspace=" + workspaceId}) + + case "resource_create_bucket": + resourceId := params.Arguments["resourceId"].(string) + bucketName := params.Arguments["bucketName"].(string) + args := []string{"resource", "create", "gcs-bucket", "--id=" + resourceId, "--bucket-name=" + bucketName} + if desc, ok := params.Arguments["description"].(string); ok { + args = append(args, "--description="+desc) } output, err = executeWbCommand(args) - case "wb_resource_list": - args = []string{"resource", "list"} - if resourceType, ok := params.Arguments["type"].(string); ok && resourceType != "" { - args = append(args, "--type="+resourceType) + case "resource_create_bq_dataset": + resourceId := 
params.Arguments["resourceId"].(string) + datasetId := params.Arguments["datasetId"].(string) + args := []string{"resource", "create", "bq-dataset", "--id=" + resourceId, "--dataset-id=" + datasetId} + if desc, ok := params.Arguments["description"].(string); ok { + args = append(args, "--description="+desc) } - if format, ok := params.Arguments["format"].(string); ok && format == "json" { - args = append(args, "--format=json") + output, err = executeWbCommand(args) + + case "resource_delete": + resourceId := params.Arguments["resourceId"].(string) + output, err = executeWbCommand([]string{"resource", "delete", "--name=" + resourceId}) + + case "resource_update": + resourceId := params.Arguments["resourceId"].(string) + args := []string{"resource", "update", "--name=" + resourceId} + if name, ok := params.Arguments["name"].(string); ok { + args = append(args, "--new-name="+name) + } + if desc, ok := params.Arguments["description"].(string); ok { + args = append(args, "--description="+desc) } output, err = executeWbCommand(args) - case "wb_resource_describe": - name, ok := params.Arguments["name"].(string) - if !ok { - return CallToolResult{ - Content: []ContentItem{{Type: "text", Text: "Error: 'name' parameter is required"}}, - IsError: true, - } + case "resource_add_reference": + resourceId := params.Arguments["resourceId"].(string) + resourceType := params.Arguments["resourceType"].(string) + path := params.Arguments["path"].(string) + args := []string{"resource", "add-ref", resourceType, "--name=" + resourceId, "--path=" + path} + if desc, ok := params.Arguments["description"].(string); ok { + args = append(args, "--description="+desc) } - args = []string{"resource", "describe", "--name=" + name} - if format, ok := params.Arguments["format"].(string); ok && format == "json" { - args = append(args, "--format=json") + output, err = executeWbCommand(args) + + case "resource_check_access": + resourceId := params.Arguments["resourceId"].(string) + output, err = 
executeWbCommand([]string{"resource", "check-access", "--name=" + resourceId}) + + case "resource_move": + resourceId := params.Arguments["resourceId"].(string) + folderId := params.Arguments["folderId"].(string) + output, err = executeWbCommand([]string{"resource", "move", "--name=" + resourceId, "--folder-id=" + folderId}) + + case "folder_create": + folderId := params.Arguments["folderId"].(string) + displayName := params.Arguments["displayName"].(string) + args := []string{"folder", "create", "--id=" + folderId, "--display-name=" + displayName} + if desc, ok := params.Arguments["description"].(string); ok { + args = append(args, "--description="+desc) + } + if parentId, ok := params.Arguments["parentId"].(string); ok { + args = append(args, "--parent-folder-id="+parentId) } output, err = executeWbCommand(args) - case "wb_folder_tree": - args = []string{"folder", "tree"} + case "folder_delete": + folderId := params.Arguments["folderId"].(string) + output, err = executeWbCommand([]string{"folder", "delete", "--id=" + folderId}) + + case "folder_update": + folderId := params.Arguments["folderId"].(string) + args := []string{"folder", "update", "--id=" + folderId} + if displayName, ok := params.Arguments["displayName"].(string); ok { + args = append(args, "--display-name="+displayName) + } + if desc, ok := params.Arguments["description"].(string); ok { + args = append(args, "--description="+desc) + } output, err = executeWbCommand(args) - case "wb_app_list": - args = []string{"app", "list"} - if format, ok := params.Arguments["format"].(string); ok && format == "json" { - args = append(args, "--format=json") + case "folder_list_tree": + output, err = executeWbCommand([]string{"folder", "tree"}) + + case "group_create": + groupId := params.Arguments["groupId"].(string) + name := params.Arguments["name"].(string) + args := []string{"group", "create", "--id=" + groupId, "--name=" + name} + if desc, ok := params.Arguments["description"].(string); ok { + args = 
append(args, "--description="+desc) } output, err = executeWbCommand(args) - case "wb_app_describe": - name, ok := params.Arguments["name"].(string) - if !ok { - return CallToolResult{ - Content: []ContentItem{{Type: "text", Text: "Error: 'name' parameter is required"}}, - IsError: true, - } + case "group_delete": + groupId := params.Arguments["groupId"].(string) + output, err = executeWbCommand([]string{"group", "delete", "--id=" + groupId}) + + case "group_list": + output, err = executeWbCommand([]string{"group", "list"}) + + case "group_describe": + groupId := params.Arguments["groupId"].(string) + output, err = executeWbCommand([]string{"group", "describe", "--id=" + groupId}) + + case "group_add_user": + groupId := params.Arguments["groupId"].(string) + email := params.Arguments["email"].(string) + role := params.Arguments["role"].(string) + output, err = executeWbCommand([]string{"group", "member", "add", "--group-id=" + groupId, "--email=" + email, "--role=" + role}) + + case "group_remove_user": + groupId := params.Arguments["groupId"].(string) + email := params.Arguments["email"].(string) + output, err = executeWbCommand([]string{"group", "member", "remove", "--group-id=" + groupId, "--email=" + email}) + + case "app_create": + appId := params.Arguments["appId"].(string) + appConfig := params.Arguments["appConfig"].(string) + args := []string{"app", "create", "gcp", "--id=" + appId, "--config=" + appConfig} + if machineType, ok := params.Arguments["machineType"].(string); ok { + args = append(args, "--machine-type="+machineType) + } + if description, ok := params.Arguments["description"].(string); ok { + args = append(args, "--description="+description) + } + if location, ok := params.Arguments["location"].(string); ok { + args = append(args, "--location="+location) } - args = []string{"app", "describe", "--name=" + name} output, err = executeWbCommand(args) - case "wb_execute": - command, ok := params.Arguments["command"].(string) - if !ok { - return 
CallToolResult{ - Content: []ContentItem{{Type: "text", Text: "Error: 'command' parameter is required"}}, - IsError: true, - } + case "app_delete": + appId := params.Arguments["appId"].(string) + output, err = executeWbCommand([]string{"app", "delete", "--id=" + appId, "--quiet"}) + + case "app_list": + output, err = executeWbCommand([]string{"app", "list"}) + + case "app_start": + appId := params.Arguments["appId"].(string) + output, err = executeWbCommand([]string{"app", "start", "--id=" + appId}) + + case "app_stop": + appId := params.Arguments["appId"].(string) + output, err = executeWbCommand([]string{"app", "stop", "--id=" + appId}) + + case "app_get_url": + appId := params.Arguments["appId"].(string) + output, err = executeWbCommand([]string{"app", "launch", "--id=" + appId}) + + case "auth_status": + output, err = executeWbCommand([]string{"auth", "status"}) + + case "server_list": + output, err = executeWbCommand([]string{"server", "list"}) + + case "server_set": + serverName := params.Arguments["serverName"].(string) + output, err = executeWbCommand([]string{"server", "set", "--name=" + serverName}) + + case "server_status": + output, err = executeWbCommand([]string{"server", "status"}) + + case "server_list_regions": + cloudPlatform := params.Arguments["cloudPlatform"].(string) + output, err = executeWbCommand([]string{"server", "list-regions", "--platform=" + cloudPlatform}) + + case "pod_list": + output, err = executeWbCommand([]string{"pod", "list"}) + + case "pod_describe": + podId := params.Arguments["podId"].(string) + output, err = executeWbCommand([]string{"pod", "describe", "--id=" + podId}) + + case "pod_role_list": + organizationId := params.Arguments["organizationId"].(string) + podId := params.Arguments["podId"].(string) + output, err = executeWbCommand([]string{"pod", "role", "list", "--organization=" + organizationId, "--pod=" + podId}) + + case "pod_role_grant": + organizationId := params.Arguments["organizationId"].(string) + podId := 
params.Arguments["podId"].(string) + email := params.Arguments["email"].(string) + role := params.Arguments["role"].(string) + output, err = executeWbCommand([]string{"pod", "role", "grant", "user", "--organization=" + organizationId, "--pod=" + podId, "--email=" + email, "--role=" + role}) + + case "pod_role_revoke": + organizationId := params.Arguments["organizationId"].(string) + podId := params.Arguments["podId"].(string) + email := params.Arguments["email"].(string) + role := params.Arguments["role"].(string) + output, err = executeWbCommand([]string{"pod", "role", "revoke", "user", "--organization=" + organizationId, "--pod=" + podId, "--email=" + email, "--role=" + role}) + + case "organization_list": + output, err = executeWbCommand([]string{"organization", "list"}) + + case "resource_credentials": + resourceId := params.Arguments["resourceId"].(string) + args := []string{"resource", "credentials", "--name=" + resourceId} + if duration, ok := params.Arguments["duration"].(float64); ok { + args = append(args, fmt.Sprintf("--duration=%d", int(duration))) } - args = strings.Fields(command) output, err = executeWbCommand(args) - default: - return CallToolResult{ - Content: []ContentItem{{Type: "text", Text: fmt.Sprintf("Unknown tool: %s", params.Name)}}, - IsError: true, + case "resource_open_console": + resourceId := params.Arguments["resourceId"].(string) + output, err = executeWbCommand([]string{"resource", "open-console", "--name=" + resourceId}) + + case "resource_list_tree": + output, err = executeWbCommand([]string{"resource", "list-tree"}) + + case "resource_mount": + output, err = executeWbCommand([]string{"resource", "mount"}) + + case "resource_unmount": + output, err = executeWbCommand([]string{"resource", "unmount"}) + + case "notebook_start": + notebookId := params.Arguments["notebookId"].(string) + output, err = executeWbCommand([]string{"notebook", "start", "--id=" + notebookId}) + + case "notebook_stop": + notebookId := 
params.Arguments["notebookId"].(string) + output, err = executeWbCommand([]string{"notebook", "stop", "--id=" + notebookId}) + + case "notebook_launch": + notebookId := params.Arguments["notebookId"].(string) + output, err = executeWbCommand([]string{"notebook", "launch", "--id=" + notebookId}) + + case "cluster_start": + clusterId := params.Arguments["clusterId"].(string) + output, err = executeWbCommand([]string{"cluster", "start", "--id=" + clusterId}) + + case "cluster_stop": + clusterId := params.Arguments["clusterId"].(string) + output, err = executeWbCommand([]string{"cluster", "stop", "--id=" + clusterId}) + + case "cluster_launch": + clusterId := params.Arguments["clusterId"].(string) + output, err = executeWbCommand([]string{"cluster", "launch", "--id=" + clusterId}) + + case "workflow_list": + workspaceId := params.Arguments["workspaceId"].(string) + output, err = executeWbCommand([]string{"workflow", "list", "--workspace=" + workspaceId}) + + case "workflow_create": + workspaceId := params.Arguments["workspaceId"].(string) + workflowId := params.Arguments["workflowId"].(string) + bucketId := params.Arguments["bucketId"].(string) + path := params.Arguments["path"].(string) + args := []string{"workflow", "create", "--workspace=" + workspaceId, "--workflow=" + workflowId, "--bucket-id=" + bucketId, "--path=" + path} + if displayName, ok := params.Arguments["displayName"].(string); ok { + args = append(args, "--display-name="+displayName) + } + if description, ok := params.Arguments["description"].(string); ok { + args = append(args, "--description="+description) + } + output, err = executeWbCommand(args) + + case "workflow_describe": + workspaceId := params.Arguments["workspaceId"].(string) + workflowId := params.Arguments["workflowId"].(string) + output, err = executeWbCommand([]string{"workflow", "describe", "--workspace=" + workspaceId, "--workflow=" + workflowId}) + + case "workflow_job_list": + output, err = executeWbCommand([]string{"workflow", 
"job", "list"}) + + case "workflow_job_describe": + workspaceId := params.Arguments["workspaceId"].(string) + jobId := params.Arguments["jobId"].(string) + output, err = executeWbCommand([]string{"workflow", "job", "describe", "--workspace=" + workspaceId, "--job-id=" + jobId}) + + case "workflow_job_run": + workspaceId := params.Arguments["workspaceId"].(string) + workflowId := params.Arguments["workflowId"].(string) + outputBucketId := params.Arguments["outputBucketId"].(string) + args := []string{"workflow", "job", "run", "--workspace=" + workspaceId, "--workflow=" + workflowId, "--output-bucket-id=" + outputBucketId} + if jobId, ok := params.Arguments["jobId"].(string); ok { + args = append(args, "--job-id="+jobId) + } + if description, ok := params.Arguments["description"].(string); ok { + args = append(args, "--description="+description) + } + if outputPath, ok := params.Arguments["outputPath"].(string); ok { + args = append(args, "--output-path="+outputPath) } + if inputs, ok := params.Arguments["inputs"].(map[string]interface{}); ok { + inputsJSON, _ := json.Marshal(inputs) + args = append(args, "--inputs="+string(inputsJSON)) + } + output, err = executeWbCommand(args) + + case "workflow_job_cancel": + workspaceId := params.Arguments["workspaceId"].(string) + jobId := params.Arguments["jobId"].(string) + output, err = executeWbCommand([]string{"workflow", "job", "cancel", "--workspace=" + workspaceId, "--job-id=" + jobId}) + + case "cromwell_generate_config": + path := params.Arguments["path"].(string) + output, err = executeWbCommand([]string{"cromwell", "generate-config", "--path=" + path}) + + case "workspace_configure_aws": + workspaceId := params.Arguments["workspaceId"].(string) + output, err = executeWbCommand([]string{"workspace", "configure-aws", "--workspace=" + workspaceId}) + + case "resolve": + resourceId := params.Arguments["resourceId"].(string) + output, err = executeWbCommand([]string{"resolve", "--name=" + resourceId}) + + case "version": 
+ output, err = executeWbCommand([]string{"version"}) + + case "bq_execute": + command := params.Arguments["command"].(string) + output, err = executeWbCommand(append([]string{"bq"}, strings.Fields(command)...)) + + case "gcloud_execute": + command := params.Arguments["command"].(string) + output, err = executeWbCommand(append([]string{"gcloud"}, strings.Fields(command)...)) + + case "gsutil_execute": + command := params.Arguments["command"].(string) + output, err = executeWbCommand(append([]string{"gsutil"}, strings.Fields(command)...)) + + case "git_execute": + command := params.Arguments["command"].(string) + output, err = executeWbCommand(append([]string{"git"}, strings.Fields(command)...)) + + default: + return CallToolResult{Content: []ContentItem{{Type: "text", Text: fmt.Sprintf("Unknown tool: %s", params.Name)}}, IsError: true} } if err != nil { - return CallToolResult{ - Content: []ContentItem{{Type: "text", Text: fmt.Sprintf("Command failed: %s\nOutput: %s", err.Error(), output)}}, - IsError: true, - } + return CallToolResult{Content: []ContentItem{{Type: "text", Text: fmt.Sprintf("Error: %s", err.Error())}}, IsError: true} } + return CallToolResult{Content: []ContentItem{{Type: "text", Text: output}}, IsError: false} +} - return CallToolResult{ - Content: []ContentItem{{Type: "text", Text: output}}, - IsError: false, +func buildLiteral(dataType string, value interface{}) map[string]interface{} { + literal := map[string]interface{}{"dataType": dataType, "valueUnion": map[string]interface{}{}} + switch dataType { + case "BOOLEAN": + literal["valueUnion"].(map[string]interface{})["boolVal"] = value + case "INT64": + literal["valueUnion"].(map[string]interface{})["int64Val"] = fmt.Sprintf("%v", value) + case "STRING": + literal["valueUnion"].(map[string]interface{})["stringVal"] = fmt.Sprintf("%v", value) + case "DATE": + literal["valueUnion"].(map[string]interface{})["dateVal"] = fmt.Sprintf("%v", value) + case "TIMESTAMP": + 
literal["valueUnion"].(map[string]interface{})["timestampVal"] = fmt.Sprintf("%v", value) + case "DOUBLE": + literal["valueUnion"].(map[string]interface{})["doubleVal"] = value } + return literal } func handleRequest(req JSONRPCRequest) JSONRPCResponse { switch req.Method { case "initialize": - var params InitializeParams - if req.Params != nil { - json.Unmarshal(req.Params, ¶ms) - } - return JSONRPCResponse{ JSONRPC: "2.0", ID: req.ID, Result: InitializeResult{ ProtocolVersion: "2024-11-05", - Capabilities: map[string]interface{}{ - "tools": map[string]interface{}{}, - }, - ServerInfo: ServerInfo{ - Name: "wb-mcp-server", - Version: "1.0.0", - }, + Capabilities: map[string]interface{}{"tools": map[string]interface{}{}}, + ServerInfo: ServerInfo{Name: "wb-mcp-server", Version: "2.0.0"}, }, } - case "tools/list": - return JSONRPCResponse{ - JSONRPC: "2.0", - ID: req.ID, - Result: ListToolsResult{ - Tools: wbTools, - }, - } - + return JSONRPCResponse{JSONRPC: "2.0", ID: req.ID, Result: ListToolsResult{Tools: wbTools}} case "tools/call": var params CallToolParams if err := json.Unmarshal(req.Params, ¶ms); err != nil { - return JSONRPCResponse{ - JSONRPC: "2.0", - ID: req.ID, - Error: &RPCError{ - Code: -32602, - Message: "Invalid params: " + err.Error(), - }, - } - } - - result := handleCallTool(params) - return JSONRPCResponse{ - JSONRPC: "2.0", - ID: req.ID, - Result: result, + return JSONRPCResponse{JSONRPC: "2.0", ID: req.ID, Error: &RPCError{Code: -32602, Message: "Invalid params"}} } - + return JSONRPCResponse{JSONRPC: "2.0", ID: req.ID, Result: handleCallTool(params)} default: - return JSONRPCResponse{ - JSONRPC: "2.0", - ID: req.ID, - Error: &RPCError{ - Code: -32601, - Message: "Method not found: " + req.Method, - }, - } + return JSONRPCResponse{JSONRPC: "2.0", ID: req.ID, Error: &RPCError{Code: -32601, Message: "Method not found"}} } } func main() { - fmt.Fprintln(os.Stderr, "Workbench MCP Server v1.0.0 starting...") - fmt.Fprintln(os.Stderr, "Reading from 
stdin, writing to stdout") + fmt.Fprintln(os.Stderr, "Workbench MCP Server v2.0 starting...") + + if err := initializeConfig(); err != nil { + fmt.Fprintf(os.Stderr, "Error initializing: %v\n", err) + os.Exit(1) + } + + fmt.Fprintf(os.Stderr, "Ready - %d tools available\n", len(wbTools)) scanner := bufio.NewScanner(os.Stdin) for scanner.Scan() { @@ -371,21 +2720,11 @@ func main() { var req JSONRPCRequest if err := json.Unmarshal([]byte(line), &req); err != nil { - fmt.Fprintln(os.Stderr, "Error parsing request:", err) continue } response := handleRequest(req) - responseBytes, err := json.Marshal(response) - if err != nil { - fmt.Fprintln(os.Stderr, "Error marshaling response:", err) - continue - } - + responseBytes, _ := json.Marshal(response) fmt.Println(string(responseBytes)) } - - if err := scanner.Err(); err != nil { - fmt.Fprintln(os.Stderr, "Error reading input:", err) - } } From 290edff5179a908463384f9c6d43dabe34c5f2fc Mon Sep 17 00:00:00 2001 From: Navid Zolghadr Date: Wed, 14 Jan 2026 17:33:58 -0500 Subject: [PATCH 04/86] Autoadd wb mcp to claude and gemini --- features/src/wb-mcp-server/install.sh | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/features/src/wb-mcp-server/install.sh b/features/src/wb-mcp-server/install.sh index 46a059e2f..43a94bea0 100755 --- a/features/src/wb-mcp-server/install.sh +++ b/features/src/wb-mcp-server/install.sh @@ -135,6 +135,18 @@ EOF # Make the directory and files accessible to the user chown -R "${USERNAME}:" "${WB_MCP_DIR}" +# Auto-configure Claude CLI if available +if command -v claude &> /dev/null; then + echo "Found Claude CLI, attempting to add MCP server..." + su - "${USERNAME}" -c "claude mcp add wb ${WB_MCP_BIN}" 2>/dev/null || true +fi + +# Auto-configure Gemini CLI if available +if command -v gemini &> /dev/null; then + echo "Found Gemini CLI, attempting to add MCP server..." 
+ su - "${USERNAME}" -c "gemini mcp add wb ${WB_MCP_BIN}" 2>/dev/null || true +fi + # Add environment variables and PATH to .bashrc { echo "" From b1eb1dcd12772717124909b549ea9ae3e21ac65b Mon Sep 17 00:00:00 2001 From: Navid Zolghadr Date: Wed, 14 Jan 2026 17:48:10 -0500 Subject: [PATCH 05/86] Add a new jupyter lab with llm tools --- .../wb-mcp-server/devcontainer-feature.json | 4 +- features/src/wb-mcp-server/install.sh | 4 +- .../.devcontainer.json | 72 +++++++++++++++++++ src/workbench-jupyter-with-llm/Dockerfile | 5 ++ src/workbench-jupyter-with-llm/README.md | 30 ++++++++ .../devcontainer-template.json | 23 ++++++ .../docker-compose.yaml | 26 +++++++ 7 files changed, 161 insertions(+), 3 deletions(-) create mode 100644 src/workbench-jupyter-with-llm/.devcontainer.json create mode 100644 src/workbench-jupyter-with-llm/Dockerfile create mode 100644 src/workbench-jupyter-with-llm/README.md create mode 100644 src/workbench-jupyter-with-llm/devcontainer-template.json create mode 100644 src/workbench-jupyter-with-llm/docker-compose.yaml diff --git a/features/src/wb-mcp-server/devcontainer-feature.json b/features/src/wb-mcp-server/devcontainer-feature.json index ea17f8721..0bd4393cc 100644 --- a/features/src/wb-mcp-server/devcontainer-feature.json +++ b/features/src/wb-mcp-server/devcontainer-feature.json @@ -22,6 +22,8 @@ }, "installsAfter": [ "ghcr.io/devcontainers/features/common-utils", - "ghcr.io/devcontainers/features/go" + "ghcr.io/devcontainers/features/go", + "ghcr.io/anthropics/devcontainer-features/claude-code", + "./.devcontainer/features/gemini" ] } diff --git a/features/src/wb-mcp-server/install.sh b/features/src/wb-mcp-server/install.sh index 43a94bea0..6dcbf68a8 100755 --- a/features/src/wb-mcp-server/install.sh +++ b/features/src/wb-mcp-server/install.sh @@ -138,13 +138,13 @@ chown -R "${USERNAME}:" "${WB_MCP_DIR}" # Auto-configure Claude CLI if available if command -v claude &> /dev/null; then echo "Found Claude CLI, attempting to add MCP server..." 
- su - "${USERNAME}" -c "claude mcp add wb ${WB_MCP_BIN}" 2>/dev/null || true + su - "${USERNAME}" -c "claude mcp add --transport stdio wb -- ${WB_MCP_BIN}" 2>/dev/null || true fi # Auto-configure Gemini CLI if available if command -v gemini &> /dev/null; then echo "Found Gemini CLI, attempting to add MCP server..." - su - "${USERNAME}" -c "gemini mcp add wb ${WB_MCP_BIN}" 2>/dev/null || true + su - "${USERNAME}" -c "gemini mcp add --scope user wb ${WB_MCP_BIN}" 2>/dev/null || true fi # Add environment variables and PATH to .bashrc diff --git a/src/workbench-jupyter-with-llm/.devcontainer.json b/src/workbench-jupyter-with-llm/.devcontainer.json new file mode 100644 index 000000000..0930e0321 --- /dev/null +++ b/src/workbench-jupyter-with-llm/.devcontainer.json @@ -0,0 +1,72 @@ +{ + "name": "Navid Workbench Jupyter", + "dockerComposeFile": ["docker-compose.yaml", "../jupyter-common/jupyter-common-compose.yaml"], + "service": "app", + "runServices": ["app"], + "shutdownAction": "none", + "workspaceFolder": "/workspace", + "postCreateCommand": [ + "./startupscript/post-startup.sh", + "jupyter", + "/home/jupyter", + "${templateOption:cloud}", + "${templateOption:login}" + ], + // re-mount bucket files on container start up + "postStartCommand": [ + "./startupscript/remount-on-restart.sh", + "jupyter", + "/home/jupyter", + "${templateOption:cloud}", + "${templateOption:login}" + ], + "features": { + "./.devcontainer/features/workbench-tools": { + "cloud": "${templateOption:cloud}", + "username": "jupyter", + "userHomeDir": "/home/jupyter" + }, + "./.devcontainer/features/gemini": { + "version": "latest", + "username": "jupyter", + "userHomeDir": "/home/jupyter" + }, + "ghcr.io/anthropics/devcontainer-features/claude-code:1.0": {}, + "./.devcontainer/features/wb-mcp-server": { + "username": "jupyter", + "userHomeDir": "/home/jupyter" + } + }, + "remoteUser": "root", + "customizations": { + "workbench": { + "opens": { + "extensions": [ + // Source + ".ipynb", + ".R", + 
".py", + // Documents + ".md", + ".html", + ".latex", + ".pdf", + // Images + ".bmp", + ".gif", + ".jpeg", + ".jpg", + ".png", + ".svg", + // Data + ".csv", + ".tsv", + ".json", + ".vl" + ], + "fileUrlSuffix": "/lab/tree/{path}", + "folderUrlSuffix": "/lab/tree/{path}" + } + } + } +} diff --git a/src/workbench-jupyter-with-llm/Dockerfile b/src/workbench-jupyter-with-llm/Dockerfile new file mode 100644 index 000000000..bc736fac9 --- /dev/null +++ b/src/workbench-jupyter-with-llm/Dockerfile @@ -0,0 +1,5 @@ +FROM us-west2-docker.pkg.dev/shared-pub-buckets-94mvrf/workbench-artifacts/app-workbench-jupyter@sha256:8261521e5433b6997c4b323c4b391b02ea3fc3f059e33ccedc36af2554ef70f9 + +# Install jupyter extensions +RUN --mount=type=bind,from=jupyter-extension-builder,source=/dist,target=/tmp/extensions \ + /tmp/extensions/setup.sh diff --git a/src/workbench-jupyter-with-llm/README.md b/src/workbench-jupyter-with-llm/README.md new file mode 100644 index 000000000..2966e9099 --- /dev/null +++ b/src/workbench-jupyter-with-llm/README.md @@ -0,0 +1,30 @@ + +# Navid Workbench Jupyter (navid-workbench-jupyter) + +Workbench JupyterLab with integrated AI assistance through Gemini CLI, Claude CLI, and MCP server support for enhanced development capabilities. 
+ +## Options + +| Options Id | Description | Type | Default Value | +|-----|-----|-----|-----| +| cloud | VM cloud environment | string | gcp | +| login | Whether to log in to workbench CLI | string | false | +| containerImage | The container image to use | string | debian:bullseye | +| containerPort | The port to expose the container on | number | 8888 | + + + +## Features + +This template includes the following integrated features: + +- **Workbench Tools** - Common bioinformatics and genomics tools +- **Gemini CLI** - Google Gemini AI assistant with MCP support +- **Claude CLI** - Anthropic Claude AI assistant (from ghcr.io/anthropics/devcontainer-features/claude-code:1.0) +- **WB MCP Server** - Workbench Model Context Protocol server for AI tool integration with workspace context + +All AI assistants are pre-configured to work with the Workbench MCP server for enhanced workspace awareness. + +--- + +_Note: This file was auto-generated from the [devcontainer-template.json](devcontainer-template.json). 
Add additional notes to a `NOTES.md`._ diff --git a/src/workbench-jupyter-with-llm/devcontainer-template.json b/src/workbench-jupyter-with-llm/devcontainer-template.json new file mode 100644 index 000000000..cfe1e7864 --- /dev/null +++ b/src/workbench-jupyter-with-llm/devcontainer-template.json @@ -0,0 +1,23 @@ +{ + "id": "navid-workbench-jupyter", + "description": "Workbench JupyterLab with Gemini, Claude CLI, and MCP server integration", + "version": "0.0.1", + "name": "Navid Workbench Jupyter", + "documentationURL": "https://github.com/verily-src/workbench-app-devcontainers/tree/master/src/navid-workbench-jupyter", + "licenseURL": "https://github.com/verily-src/workbench-app-devcontainers/blob/master/LICENSE", + "options": { + "cloud": { + "type": "string", + "description": "VM cloud environment", + "proposals": ["gcp", "aws"], + "default": "gcp" + }, + "login": { + "type": "string", + "description": "Whether to log in to workbench CLI", + "proposals": ["true", "false"], + "default": "false" + } + }, + "platforms": ["Any"] +} diff --git a/src/workbench-jupyter-with-llm/docker-compose.yaml b/src/workbench-jupyter-with-llm/docker-compose.yaml new file mode 100644 index 000000000..18f191280 --- /dev/null +++ b/src/workbench-jupyter-with-llm/docker-compose.yaml @@ -0,0 +1,26 @@ +include: + - ../jupyter-common/jupyter-common-compose.yaml +services: + app: + container_name: "application-server" + build: + context: . 
+ additional_contexts: + jupyter-extension-builder: service:jupyter-common-extension-builder + user: "jupyter" + restart: always + volumes: + - .:/workspace:cached + ports: + - "8888:8888" + networks: + - app-network + cap_add: + - SYS_ADMIN + devices: + - /dev/fuse + security_opt: + - apparmor:unconfined +networks: + app-network: + external: true From 5806fdd8d81d2b7d8f16609b14f8e7bea775b907 Mon Sep 17 00:00:00 2001 From: Navid Zolghadr Date: Fri, 16 Jan 2026 18:56:51 -0500 Subject: [PATCH 06/86] Fix the gemini cli issue --- features/src/wb-mcp-server/main.go | 11 +++++++++-- src/workbench-jupyter-with-llm/.devcontainer.json | 8 +++----- src/workbench-jupyter-with-llm/README.md | 2 +- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/features/src/wb-mcp-server/main.go b/features/src/wb-mcp-server/main.go index 6ed36c5cd..82298eadc 100644 --- a/features/src/wb-mcp-server/main.go +++ b/features/src/wb-mcp-server/main.go @@ -2688,6 +2688,10 @@ func handleRequest(req JSONRPCRequest) JSONRPCResponse { ServerInfo: ServerInfo{Name: "wb-mcp-server", Version: "2.0.0"}, }, } + case "notifications/initialized": + // Client sends this notification after receiving initialize response + // No response needed for notifications + return JSONRPCResponse{} case "tools/list": return JSONRPCResponse{JSONRPC: "2.0", ID: req.ID, Result: ListToolsResult{Tools: wbTools}} case "tools/call": @@ -2724,7 +2728,10 @@ func main() { } response := handleRequest(req) - responseBytes, _ := json.Marshal(response) - fmt.Println(string(responseBytes)) + // Only send response if there's a result or error (skip empty responses for notifications) + if response.Result != nil || response.Error != nil { + responseBytes, _ := json.Marshal(response) + fmt.Println(string(responseBytes)) + } } } diff --git a/src/workbench-jupyter-with-llm/.devcontainer.json b/src/workbench-jupyter-with-llm/.devcontainer.json index 0930e0321..98c241d3e 100644 --- a/src/workbench-jupyter-with-llm/.devcontainer.json 
+++ b/src/workbench-jupyter-with-llm/.devcontainer.json @@ -6,11 +6,9 @@ "shutdownAction": "none", "workspaceFolder": "/workspace", "postCreateCommand": [ - "./startupscript/post-startup.sh", - "jupyter", - "/home/jupyter", - "${templateOption:cloud}", - "${templateOption:login}" + "bash", + "-c", + "./startupscript/post-startup.sh jupyter /home/jupyter \"${templateOption:cloud}\" \"${templateOption:login}\" && echo '' >> /home/jupyter/.bashrc && echo '# Unset GOOGLE_CLOUD_PROJECT for Gemini CLI compatibility' >> /home/jupyter/.bashrc && echo 'unset GOOGLE_CLOUD_PROJECT' >> /home/jupyter/.bashrc" ], // re-mount bucket files on container start up "postStartCommand": [ diff --git a/src/workbench-jupyter-with-llm/README.md b/src/workbench-jupyter-with-llm/README.md index 2966e9099..b089454f8 100644 --- a/src/workbench-jupyter-with-llm/README.md +++ b/src/workbench-jupyter-with-llm/README.md @@ -1,5 +1,5 @@ -# Navid Workbench Jupyter (navid-workbench-jupyter) +# Workbench Jupyter with LLM tools Workbench JupyterLab with integrated AI assistance through Gemini CLI, Claude CLI, and MCP server support for enhanced development capabilities. 
From f704b3c022b2eb68895682e6073af5bcad266b43 Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Tue, 10 Feb 2026 16:09:01 -0500 Subject: [PATCH 07/86] Add LLM Context Generator feature - New feature: llm-context - generates CLAUDE.md for Claude Code auto-discovery - Includes generate-context.sh script with embedded skill files - Auto-runs on container startup to provide workspace context - Updated workbench-jupyter-with-llm app to include the new feature The CLAUDE.md file includes: - Workspace metadata (name, ID, cloud platform, role) - Resource paths and environment variables - Data exploration cheatsheet - Data persistence guidance - MCP vs CLI usage guide - Custom app creation skill --- features/src/llm-context/README.md | 141 +++ .../src/llm-context/devcontainer-feature.json | 34 + features/src/llm-context/generate-context.sh | 1001 +++++++++++++++++ features/src/llm-context/install.sh | 114 ++ .../.devcontainer.json | 5 + src/workbench-jupyter-with-llm/README.md | 22 + 6 files changed, 1317 insertions(+) create mode 100644 features/src/llm-context/README.md create mode 100644 features/src/llm-context/devcontainer-feature.json create mode 100755 features/src/llm-context/generate-context.sh create mode 100644 features/src/llm-context/install.sh diff --git a/features/src/llm-context/README.md b/features/src/llm-context/README.md new file mode 100644 index 000000000..6ae4800b7 --- /dev/null +++ b/features/src/llm-context/README.md @@ -0,0 +1,141 @@ +# LLM Context Generator (llm-context) + +A devcontainer feature that generates context files for LLMs (Claude Code, Gemini CLI, etc.) to understand the current Workbench workspace. + +## What It Does + +When installed, this feature: + +1. **Generates `~/CLAUDE.md`** - A comprehensive context file that Claude Code auto-discovers +2. **Provides workspace context** - Current workspace, resources, workflows, and tools +3. **Includes skill files** - Detailed guides for specific tasks (e.g., creating custom apps) +4. 
**Sets up environment** - Aliases and variables for easy context regeneration + +## Usage + +### In a devcontainer.json + +```json +{ + "features": { + "ghcr.io/verily-src/workbench-app-devcontainers/llm-context:latest": { + "autorun": true + } + } +} +``` + +### Options + +| Option | Type | Default | Description | +|--------|------|---------|-------------| +| `autorun` | boolean | `true` | Automatically generate context on container start | + +### Manual Generation + +```bash +# Generate/refresh context +generate-llm-context + +# Or use the alias +refresh-context + +# Or run directly +/opt/llm-context/generate-context.sh +``` + +## What Gets Generated + +### ~/CLAUDE.md + +Claude Code automatically reads `~/CLAUDE.md` on startup. This file includes: + +- **Workspace info**: Name, ID, cloud platform, your role +- **Resource summary**: All resources with cloud paths +- **Quick reference JSON**: Machine-readable resource paths and environment variables +- **Data exploration commands**: How to query BigQuery, list GCS files +- **Best practices**: Data persistence, cost awareness, MCP vs CLI guidance +- **Skill references**: Links to detailed guides for specific tasks + +### ~/.workbench/skills/ + +Detailed skill files for specific topics: + +| Skill | File | Description | +|-------|------|-------------| +| Custom Apps | `CUSTOM_APP.md` | How to create Workbench custom apps | + +## How Claude Code Discovers Context + +1. Claude Code checks for `~/CLAUDE.md` on startup +2. If found, it reads the file and uses it as initial context +3. 
The file references skill files that Claude reads on-demand + +## Dependencies + +This feature depends on: + +- **workbench-tools**: Provides the `wb` CLI for fetching workspace data +- **jq**: JSON processing (installed automatically if missing) + +## Example Output + +After running, you'll see: + +``` +========================================== + Workbench LLM Context Generator +========================================== + +[INFO] Checking prerequisites... +[INFO] Prerequisites OK +[INFO] Setting up directories... +[INFO] Installing skill files... +[INFO] Fetching workspace information... +[INFO] Fetching resources... +[INFO] Fetching workflows... +[INFO] Fetching apps... +[INFO] Generating CLAUDE.md... +[INFO] Created /home/jupyter/.workbench/CLAUDE.md +[INFO] Created symlink ~/CLAUDE.md → /home/jupyter/.workbench/CLAUDE.md + +[INFO] Context generation complete! + +Generated file: + - /home/jupyter/.workbench/CLAUDE.md + - ~/CLAUDE.md (symlink for auto-discovery) + +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ +✅ Claude Code will automatically discover ~/CLAUDE.md + +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ +``` + +## Troubleshooting + +### Context not generating? + +1. Check if workspace is set: `wb workspace describe` +2. If not authenticated: `wb auth login --mode=APP_DEFAULT_CREDENTIALS` +3. Then set workspace: `wb workspace set <workspace-id>` + +### Claude Code not seeing context? + +1. Ensure `~/CLAUDE.md` exists: `ls -la ~/CLAUDE.md` +2. Check it's not empty: `head ~/CLAUDE.md` +3. Restart Claude Code to re-read the file + +### Need to refresh after workspace changes? + +```bash +refresh-context +``` + +## Integration with MCP Server + +This feature works alongside the `wb-mcp-server` feature: + +- **CLAUDE.md**: Provides static context (workspace info, how-to guides) +- **MCP Server**: Provides dynamic tools (list resources, run queries in real-time) + +Together, they give LLMs full context AND active capabilities. 
diff --git a/features/src/llm-context/devcontainer-feature.json b/features/src/llm-context/devcontainer-feature.json new file mode 100644 index 000000000..6b4e3ee09 --- /dev/null +++ b/features/src/llm-context/devcontainer-feature.json @@ -0,0 +1,34 @@ +{ + "id": "llm-context", + "version": "1.0.0", + "name": "LLM Context Generator", + "description": "Generates CLAUDE.md context file for LLMs (Claude Code, etc.) with Workbench workspace information", + "documentationURL": "https://github.com/verily-src/workbench-app-devcontainers/tree/master/features/src/llm-context", + "options": { + "username": { + "type": "string", + "default": "root", + "description": "Username for the container user" + }, + "userHomeDir": { + "type": "string", + "default": "/root", + "description": "Home directory for the container user" + }, + "autorun": { + "type": "boolean", + "default": true, + "description": "Automatically generate context on container start" + } + }, + "containerEnv": { + "LLM_CONTEXT_ENABLED": "true" + }, + "installsAfter": [ + "ghcr.io/verily-src/workbench-app-devcontainers/workbench-tools" + ], + "dependsOn": { + "ghcr.io/verily-src/workbench-app-devcontainers/workbench-tools": {} + }, + "postStartCommand": "if [ \"${AUTORUN}\" = \"true\" ]; then /opt/llm-context/generate-context.sh || true; fi" +} diff --git a/features/src/llm-context/generate-context.sh b/features/src/llm-context/generate-context.sh new file mode 100755 index 000000000..2ed3ea4ea --- /dev/null +++ b/features/src/llm-context/generate-context.sh @@ -0,0 +1,1001 @@ +#!/bin/bash +# +# Workbench LLM Context Generator +# +# This script generates a single CLAUDE.md file that provides LLMs (like +# Claude Code) with full context about the current Workbench workspace, +# resources, workflows, and available tools. The file includes embedded +# JSON for machine-readable data. 
+# +# Usage: ./generate-context.sh +# +# Prerequisites: +# - Workbench CLI (wb) installed and authenticated +# - jq installed for JSON processing +# - Active workspace set (wb workspace set ) +# +# CLI JSON Field Reference: +# Workspace (UFWorkspaceLight.java): +# - id: user-facing ID (e.g., "my-workspace") +# - uuid: UUID +# - name: display name +# - description +# - cloudPlatform: GCP or AWS +# - googleProjectId, awsAccountId +# - highestRole: OWNER, WRITER, READER +# - orgId, podId +# - userEmail +# - createdDate, lastUpdatedDate +# - properties: Map +# +# Resource (UFResource.java): +# - id: resource name +# - uuid +# - description +# - resourceType: GCS_BUCKET, BQ_DATASET, GIT_REPO, etc. +# - stewardshipType: CONTROLLED, REFERENCED +# - region +# - For GCS: bucketName, location +# - For BQ: projectId, datasetId +# +# Workflow (UFWorkflow.java): +# - id: name +# - workflowId: UUID +# - displayName +# - description +# - bucketSource or gitSource +# + +set -e + +# Configuration +CONTEXT_DIR="${HOME}/.workbench" +SKILLS_DIR="${CONTEXT_DIR}/skills" +CLAUDE_FILE="${CONTEXT_DIR}/CLAUDE.md" +# Visible symlink in home directory for Claude Code auto-discovery +VISIBLE_CLAUDE_SYMLINK="${HOME}/CLAUDE.md" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +log_info() { + echo -e "${GREEN}[INFO]${NC} $1" >&2 +} + +log_warn() { + echo -e "${YELLOW}[WARN]${NC} $1" >&2 +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" >&2 +} + +# Check prerequisites +check_prerequisites() { + log_info "Checking prerequisites..." + + if ! command -v wb &> /dev/null; then + log_error "Workbench CLI (wb) not found. Please install it first." + exit 1 + fi + + if ! command -v jq &> /dev/null; then + log_error "jq is required but not found. Please install jq." + exit 1 + fi + + # Check if workspace is set + if ! wb workspace describe --format=json &> /dev/null; then + log_error "No workspace set or not authenticated. 
Please run:" + log_error " wb auth login --mode=APP_DEFAULT_CREDENTIALS" + log_error " wb workspace set " + log_error "" + log_error "Note: Use --mode=APP_DEFAULT_CREDENTIALS inside Workbench apps" + exit 1 + fi + + log_info "Prerequisites OK" +} + +# Create output directory +setup_directories() { + log_info "Setting up directories..." + mkdir -p "${CONTEXT_DIR}" + mkdir -p "${SKILLS_DIR}" +} + +# Install skill files (embedded - no network needed) +install_skills() { + log_info "Installing skill files..." + + # Create CUSTOM_APP.md skill (full version, embedded) + log_info "Creating CUSTOM_APP.md skill..." + cat > "${SKILLS_DIR}/CUSTOM_APP.md" << 'SKILL_EOF' +# Creating Custom Workbench Apps + +**Practical guide for creating simple, reliable Workbench apps.** + +> **When to use this guide:** For simple apps (Flask APIs, static sites, custom tools). +> For apps needing Workbench CLI, gcloud, or Jupyter, see the [full-featured approach](https://github.com/verily-src/workbench-app-devcontainers). + +## TL;DR - The Minimal Pattern That Works + +Workbench custom apps need exactly **three things**: +1. Container named `application-server` +2. Connected to `app-network` (external Docker network) +3. HTTP server on a port + +**That's it.** Everything else is optional (and often causes problems). + +--- + +## The Minimal Working Pattern (Copy This) + +### File 1: `.devcontainer.json` +```json +{ + "name": "Your App Name", + "dockerComposeFile": "docker-compose.yaml", + "service": "app", + "shutdownAction": "none", + "workspaceFolder": "/workspace", + "remoteUser": "root" +} +``` + +### File 2: `docker-compose.yaml` +```yaml +services: + app: + container_name: "application-server" + build: + context: ../.. 
+ dockerfile: src/YOUR-APP-NAME/Dockerfile + restart: always + ports: + - "8080:8080" + networks: + - app-network + +networks: + app-network: + external: true +``` + +### File 3: `Dockerfile` +```dockerfile +FROM python:3.11-slim + +WORKDIR /app + +COPY src/YOUR-APP-NAME/app/requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY src/YOUR-APP-NAME/app/ . + +EXPOSE 8080 + +CMD ["python", "your_app.py"] +``` + +### File 4: `devcontainer-template.json` +```json +{ + "id": "your-app-name", + "description": "Your app description", + "version": "1.0.0", + "name": "Your App Name", + "options": {}, + "platforms": ["Any"] +} +``` + +--- + +## Directory Structure + +``` +src/YOUR-APP-NAME/ +├── .devcontainer.json +├── devcontainer-template.json +├── docker-compose.yaml +├── Dockerfile +├── README.md +└── app/ + ├── your_app.py + ├── requirements.txt + └── (other files) +``` + +--- + +## What NOT To Do (Lessons Learned) + +### DON'T use complex base images unless needed +❌ `workbench-jupyter` base image - Has its own startup config that conflicts with CMD overrides +✅ `python:3.11-slim` - Clean, simple, no surprises + +### DON'T use devcontainer features +❌ Features like `ghcr.io/dhoeric/features/google-cloud-cli` - Uses deprecated `apt-key`, fails on newer Debian +❌ Features like `workbench-tools` - Expect specific system packages +✅ Install what you need directly in the Dockerfile + +### DON'T use postCreateCommand/postStartCommand +❌ `./startupscript/post-startup.sh` - Expects specific user/home structure, may fail +✅ Self-contained Dockerfile with everything built in + +### DON'T use supervisor for multiple processes (unless truly needed) +❌ Supervisor + Jupyter + Flask - Complex, many failure points +✅ Single process serving everything (Flask can serve static files) + +### DON'T fight with Jupyter config +❌ Overriding CMD on workbench-jupyter image - Causes `root_dir`/`file_to_run` conflicts +✅ Don't use Jupyter at all if you don't need it + +--- + 
+## Flask App: Serve Static Files Directly + +If your app has a Flask backend + static HTML, just have Flask serve everything: + +```python +import os +from flask import Flask +from flask_cors import CORS + +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +app = Flask(__name__, static_folder=SCRIPT_DIR, static_url_path='/static') +CORS(app) + +@app.route('/') +def serve_index(): + return app.send_static_file('index.html') + +# ... your other routes ... + +if __name__ == '__main__': + app.run(host='0.0.0.0', port=8080, debug=False, threaded=True) +``` + +**No separate HTTP server needed. No supervisor. One process.** + +--- + +## Common Errors and Fixes + +### Error: `apt-key: command not found` +**Cause:** Devcontainer feature uses deprecated apt-key on newer Debian +**Fix:** Remove the feature from .devcontainer.json, install directly in Dockerfile if needed + +### Error: `root_dir and file_to_run are incompatible` +**Cause:** Overriding CMD on workbench-jupyter base image conflicts with its config +**Fix:** Don't use workbench-jupyter. 
Use python:3.11-slim instead + +### Error: `supports_credentials in conjunction with origin '*'` +**Cause:** Flask-CORS config conflict +**Fix:** Just use `CORS(app)` with no options + +### Error: Container restart loop +**Cause:** Main process exits immediately +**Fix:** Make sure your CMD runs a long-lived process (Flask server, not a script that exits) + +### Error: `Application-server port is empty` +**Cause:** Container not exposing port correctly, or app crashing before binding +**Fix:** Check `docker logs application-server` to see the actual error + +--- + +## Deployment + +### Deploy to Workbench +In Workbench UI, create custom app with: +- **Repository:** `git@github.com:YOUR-ORG/YOUR-REPO.git` +- **Branch:** `your-branch` +- **Folder:** `src/YOUR-APP-NAME` + +### For faster deploys (optional): Push to GAR +```bash +# Build +cd src/YOUR-APP-NAME +docker compose build + +# Tag +export TAG="us-central1-docker.pkg.dev/PROJECT/REPO/NAME:$(date +'%Y%m%d')" +docker tag YOUR-APP-NAME-app:latest ${TAG} + +# Push +docker push ${TAG} + +# Update docker-compose.yaml to use image: instead of build: +``` + +--- + +## Local Testing + +```bash +# Create required network +docker network create app-network + +# Build and run +cd src/YOUR-APP-NAME +docker compose build +docker compose up + +# Access at http://localhost:8080 +``` + +--- + +## Debugging on VM + +```bash +# SSH to VM, then: +docker logs application-server --tail 100 +docker exec -it application-server /bin/sh +docker ps -a +``` + +--- + +## Reference Implementations + +All examples are in the public repo: https://github.com/verily-src/workbench-app-devcontainers + +| App | Description | Complexity | +|-----|-------------|------------| +| `src/playground/` | Multi-service app with Caddy | Simple | +| `src/vscode/` | VS Code Server on port 8443 | Pre-built image | +| `src/r-analysis/` | RStudio on port 8787 | Pre-built image | +| `src/workbench-jupyter/` | JupyterLab with Workbench tools | Full-featured | + 
#######################################
# Run `wb <args...> --format=json` and print its JSON output.
#
# The output is captured first so that a command which fails after writing
# partial data, or which succeeds but prints nothing, yields exactly the
# fallback document instead of a corrupt or empty stream.
#
# Arguments: $1 - fallback JSON document (e.g. '{}' or '[]')
#            $2.. - wb sub-command words (without --format)
# Outputs:   JSON document on stdout
# Returns:   0 always (fetching is best-effort)
#######################################
_wb_json_or_default() {
  local fallback="$1"
  shift

  local output
  if output="$(wb "$@" --format=json 2>/dev/null)" && [[ -n "${output}" ]]; then
    printf '%s\n' "${output}"
  else
    printf '%s\n' "${fallback}"
  fi
}

# Fetch workspace information (prints '{}' when unavailable).
fetch_workspace() {
  log_info "Fetching workspace information..."
  _wb_json_or_default '{}' workspace describe
}

# Fetch resources (prints '[]' when unavailable).
fetch_resources() {
  log_info "Fetching resources..."
  _wb_json_or_default '[]' resource list
}

# Fetch workflows; may not exist in all workspaces (prints '[]' then).
fetch_workflows() {
  log_info "Fetching workflows..."
  _wb_json_or_default '[]' workflow list
}

# Fetch apps (prints '[]' when unavailable).
fetch_apps() {
  log_info "Fetching apps..."
  _wb_json_or_default '[]' app list
}

#######################################
# Build the machine-readable lookup embedded in CLAUDE.md.
#
# Produces an object with two maps:
#   resourcePaths: resource id -> cloud path
#   envVars:       WORKBENCH_<id with '-' replaced by '_'> -> cloud path
# Resources of unknown type, without a string id, or missing the fields the
# path is built from are left out (instead of producing paths like
# "gs://null"). Nothing is written to disk.
#
# Arguments: $1 - JSON array of resources (as printed by `wb resource list`)
# Outputs:   pretty-printed JSON object on stdout; an object with two empty
#            maps when the input is not a usable JSON array
# Returns:   0 always
#######################################
generate_embedded_json() {
  local resources="${1:-[]}"
  local embedded

  # One jq pass: the path rule is defined once and feeds both maps.
  if ! embedded="$(
    jq '
      def cloud_path:
        if .resourceType == "GCS_BUCKET" and .bucketName != null then
          "gs://\(.bucketName)"
        elif .resourceType == "GCS_OBJECT" and .bucketName != null then
          "gs://\(.bucketName)/\(.objectName // "")"
        elif .resourceType == "BQ_DATASET" and .projectId != null and .datasetId != null then
          "\(.projectId).\(.datasetId)"
        elif .resourceType == "BQ_TABLE" and .projectId != null and .datasetId != null then
          "\(.projectId).\(.datasetId).\(.tableId // "")"
        elif .resourceType == "GIT_REPO" then
          .gitRepoUrl
        else
          null
        end;

      (if type == "array" then . else [] end)
      | map(
          select(type == "object")
          | select((.id | type) == "string")
          | {id, cloud: cloud_path}
          | select(.cloud != null)
        )
      | {
          resourcePaths: (map({key: .id, value: .cloud}) | from_entries),
          envVars: (
            map({key: ("WORKBENCH_" + (.id | gsub("-"; "_"))), value: .cloud})
            | from_entries
          )
        }
    ' <<<"${resources}" 2>/dev/null
  )" || [[ -z "${embedded}" ]]; then
    # Malformed or empty input: still hand the caller a valid document.
    embedded=$'{\n  "resourcePaths": {},\n  "envVars": {}\n}'
  fi

  printf '%s\n' "${embedded}"
}

#######################################
# Render the "available storage buckets" Markdown for the data persistence
# section: a table with one row per GCS_BUCKET resource, or a hint on how to
# create a bucket when the workspace has none.
#
# Arguments: $1 - JSON array of resources (as printed by `wb resource list`)
# Outputs:   Markdown on stdout
# Returns:   0 always
#######################################
generate_bucket_list() {
  local resources="${1:-[]}"
  local rows

  # Newlines and '|' inside a description would break the table layout, so
  # they are flattened / escaped before the row is assembled.
  rows="$(
    jq -r '
      def md_cell:
        tostring
        | gsub("[\r\n]+"; " ")
        | gsub("\\|"; "\\|")
        | if . == "" then "—" else . end;

      (if type == "array" then . else [] end)
      | .[]
      | select(type == "object")
      | select(.resourceType == "GCS_BUCKET")
      | "| `gs://\(.bucketName // "unknown")/` | `\(.id // "—")` | \((.description // "") | md_cell) |"
    ' <<<"${resources}" 2>/dev/null
  )" || rows=""

  if [[ -z "${rows}" ]]; then
    printf '%s\n' \
      "*No GCS buckets in this workspace.* Create one with:" \
      '```bash' \
      'wb resource create gcs-bucket --name my-storage --description "Storage for results"' \
      '```'
    return 0
  fi

  printf '%s\n' \
    "| Bucket Name | Resource ID | Description |" \
    "|-------------|-------------|-------------|" \
    "${rows}"
}
+ +--- + +## What is Verily Workbench? + +Verily Workbench is a platform that enables researchers to: +- Access and analyze biomedical data (clinical, genomics, wearables, imaging) +- Run computational workflows at scale (WDL, Nextflow) +- Collaborate securely with governance and policy enforcement +- Use familiar tools (Jupyter, RStudio, VS Code) in the cloud + +--- + +## Current Workspace + +| Property | Value | +|----------|-------| +| **Name** | ${ws_name} | +| **ID** | \`${ws_id}\` | +| **Cloud Platform** | ${ws_cloud} | +| **Project/Account** | \`${project_display}\` | +| **Your Role** | ${ws_role} | +| **User** | ${ws_user} | +| **Organization** | ${ws_org:-"—"} | +| **Server** | ${ws_server:-"—"} | + +### Description +${ws_desc} + +--- + +## Key Concepts + +### Workspaces +A **workspace** is a secure container for your research project. It contains: +- **Resources**: Cloud assets like buckets, datasets, repos +- **Workflows**: Reproducible analysis pipelines +- **Apps**: Interactive compute environments (this app!) +- **Policies**: Access controls and constraints + +### Resources +Resources are cloud assets managed by Workbench: + +| Type | Description | CLI Create Command | +|------|-------------|-------------------| +| \`GCS_BUCKET\` | Google Cloud Storage bucket | \`wb resource create gcs-bucket\` | +| \`BQ_DATASET\` | BigQuery dataset | \`wb resource create bq-dataset\` | +| \`GIT_REPO\` | Git repository reference | \`wb resource add-ref git-repo\` | +| \`GCS_OBJECT\` | Individual GCS file reference | \`wb resource add-ref gcs-object\` | +| \`BQ_TABLE\` | BigQuery table reference | \`wb resource add-ref bq-table\` | + +**Environment Variables**: Each resource is available as \`\$WORKBENCH_\` (e.g., \`\$WORKBENCH_my_bucket\`). + +### Data Collections +Data collections are curated datasets in the Workbench catalog. When added to a workspace, their resources are cloned into **folders**. 
+ +#### Identifying Resources from Data Collections + +Use the **MCP server** to find which data collection a resource came from: + +1. **Use the MCP \`get_resource\` tool** to get full resource metadata including lineage +2. The \`resourceLineage\` array contains: + - \`sourceWorkspaceId\`: UUID of the data collection + - \`sourceResourceId\`: UUID of the original resource + +**Example:** Ask "Use get_resource to get lineage for resource 'clinical-bq-dataset'" + +The response includes: +\`\`\`json +{ + "resourceLineage": [ + { "sourceWorkspaceId": "abc123-...", "sourceResourceId": "def456-..." } + ] +} +\`\`\` + +### Workflows +Workflows are reproducible pipelines in WDL or Nextflow format, registered in the workspace. + +--- + +## ⚠️ Important: Data Persistence + +> **LOCAL STORAGE IS EPHEMERAL.** Files saved on this app's local disk will be **lost** when the app stops or restarts. + +### The Problem +Users often create files locally (notebooks, scripts, outputs) without realizing they won't persist. When the app shuts down, all local work is lost. + +### The Solution +**Save important files to a cloud bucket** in your workspace. Buckets persist independently of apps. 
+ +### Available Storage Buckets in This Workspace + +${bucket_list} + +### How to Save Local Files to Cloud Storage + +\`\`\`bash +# Copy a single file to a bucket +gsutil cp my-notebook.ipynb gs:///notebooks/ + +# Copy a directory recursively +gsutil -m cp -r ./my-results/ gs:///results/ + +# Sync a local directory (only copies changed files) +gsutil -m rsync -r ./my-project/ gs:///projects/my-project/ +\`\`\` + +### In Python +\`\`\`python +# Save dataframes directly to GCS +df.to_parquet('gs://bucket-name/path/output.parquet') +df.to_csv('gs://bucket-name/path/output.csv') + +# Or use storage client +from google.cloud import storage +client = storage.Client() +bucket = client.bucket('bucket-name') +blob = bucket.blob('path/file.csv') +blob.upload_from_filename('local-file.csv') +\`\`\` + +### Suggested Bucket Organization +\`\`\` +gs://your-bucket/ +├── notebooks/ # Jupyter notebooks +├── scripts/ # Python/R scripts +├── data/raw/ # Input data +├── data/processed/ # Cleaned data +├── results/ # Analysis outputs +└── models/ # Trained ML models +\`\`\` + +### 🤖 LLM Guidance + +**As an AI assistant, you should proactively help users persist their work:** + +1. **When users create files locally**, ask: *"Would you like me to save this to a cloud bucket so it persists after the app stops?"* + +2. **When users finish analysis**, suggest: *"Your results are saved locally. Should I copy them to a bucket for long-term storage?"* + +3. **At session end**, remind: *"Remember to save any important local files to cloud storage before stopping the app."* + +4. **Check local disk usage** to identify files that need saving: + \`\`\`bash + du -sh ~/* + ls -la ~/ + \`\`\` + +--- + +## 🔍 Data Exploration Cheatsheet + +This is the **most important section** for quickly discovering and accessing data. 
+ +### Step 1: Find Your Resources +\`\`\`bash +wb resource list --format=json | jq '.[] | {name: .id, type: .resourceType}' +\`\`\` + +### Step 2: Use Environment Variables (Easiest!) +Every resource is available as an environment variable: +\`\`\`bash +# Pattern: \$WORKBENCH_<name> +echo \$WORKBENCH_my_bucket # → gs://actual-bucket-name +env | grep WORKBENCH_ # List all +\`\`\` + +### Step 3: Get Cloud Paths +\`\`\`bash +wb resource describe <name> --format=json +# Look for: bucketName, projectId, datasetId, gitRepoUrl +\`\`\` + +### Step 4: Preview Data Quickly + +**BigQuery:** +\`\`\`bash +bq head -n 10 <project>:<dataset>.<table> # Quick preview +bq show --schema <project>:<dataset>.<table> # Column names/types +bq show --format=prettyjson <project>:<dataset>.<table> | jq '{rows: .numRows}' # Row count +\`\`\`
+ +**GCS:** +\`\`\`bash +gsutil ls gs://<bucket>/ # List files +gsutil cat -r 0-1024 gs://<bucket>/file.csv # Preview first 1KB +\`\`\` + +### 🤖 LLM Quick Patterns + +| Question | Command | +|----------|---------| +| "What data is available?" | \`wb resource list\` | +| "What tables in dataset?" | \`bq ls <project>:<dataset>\` | +| "What columns in table?" | \`bq show --schema <project>:<dataset>.<table>\` | +| "How big is this table?" | \`bq show --format=prettyjson ... \\| jq '{rows: .numRows}'\` | +| "Show sample data" | \`bq head -n 5 <project>:<dataset>.<table>\` |
+ +--- + +## How to Discover Data (Detailed) + +### List Resources +\`\`\`bash +wb resource list +wb resource list --format=json +wb resource describe <name> +\`\`\` + +### Explore GCS Buckets +\`\`\`bash +gsutil ls gs://<bucket>/ +gsutil ls -l gs://<bucket>/path/ +gsutil cat gs://<bucket>/path/file.txt +\`\`\` + +### Explore BigQuery +\`\`\`bash +bq ls <project>:<dataset> +bq show <project>:<dataset>.<table>
+bq query --use_legacy_sql=false 'SELECT * FROM \`project.dataset.table\` LIMIT 10' +\`\`\` + +--- + +## How to Query Data + +### BigQuery (CLI) +\`\`\`bash +bq query --use_legacy_sql=false 'SELECT * FROM \`project.dataset.table\` LIMIT 100' +\`\`\` + +### BigQuery (Python) +\`\`\`python +from google.cloud import bigquery +client = bigquery.Client() +df = client.query("SELECT * FROM \\\`project.dataset.table\\\` LIMIT 100").to_dataframe() +\`\`\` + +### GCS Files (Python) +\`\`\`python +import pandas as pd +df = pd.read_parquet('gs://bucket/path/file.parquet') +df = pd.read_csv('gs://bucket/path/file.csv') +\`\`\` + +--- + +## How to Run Workflows + +\`\`\`bash +# List workflows +wb workflow list + +# Run a workflow +wb workflow run --input param=value + +# Check status +wb workflow describe + +# View logs +wb workflow logs +\`\`\` + +--- + +## How to Create Resources + +\`\`\`bash +# GCS bucket +wb resource create gcs-bucket --name my-bucket --description "My bucket" + +# BigQuery dataset +wb resource create bq-dataset --name my-dataset --description "My dataset" + +# Reference external GCS bucket +wb resource add-ref gcs-bucket --name external-data --bucket-name existing-bucket +\`\`\` + +--- + +## MCP vs CLI: When to Use Each + +This app has **two interfaces** to Workbench functionality: + +| Interface | Best For | Pros | Cons | +|-----------|----------|------|------| +| **MCP Tools** | LLM operations | Structured responses, no shell needed, faster | Limited tool set | +| **CLI (\`wb\`)** | Complex operations, fallback | Full feature coverage, human-friendly | Requires shell execution, text parsing | + +### 🤖 LLM Decision Guide + +1. **Prefer MCP tools** when the operation is supported — they return structured data and don't require shell execution +2. **Fall back to CLI** when MCP doesn't have the tool, or for complex/chained operations +3. 
**Use cloud CLIs directly** (\`gsutil\`, \`bq\`, \`gcloud\`) for low-level cloud operations + +### Example: Same Operation, Two Ways + +**List resources:** +- MCP: Use \`list_resources\` tool → returns JSON array +- CLI: Run \`wb resource list --format=json\` → parse stdout + +**Query BigQuery:** +- MCP: Use \`query_bigquery\` tool with SQL parameter → returns results +- CLI: Run \`bq query --use_legacy_sql=false 'SELECT ...'\` → parse output + +--- + +## MCP Tools Available + +The Workbench MCP server exposes these tools for programmatic LLM access: + +| MCP Tool | CLI Equivalent | Description | +|----------|----------------|-------------| +| \`list_resources\` | \`wb resource list\` | List all resources in the workspace | +| \`get_resource\` | \`wb resource describe \` | Get details about a specific resource | +| \`query_bigquery\` | \`bq query\` | Run SQL queries against BigQuery | +| \`run_workflow\` | \`wb workflow run\` | Submit a WDL/Nextflow workflow | +| \`get_workflow_status\` | \`wb workflow describe\` | Check status of a workflow run | +| \`build_cohort\` | *(UI only)* | Create a cohort using Data Explorer | +| \`export_cohort\` | *(UI only)* | Export cohort data to a bucket | +| \`create_bucket\` | \`wb resource create gcs-bucket\` | Create a new GCS bucket | +| \`list_files\` | \`gsutil ls\` | List files in a GCS bucket | +| \`read_file\` | \`gsutil cat\` | Read contents of a file | + +**Not available via MCP (use CLI instead):** +- \`wb workspace set\` — switch workspaces +- \`wb auth login\` — re-authenticate +- \`wb workflow logs\` — view workflow logs +- \`wb resource delete\` — delete resources +- Complex resource creation with many options + +--- + +## CLI Quick Reference + +\`\`\`bash +# Auth +wb auth status # Check authentication +wb auth login # Re-authenticate + +# Workspace +wb workspace describe # Current workspace details +wb workspace list # All your workspaces +wb workspace set # Switch workspace + +# Resources +wb resource list # List 
resources +wb resource list --format=json # JSON output +wb resource describe # Resource details +wb resource delete # Delete resource + +# Workflows +wb workflow list # List workflows +wb workflow run # Run workflow +wb workflow describe # Run status +wb workflow logs # Run logs + +# Apps +wb app list # List running apps +wb app describe # App details +\`\`\` + +--- + +## Best Practices + +1. **Explore before acting**: Use \`LIMIT\` in queries, \`ls\` before copying +2. **Use environment variables**: \`\$WORKBENCH_\` for scripts +3. **Cost awareness**: Large queries and compute cost money +4. **Reproducibility**: Document analysis, version code +5. **Confirm destructive actions**: Check before deleting + +--- + +## Creating Custom Apps + +**Two approaches depending on complexity:** + +### Simple Apps (Recommended) +Workbench custom apps need exactly **three things**: +1. Container named \`application-server\` +2. Connected to \`app-network\` (external Docker network) +3. HTTP server on a port + +⚠️ **Avoid complexity:** Devcontainer features and startup scripts often fail. + +**📖 For detailed guide:** \`Read ~/.workbench/skills/CUSTOM_APP.md\` + +### Full-Featured Apps +For apps needing Workbench CLI, gcloud, etc.: +📦 https://github.com/verily-src/workbench-app-devcontainers + +--- + +## Available Skills + +When users ask about specific topics, **read these skill files** for detailed guidance: + +| Topic | Skill File | +|-------|------------| +| Creating custom apps | \`~/.workbench/skills/CUSTOM_APP.md\` | + +**How to use:** When the topic comes up, read the skill file first. 
+ +--- + +## Quick Reference (Machine-Readable) + +Use this JSON for exact resource paths and environment variables: + +\`\`\`json +${embedded_json} +\`\`\` + +**Usage:** +- \`resourcePaths["my-bucket"]\` → exact GCS/BQ path +- \`envVars["WORKBENCH_my_bucket"]\` → environment variable value + +To refresh after workspace changes: +\`\`\`bash +~/.workbench/generate-context.sh +\`\`\` + +--- + +## Getting Help + +- **Docs**: https://support.workbench.verily.com +- **Custom Apps**: https://github.com/verily-src/workbench-app-devcontainers +- **CLI Help**: \`wb --help\` or \`wb --help\` +- **Support**: support@workbench.verily.com + +--- + +*Generated: $(date -u +"%Y-%m-%d %H:%M:%S UTC")* +EOF + + log_info "Created ${CLAUDE_FILE}" +} + +# Main function +main() { + echo "" + echo "==========================================" + echo " Workbench LLM Context Generator" + echo "==========================================" + echo "" + + check_prerequisites + setup_directories + install_skills + + # Fetch all data + WORKSPACE=$(fetch_workspace) + RESOURCES=$(fetch_resources) + WORKFLOWS=$(fetch_workflows) + APPS=$(fetch_apps) + + # Generate single CLAUDE.md file with embedded JSON + generate_claude_md "$WORKSPACE" "$RESOURCES" "$WORKFLOWS" "$APPS" + + # Create visible symlink in home directory for Claude Code auto-discovery + ln -sf "${CLAUDE_FILE}" "${VISIBLE_CLAUDE_SYMLINK}" + log_info "Created symlink ~/CLAUDE.md → ${CLAUDE_FILE}" + + echo "" >&2 + log_info "Context generation complete!" 
+ echo "" >&2 + echo "Generated file:" >&2 + echo " - ${CLAUDE_FILE}" >&2 + echo " - ~/CLAUDE.md (symlink for auto-discovery)" >&2 + echo "" >&2 + echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" >&2 + echo "✅ Claude Code will automatically discover ~/CLAUDE.md" >&2 + echo "" >&2 + echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" >&2 + echo "" >&2 +} + +# Run main +main "$@" diff --git a/features/src/llm-context/install.sh b/features/src/llm-context/install.sh new file mode 100644 index 000000000..87241975d --- /dev/null +++ b/features/src/llm-context/install.sh @@ -0,0 +1,114 @@ +#!/usr/bin/env bash + +# install.sh - Installs the LLM Context Generator for Workbench apps +# +# This feature generates a CLAUDE.md file that provides LLMs (like Claude Code) +# with context about the current Workbench workspace, resources, and tools. +# Claude Code auto-discovers ~/CLAUDE.md on startup. + +set -o errexit +set -o nounset +set -o pipefail +set -o xtrace + +# Options from devcontainer-feature.json +readonly USERNAME="${USERNAME:-"root"}" +USER_HOME_DIR="${USERHOMEDIR:-""}" +if [[ -z "${USER_HOME_DIR}" ]]; then + if [[ "${USERNAME}" == "root" ]]; then + USER_HOME_DIR="/root" + else + USER_HOME_DIR="/home/${USERNAME}" + fi +fi +readonly USER_HOME_DIR + +readonly AUTORUN="${AUTORUN:-"true"}" + +echo "Installing LLM Context Generator for user: ${USERNAME} (home: ${USER_HOME_DIR})" + +export DEBIAN_FRONTEND=noninteractive +export TZ=Etc/UTC + +readonly LLM_CONTEXT_DIR="/opt/llm-context" +readonly GENERATE_SCRIPT="${LLM_CONTEXT_DIR}/generate-context.sh" +readonly SKILLS_DIR="${LLM_CONTEXT_DIR}/skills" + +# Save the directory where the feature files are located +FEATURE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +readonly FEATURE_DIR + +echo "Starting LLM Context Generator installation..." 
+ +# Create installation directory +mkdir -p "${LLM_CONTEXT_DIR}" +mkdir -p "${SKILLS_DIR}" + +# Copy the generate-context.sh script +cp "${FEATURE_DIR}/generate-context.sh" "${GENERATE_SCRIPT}" +chmod +x "${GENERATE_SCRIPT}" + +# Copy skill files +if [[ -d "${FEATURE_DIR}/skills" ]]; then + cp -r "${FEATURE_DIR}/skills/"* "${SKILLS_DIR}/" 2>/dev/null || true +fi + +# Create user-specific directories +USER_WORKBENCH_DIR="${USER_HOME_DIR}/.workbench" +USER_SKILLS_DIR="${USER_WORKBENCH_DIR}/skills" +mkdir -p "${USER_WORKBENCH_DIR}" +mkdir -p "${USER_SKILLS_DIR}" + +# Create a wrapper script that generates context with proper user context +cat > "${LLM_CONTEXT_DIR}/run-as-user.sh" << 'WRAPPER_EOF' +#!/bin/bash +# Wrapper to run generate-context.sh with proper environment +set -e + +# Source user environment +if [[ -f ~/.bashrc ]]; then + source ~/.bashrc 2>/dev/null || true +fi + +# Run the generator +/opt/llm-context/generate-context.sh "$@" +WRAPPER_EOF +chmod +x "${LLM_CONTEXT_DIR}/run-as-user.sh" + +# Copy skill files to user directory +cp -r "${SKILLS_DIR}/"* "${USER_SKILLS_DIR}/" 2>/dev/null || true + +# Set ownership +chown -R "${USERNAME}:" "${LLM_CONTEXT_DIR}" +chown -R "${USERNAME}:" "${USER_WORKBENCH_DIR}" + +# Add to bashrc for easy access +cat >> "${USER_HOME_DIR}/.bashrc" << 'BASHRC_EOF' + +# LLM Context Generator +alias generate-llm-context='/opt/llm-context/generate-context.sh' +alias refresh-context='/opt/llm-context/generate-context.sh' +export LLM_CONTEXT_ENABLED=true +BASHRC_EOF + +chown "${USERNAME}:" "${USER_HOME_DIR}/.bashrc" + +echo "" +echo "==========================================" +echo "LLM Context Generator installation complete!" +echo "==========================================" +echo "" +echo "The context generator is installed at: ${GENERATE_SCRIPT}" +echo "Skills directory: ${USER_SKILLS_DIR}" +echo "" +if [[ "${AUTORUN}" == "true" ]]; then + echo "Auto-run is ENABLED: Context will be generated on container start." 
+else + echo "Auto-run is DISABLED: Run 'generate-llm-context' to generate context." +fi +echo "" +echo "After generation, Claude Code will auto-discover ~/CLAUDE.md" +echo "==========================================" +echo "" + +echo "Done!" diff --git a/src/workbench-jupyter-with-llm/.devcontainer.json b/src/workbench-jupyter-with-llm/.devcontainer.json index 98c241d3e..4e4714660 100644 --- a/src/workbench-jupyter-with-llm/.devcontainer.json +++ b/src/workbench-jupyter-with-llm/.devcontainer.json @@ -33,6 +33,11 @@ "./.devcontainer/features/wb-mcp-server": { "username": "jupyter", "userHomeDir": "/home/jupyter" + }, + "./.devcontainer/features/llm-context": { + "username": "jupyter", + "userHomeDir": "/home/jupyter", + "autorun": true } }, "remoteUser": "root", diff --git a/src/workbench-jupyter-with-llm/README.md b/src/workbench-jupyter-with-llm/README.md index b089454f8..ce1ca7d5f 100644 --- a/src/workbench-jupyter-with-llm/README.md +++ b/src/workbench-jupyter-with-llm/README.md @@ -22,9 +22,31 @@ This template includes the following integrated features: - **Gemini CLI** - Google Gemini AI assistant with MCP support - **Claude CLI** - Anthropic Claude AI assistant (from ghcr.io/anthropics/devcontainer-features/claude-code:1.0) - **WB MCP Server** - Workbench Model Context Protocol server for AI tool integration with workspace context +- **LLM Context Generator** - Automatically generates `~/CLAUDE.md` with workspace context for Claude Code All AI assistants are pre-configured to work with the Workbench MCP server for enhanced workspace awareness. 
+## LLM Context + +On startup, the app automatically generates a `~/CLAUDE.md` file that provides Claude Code with: + +- Current workspace information (name, ID, cloud platform, your role) +- Resource paths and environment variables +- Data exploration commands and best practices +- Links to skill files for detailed guidance + +Claude Code automatically discovers `~/CLAUDE.md` on startup, giving it immediate context about your Workbench environment. + +### Refreshing Context + +If you add or remove resources, refresh the context: + +```bash +refresh-context +# or +generate-llm-context +``` + --- _Note: This file was auto-generated from the [devcontainer-template.json](devcontainer-template.json). Add additional notes to a `NOTES.md`._ From 6527470e02e412d98793fddb87d8d8e00fc5856c Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Wed, 11 Feb 2026 13:51:31 -0500 Subject: [PATCH 08/86] Update llm-context devcontainer feature - Updated generate-context.sh with latest improvements - Fixed devcontainer-feature.json (removed problematic postStartCommand) - Improved install.sh with: - Auto-install jq if missing - Better error handling - Auto-run via .bashrc (runs in background on first terminal) - Checks if workspace is set before generating - Updated README with usage examples - Version 1.1.0 --- features/src/llm-context/README.md | 159 ++++------- .../src/llm-context/devcontainer-feature.json | 21 +- features/src/llm-context/generate-context.sh | 262 ++++++++---------- features/src/llm-context/install.sh | 87 +++--- 4 files changed, 236 insertions(+), 293 deletions(-) diff --git a/features/src/llm-context/README.md b/features/src/llm-context/README.md index 6ae4800b7..31443d692 100644 --- a/features/src/llm-context/README.md +++ b/features/src/llm-context/README.md @@ -1,141 +1,102 @@ -# LLM Context Generator (llm-context) +# LLM Context Generator (`llm-context`) -A devcontainer feature that generates context files for LLMs (Claude Code, Gemini CLI, etc.) 
to understand the current Workbench workspace. +A devcontainer feature that generates `~/CLAUDE.md` for LLMs (Claude Code, Gemini CLI, etc.) with Workbench workspace context. ## What It Does When installed, this feature: -1. **Generates `~/CLAUDE.md`** - A comprehensive context file that Claude Code auto-discovers -2. **Provides workspace context** - Current workspace, resources, workflows, and tools -3. **Includes skill files** - Detailed guides for specific tasks (e.g., creating custom apps) -4. **Sets up environment** - Aliases and variables for easy context regeneration +1. **Generates `~/CLAUDE.md`** - Claude Code auto-discovers this file on startup +2. **Provides workspace context** - Name, ID, role, resources, cloud paths +3. **Includes skill files** - Detailed guides (e.g., custom app creation) +4. **Sets up aliases** - `generate-llm-context`, `refresh-context` ## Usage -### In a devcontainer.json +### In `.devcontainer.json` ```json { "features": { - "ghcr.io/verily-src/workbench-app-devcontainers/llm-context:latest": { - "autorun": true + "ghcr.io/aculotti-verily/wb-app-mcp-and-context/llm-context:latest": { + "username": "jupyter", + "userHomeDir": "/home/jupyter" } } } ``` -### Options - -| Option | Type | Default | Description | -|--------|------|---------|-------------| -| `autorun` | boolean | `true` | Automatically generate context on container start | - -### Manual Generation - -```bash -# Generate/refresh context -generate-llm-context - -# Or use the alias -refresh-context +Or for local development: -# Or run directly -/opt/llm-context/generate-context.sh +```json +{ + "features": { + "./.devcontainer/features/llm-context": { + "username": "jupyter", + "userHomeDir": "/home/jupyter" + } + } +} ``` -## What Gets Generated - -### ~/CLAUDE.md - -Claude Code automatically reads `~/CLAUDE.md` on startup. 
This file includes: - -- **Workspace info**: Name, ID, cloud platform, your role -- **Resource summary**: All resources with cloud paths -- **Quick reference JSON**: Machine-readable resource paths and environment variables -- **Data exploration commands**: How to query BigQuery, list GCS files -- **Best practices**: Data persistence, cost awareness, MCP vs CLI guidance -- **Skill references**: Links to detailed guides for specific tasks +### Options -### ~/.workbench/skills/ +| Option | Type | Default | Description | +|--------|------|---------|-------------| +| `username` | string | `root` | Container user to install for | +| `userHomeDir` | string | auto | Home directory (auto-detected from username) | -Detailed skill files for specific topics: +## When Context Gets Generated -| Skill | File | Description | -|-------|------|-------------| -| Custom Apps | `CUSTOM_APP.md` | How to create Workbench custom apps | +1. **On first terminal open** - Via `.bashrc` trigger (runs in background) +2. **Manually** - Run `generate-llm-context` or `refresh-context` -## How Claude Code Discovers Context +## What's in `~/CLAUDE.md` -1. Claude Code checks for `~/CLAUDE.md` on startup -2. If found, it reads the file and uses it as initial context -3. The file references skill files that Claude reads on-demand +- **Quick Rules** - When to use this file vs. 
MCP/CLI +- **Current Workspace** - Name, ID, description, role, cloud platform +- **Resource Paths** - JSON lookup for all resources +- **Data Persistence** - Warning + save commands +- **Data Exploration** - Common BigQuery/GCS commands +- **MCP Tools** - Available tools and CLI equivalents +- **Skills** - Links to detailed guides ## Dependencies -This feature depends on: - -- **workbench-tools**: Provides the `wb` CLI for fetching workspace data -- **jq**: JSON processing (installed automatically if missing) - -## Example Output - -After running, you'll see: - -``` -========================================== - Workbench LLM Context Generator -========================================== - -[INFO] Checking prerequisites... -[INFO] Prerequisites OK -[INFO] Setting up directories... -[INFO] Installing skill files... -[INFO] Fetching workspace information... -[INFO] Fetching resources... -[INFO] Fetching workflows... -[INFO] Fetching apps... -[INFO] Generating CLAUDE.md... -[INFO] Created /home/jupyter/.workbench/CLAUDE.md -[INFO] Created symlink ~/CLAUDE.md → /home/jupyter/.workbench/CLAUDE.md - -[INFO] Context generation complete! - -Generated file: - - /home/jupyter/.workbench/CLAUDE.md - - ~/CLAUDE.md (symlink for auto-discovery) - -━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ -✅ Claude Code will automatically discover ~/CLAUDE.md - -━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ -``` +- **Workbench CLI (`wb`)** - Must be installed and authenticated +- **jq** - Installed automatically if missing ## Troubleshooting ### Context not generating? -1. Check if workspace is set: `wb workspace describe` -2. If not authenticated: `wb auth login --mode=APP_DEFAULT_CREDENTIALS` -3. Then set workspace: `wb workspace set ` +```bash +# Check if workspace is set +wb workspace describe -### Claude Code not seeing context? +# If not authenticated: +wb auth login --mode=APP_DEFAULT_CREDENTIALS +wb workspace set -1. 
Ensure `~/CLAUDE.md` exists: `ls -la ~/CLAUDE.md` -2. Check it's not empty: `head ~/CLAUDE.md` -3. Restart Claude Code to re-read the file +# Then generate manually: +generate-llm-context +``` -### Need to refresh after workspace changes? +### Claude Code not seeing context? ```bash -refresh-context -``` +# Check file exists +ls -la ~/CLAUDE.md -## Integration with MCP Server - -This feature works alongside the `wb-mcp-server` feature: +# Check it's not empty +head ~/CLAUDE.md +``` -- **CLAUDE.md**: Provides static context (workspace info, how-to guides) -- **MCP Server**: Provides dynamic tools (list resources, run queries in real-time) +## File Locations -Together, they give LLMs full context AND active capabilities. +| File | Purpose | +|------|---------| +| `/opt/llm-context/generate-context.sh` | Main script | +| `~/.workbench/CLAUDE.md` | Generated context | +| `~/CLAUDE.md` | Symlink (for auto-discovery) | +| `~/.workbench/skills/` | Skill files | diff --git a/features/src/llm-context/devcontainer-feature.json b/features/src/llm-context/devcontainer-feature.json index 6b4e3ee09..117a524b3 100644 --- a/features/src/llm-context/devcontainer-feature.json +++ b/features/src/llm-context/devcontainer-feature.json @@ -1,9 +1,9 @@ { "id": "llm-context", - "version": "1.0.0", + "version": "1.1.0", "name": "LLM Context Generator", - "description": "Generates CLAUDE.md context file for LLMs (Claude Code, etc.) with Workbench workspace information", - "documentationURL": "https://github.com/verily-src/workbench-app-devcontainers/tree/master/features/src/llm-context", + "description": "Generates ~/CLAUDE.md context file for LLMs (Claude Code, Gemini, etc.) with Workbench workspace information. 
Claude Code auto-discovers this file on startup.", + "documentationURL": "https://github.com/aculotti-verily/wb-app-mcp-and-context/tree/llm-context-feature/features/src/llm-context", "options": { "username": { "type": "string", @@ -12,13 +12,8 @@ }, "userHomeDir": { "type": "string", - "default": "/root", - "description": "Home directory for the container user" - }, - "autorun": { - "type": "boolean", - "default": true, - "description": "Automatically generate context on container start" + "default": "", + "description": "Home directory for the container user (auto-detected if empty)" } }, "containerEnv": { @@ -26,9 +21,5 @@ }, "installsAfter": [ "ghcr.io/verily-src/workbench-app-devcontainers/workbench-tools" - ], - "dependsOn": { - "ghcr.io/verily-src/workbench-app-devcontainers/workbench-tools": {} - }, - "postStartCommand": "if [ \"${AUTORUN}\" = \"true\" ]; then /opt/llm-context/generate-context.sh || true; fi" + ] } diff --git a/features/src/llm-context/generate-context.sh b/features/src/llm-context/generate-context.sh index 2ed3ea4ea..f53bc5a5d 100755 --- a/features/src/llm-context/generate-context.sh +++ b/features/src/llm-context/generate-context.sh @@ -116,10 +116,34 @@ install_skills() { cat > "${SKILLS_DIR}/CUSTOM_APP.md" << 'SKILL_EOF' # Creating Custom Workbench Apps -**Practical guide for creating simple, reliable Workbench apps.** +## ⚡ Which Approach Do You Need? -> **When to use this guide:** For simple apps (Flask APIs, static sites, custom tools). -> For apps needing Workbench CLI, gcloud, or Jupyter, see the [full-featured approach](https://github.com/verily-src/workbench-app-devcontainers). +``` +Do you need Jupyter notebooks? +├── YES → Use workbench-jupyter base image (see "Full-Featured" below) +└── NO + └── Do you need Workbench CLI (wb) or gcloud? 
+ ├── YES → Use workbench-tools feature (see "Full-Featured" below) + └── NO → Use MINIMAL PATTERN (this guide) ✅ +``` + +**Most custom apps should use the MINIMAL PATTERN.** It's simpler and less error-prone. + +--- + +## ✅ Pre-Deploy Checklist + +Before deploying, verify: + +- [ ] Container is named `application-server` +- [ ] Connected to `app-network` (external: true) +- [ ] HTTP server binds to `0.0.0.0` (not `localhost`) +- [ ] Port is exposed (usually 8080) +- [ ] No syntax errors in `.devcontainer.json` (valid JSON, no trailing commas) +- [ ] `devcontainer-template.json` exists with valid `id` and `name` +- [ ] Test locally with `docker compose up` before deploying + +--- ## TL;DR - The Minimal Pattern That Works @@ -515,13 +539,25 @@ generate_claude_md() { cat > "${CLAUDE_FILE}" << EOF # Workbench Context -You are working inside **Verily Workbench**, a secure cloud-based research environment for biomedical data analysis. +You are working inside **Verily Workbench**, a secure cloud-based research environment. + +--- + +## ⚡ Quick Rules (Read This First) + +| If the user asks... | Do this | +|---------------------|---------| +| About the workspace (name, ID, role, description) | **Use this file** → See "Current Workspace" below | +| For a resource path (bucket, dataset) | **Use this file** → See "Resource Paths" below | +| To query data, list files, or run operations | **Use MCP tools** or CLI | + +**Simple rule:** Static info → this file. Actions → MCP/CLI. --- ## What is Verily Workbench? -Verily Workbench is a platform that enables researchers to: +Verily Workbench enables researchers to: - Access and analyze biomedical data (clinical, genomics, wearables, imaging) - Run computational workflows at scale (WDL, Nextflow) - Collaborate securely with governance and policy enforcement @@ -529,21 +565,89 @@ Verily Workbench is a platform that enables researchers to: --- -## Current Workspace +## 📍 Current Workspace + +> **Answer "What workspace am I in?" 
with this section.** | Property | Value | |----------|-------| | **Name** | ${ws_name} | | **ID** | \`${ws_id}\` | -| **Cloud Platform** | ${ws_cloud} | -| **Project/Account** | \`${project_display}\` | +| **Description** | ${ws_desc} | +| **Cloud** | ${ws_cloud} | +| **Project** | \`${project_display}\` | | **Your Role** | ${ws_role} | | **User** | ${ws_user} | | **Organization** | ${ws_org:-"—"} | | **Server** | ${ws_server:-"—"} | -### Description -${ws_desc} +**Example response:** *"You're in **${ws_name}** (\`${ws_id}\`), a ${ws_cloud} workspace where you have ${ws_role} access."* + +--- + +## 🗂️ Resource Paths (Use for "What's the path for X?") + +\`\`\`json +${embedded_json} +\`\`\` + +**How to use:** +- \`resourcePaths["my-bucket"]\` → \`gs://actual-bucket-name\` +- Environment variable: \`\$WORKBENCH_my_bucket\` + +--- + +## ⚠️ Data Persistence Warning + +> **LOCAL FILES ARE LOST WHEN THE APP STOPS.** Always save important work to cloud buckets. + +### Available Buckets +${bucket_list} + +### Quick Save Commands +\`\`\`bash +gsutil cp file.ipynb gs://BUCKET/notebooks/ # Single file +gsutil -m cp -r ./results/ gs://BUCKET/results/ # Directory +\`\`\` + +**🤖 Proactively ask users:** *"Want me to save this to a bucket so it persists?"* + +--- + +## 🔍 Data Exploration (Most Common Tasks) + +### Find Resources +\`\`\`bash +wb resource list # List all +wb resource describe # Details +env | grep WORKBENCH_ # Environment variables +\`\`\` + +### Preview BigQuery Data +\`\`\`bash +bq ls PROJECT:DATASET # List tables +bq show --schema PROJECT:DATASET.TABLE # Schema +bq head -n 10 PROJECT:DATASET.TABLE # Sample rows +\`\`\` + +### Browse GCS Files +\`\`\`bash +gsutil ls gs://BUCKET/ # List +gsutil cat gs://BUCKET/file.txt | head # Preview +\`\`\` + +--- + +## 🔧 MCP Tools vs CLI + +| Use MCP Tools For | Use CLI For | +|-------------------|-------------| +| \`list_resources\`, \`get_resource\` | Complex operations | +| \`query_bigquery\` | \`wb workflow logs\` | +| 
\`run_workflow\` | \`wb resource delete\` | +| Structured responses | Full feature coverage | + +**Prefer MCP when available** — it's faster and returns structured data. --- @@ -667,100 +771,20 @@ gs://your-bucket/ --- -## 🔍 Data Exploration Cheatsheet - -This is the **most important section** for quickly discovering and accessing data. +## Python Examples -### Step 1: Find Your Resources -\`\`\`bash -wb resource list --format=json | jq '.[] | {name: .id, type: .resourceType}' -\`\`\` - -### Step 2: Use Environment Variables (Easiest!) -Every resource is available as an environment variable: -\`\`\`bash -# Pattern: \$WORKBENCH_ -echo \$WORKBENCH_my_bucket # → gs://actual-bucket-name -env | grep WORKBENCH_ # List all -\`\`\` - -### Step 3: Get Cloud Paths -\`\`\`bash -wb resource describe --format=json -# Look for: bucketName, projectId, datasetId, gitRepoUrl -\`\`\` - -### Step 4: Preview Data Quickly - -**BigQuery:** -\`\`\`bash -bq head -n 10 :.
# Quick preview -bq show --schema :.
# Column names/types -bq show --format=prettyjson :.
| jq '{rows: .numRows}' # Row count -\`\`\` - -**GCS:** -\`\`\`bash -gsutil ls gs:/// # List files -gsutil cat -r 0-1024 gs:///file.csv # Preview first 1KB -\`\`\` - -### 🤖 LLM Quick Patterns - -| Question | Command | -|----------|---------| -| "What data is available?" | \`wb resource list\` | -| "What tables in dataset?" | \`bq ls :\` | -| "What columns in table?" | \`bq show --schema :.
\` | -| "How big is this table?" | \`bq show --format=prettyjson ... \\| jq '{rows: .numRows}'\` | -| "Show sample data" | \`bq head -n 5 :.
\` | - ---- - -## How to Discover Data (Detailed) - -### List Resources -\`\`\`bash -wb resource list -wb resource list --format=json -wb resource describe -\`\`\` - -### Explore GCS Buckets -\`\`\`bash -gsutil ls gs:/// -gsutil ls -l gs:///path/ -gsutil cat gs:///path/file.txt -\`\`\` - -### Explore BigQuery -\`\`\`bash -bq ls : -bq show :.
-bq query --use_legacy_sql=false 'SELECT * FROM \`project.dataset.table\` LIMIT 10' -\`\`\` - ---- - -## How to Query Data - -### BigQuery (CLI) -\`\`\`bash -bq query --use_legacy_sql=false 'SELECT * FROM \`project.dataset.table\` LIMIT 100' -\`\`\` - -### BigQuery (Python) \`\`\`python +# BigQuery from google.cloud import bigquery client = bigquery.Client() df = client.query("SELECT * FROM \\\`project.dataset.table\\\` LIMIT 100").to_dataframe() -\`\`\` -### GCS Files (Python) -\`\`\`python +# GCS Files import pandas as pd df = pd.read_parquet('gs://bucket/path/file.parquet') -df = pd.read_csv('gs://bucket/path/file.csv') + +# Save to GCS +df.to_parquet('gs://bucket/output.parquet') \`\`\` --- @@ -798,33 +822,6 @@ wb resource add-ref gcs-bucket --name external-data --bucket-name existing-bucke --- -## MCP vs CLI: When to Use Each - -This app has **two interfaces** to Workbench functionality: - -| Interface | Best For | Pros | Cons | -|-----------|----------|------|------| -| **MCP Tools** | LLM operations | Structured responses, no shell needed, faster | Limited tool set | -| **CLI (\`wb\`)** | Complex operations, fallback | Full feature coverage, human-friendly | Requires shell execution, text parsing | - -### 🤖 LLM Decision Guide - -1. **Prefer MCP tools** when the operation is supported — they return structured data and don't require shell execution -2. **Fall back to CLI** when MCP doesn't have the tool, or for complex/chained operations -3. 
**Use cloud CLIs directly** (\`gsutil\`, \`bq\`, \`gcloud\`) for low-level cloud operations - -### Example: Same Operation, Two Ways - -**List resources:** -- MCP: Use \`list_resources\` tool → returns JSON array -- CLI: Run \`wb resource list --format=json\` → parse stdout - -**Query BigQuery:** -- MCP: Use \`query_bigquery\` tool with SQL parameter → returns results -- CLI: Run \`bq query --use_legacy_sql=false 'SELECT ...'\` → parse output - ---- - ## MCP Tools Available The Workbench MCP server exposes these tools for programmatic LLM access: @@ -924,25 +921,6 @@ When users ask about specific topics, **read these skill files** for detailed gu --- -## Quick Reference (Machine-Readable) - -Use this JSON for exact resource paths and environment variables: - -\`\`\`json -${embedded_json} -\`\`\` - -**Usage:** -- \`resourcePaths["my-bucket"]\` → exact GCS/BQ path -- \`envVars["WORKBENCH_my_bucket"]\` → environment variable value - -To refresh after workspace changes: -\`\`\`bash -~/.workbench/generate-context.sh -\`\`\` - ---- - ## Getting Help - **Docs**: https://support.workbench.verily.com diff --git a/features/src/llm-context/install.sh b/features/src/llm-context/install.sh index 87241975d..388e43228 100644 --- a/features/src/llm-context/install.sh +++ b/features/src/llm-context/install.sh @@ -11,7 +11,7 @@ set -o nounset set -o pipefail set -o xtrace -# Options from devcontainer-feature.json +# Options from devcontainer-feature.json (converted to uppercase) readonly USERNAME="${USERNAME:-"root"}" USER_HOME_DIR="${USERHOMEDIR:-""}" if [[ -z "${USER_HOME_DIR}" ]]; then @@ -23,8 +23,6 @@ if [[ -z "${USER_HOME_DIR}" ]]; then fi readonly USER_HOME_DIR -readonly AUTORUN="${AUTORUN:-"true"}" - echo "Installing LLM Context Generator for user: ${USERNAME} (home: ${USER_HOME_DIR})" export DEBIAN_FRONTEND=noninteractive @@ -32,25 +30,38 @@ export TZ=Etc/UTC readonly LLM_CONTEXT_DIR="/opt/llm-context" readonly GENERATE_SCRIPT="${LLM_CONTEXT_DIR}/generate-context.sh" 
-readonly SKILLS_DIR="${LLM_CONTEXT_DIR}/skills" # Save the directory where the feature files are located FEATURE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" readonly FEATURE_DIR echo "Starting LLM Context Generator installation..." +echo "Feature directory: ${FEATURE_DIR}" + +# Install jq if not present (required for JSON processing) +if ! command -v jq &> /dev/null; then + echo "Installing jq..." + if command -v apt-get &> /dev/null; then + apt-get update && apt-get install -y --no-install-recommends jq + elif command -v apk &> /dev/null; then + apk add --no-cache jq + else + echo "WARNING: Could not install jq. Please install it manually." + fi +fi # Create installation directory mkdir -p "${LLM_CONTEXT_DIR}" -mkdir -p "${SKILLS_DIR}" # Copy the generate-context.sh script -cp "${FEATURE_DIR}/generate-context.sh" "${GENERATE_SCRIPT}" -chmod +x "${GENERATE_SCRIPT}" - -# Copy skill files -if [[ -d "${FEATURE_DIR}/skills" ]]; then - cp -r "${FEATURE_DIR}/skills/"* "${SKILLS_DIR}/" 2>/dev/null || true +if [[ -f "${FEATURE_DIR}/generate-context.sh" ]]; then + cp "${FEATURE_DIR}/generate-context.sh" "${GENERATE_SCRIPT}" + chmod +x "${GENERATE_SCRIPT}" + echo "Copied generate-context.sh to ${GENERATE_SCRIPT}" +else + echo "ERROR: generate-context.sh not found in ${FEATURE_DIR}" + ls -la "${FEATURE_DIR}/" + exit 1 fi # Create user-specific directories @@ -59,55 +70,57 @@ USER_SKILLS_DIR="${USER_WORKBENCH_DIR}/skills" mkdir -p "${USER_WORKBENCH_DIR}" mkdir -p "${USER_SKILLS_DIR}" -# Create a wrapper script that generates context with proper user context -cat > "${LLM_CONTEXT_DIR}/run-as-user.sh" << 'WRAPPER_EOF' +# Create a wrapper script that runs with proper user context +cat > "${LLM_CONTEXT_DIR}/run-context-generator.sh" << WRAPPER_EOF #!/bin/bash # Wrapper to run generate-context.sh with proper environment -set -e +# This script is called on container start -# Source user environment -if [[ -f ~/.bashrc ]]; then - source ~/.bashrc 2>/dev/null || true +# Only 
run if we have a workspace set +if command -v wb &> /dev/null && wb workspace describe &> /dev/null; then + echo "Generating LLM context..." + ${GENERATE_SCRIPT} || echo "LLM context generation failed (non-fatal)" +else + echo "Skipping LLM context generation: workspace not set or wb not available" + echo "Run 'wb workspace set ' then 'generate-llm-context' manually" fi - -# Run the generator -/opt/llm-context/generate-context.sh "$@" WRAPPER_EOF -chmod +x "${LLM_CONTEXT_DIR}/run-as-user.sh" - -# Copy skill files to user directory -cp -r "${SKILLS_DIR}/"* "${USER_SKILLS_DIR}/" 2>/dev/null || true +chmod +x "${LLM_CONTEXT_DIR}/run-context-generator.sh" # Set ownership -chown -R "${USERNAME}:" "${LLM_CONTEXT_DIR}" -chown -R "${USERNAME}:" "${USER_WORKBENCH_DIR}" +chown -R "${USERNAME}:" "${LLM_CONTEXT_DIR}" 2>/dev/null || true +chown -R "${USERNAME}:" "${USER_WORKBENCH_DIR}" 2>/dev/null || true -# Add to bashrc for easy access +# Add aliases and auto-run trigger to bashrc cat >> "${USER_HOME_DIR}/.bashrc" << 'BASHRC_EOF' # LLM Context Generator +export LLM_CONTEXT_ENABLED=true alias generate-llm-context='/opt/llm-context/generate-context.sh' alias refresh-context='/opt/llm-context/generate-context.sh' -export LLM_CONTEXT_ENABLED=true + +# Auto-generate context on first interactive shell (if not already done) +if [[ -z "${LLM_CONTEXT_GENERATED:-}" ]] && [[ -f /opt/llm-context/run-context-generator.sh ]]; then + export LLM_CONTEXT_GENERATED=1 + /opt/llm-context/run-context-generator.sh & +fi BASHRC_EOF -chown "${USERNAME}:" "${USER_HOME_DIR}/.bashrc" +chown "${USERNAME}:" "${USER_HOME_DIR}/.bashrc" 2>/dev/null || true echo "" echo "==========================================" echo "LLM Context Generator installation complete!" 
echo "==========================================" echo "" -echo "The context generator is installed at: ${GENERATE_SCRIPT}" -echo "Skills directory: ${USER_SKILLS_DIR}" +echo "Installed to: ${LLM_CONTEXT_DIR}" +echo "User home: ${USER_HOME_DIR}" echo "" -if [[ "${AUTORUN}" == "true" ]]; then - echo "Auto-run is ENABLED: Context will be generated on container start." -else - echo "Auto-run is DISABLED: Run 'generate-llm-context' to generate context." -fi +echo "Context will auto-generate when:" +echo " 1. A terminal is opened (via .bashrc)" +echo " 2. You run 'generate-llm-context' or 'refresh-context'" echo "" -echo "After generation, Claude Code will auto-discover ~/CLAUDE.md" +echo "Claude Code will auto-discover ~/CLAUDE.md" echo "==========================================" echo "" From 9df069b5831669ee2083530bd8cc2df923391433 Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Wed, 11 Feb 2026 13:54:53 -0500 Subject: [PATCH 09/86] Integrate llm-context feature into app - Added .devcontainer/features/ symlinks for all features - Added startupscript symlink - Removed autorun option (now handled by feature's .bashrc trigger) - App now uses the llm-context devcontainer feature To test: Deploy with folder src/workbench-jupyter-with-llm --- src/workbench-jupyter-with-llm/.devcontainer.json | 3 +-- src/workbench-jupyter-with-llm/.devcontainer/features/gemini | 1 + .../.devcontainer/features/llm-context | 1 + .../.devcontainer/features/wb-mcp-server | 1 + .../.devcontainer/features/workbench-tools | 1 + src/workbench-jupyter-with-llm/README.md | 2 +- src/workbench-jupyter-with-llm/startupscript | 1 + 7 files changed, 7 insertions(+), 3 deletions(-) create mode 120000 src/workbench-jupyter-with-llm/.devcontainer/features/gemini create mode 120000 src/workbench-jupyter-with-llm/.devcontainer/features/llm-context create mode 120000 src/workbench-jupyter-with-llm/.devcontainer/features/wb-mcp-server create mode 120000 
src/workbench-jupyter-with-llm/.devcontainer/features/workbench-tools create mode 120000 src/workbench-jupyter-with-llm/startupscript diff --git a/src/workbench-jupyter-with-llm/.devcontainer.json b/src/workbench-jupyter-with-llm/.devcontainer.json index 4e4714660..201aceb82 100644 --- a/src/workbench-jupyter-with-llm/.devcontainer.json +++ b/src/workbench-jupyter-with-llm/.devcontainer.json @@ -36,8 +36,7 @@ }, "./.devcontainer/features/llm-context": { "username": "jupyter", - "userHomeDir": "/home/jupyter", - "autorun": true + "userHomeDir": "/home/jupyter" } }, "remoteUser": "root", diff --git a/src/workbench-jupyter-with-llm/.devcontainer/features/gemini b/src/workbench-jupyter-with-llm/.devcontainer/features/gemini new file mode 120000 index 000000000..e4b40bcad --- /dev/null +++ b/src/workbench-jupyter-with-llm/.devcontainer/features/gemini @@ -0,0 +1 @@ +../../../features/src/gemini \ No newline at end of file diff --git a/src/workbench-jupyter-with-llm/.devcontainer/features/llm-context b/src/workbench-jupyter-with-llm/.devcontainer/features/llm-context new file mode 120000 index 000000000..02cc572ec --- /dev/null +++ b/src/workbench-jupyter-with-llm/.devcontainer/features/llm-context @@ -0,0 +1 @@ +../../../features/src/llm-context \ No newline at end of file diff --git a/src/workbench-jupyter-with-llm/.devcontainer/features/wb-mcp-server b/src/workbench-jupyter-with-llm/.devcontainer/features/wb-mcp-server new file mode 120000 index 000000000..15cfa125b --- /dev/null +++ b/src/workbench-jupyter-with-llm/.devcontainer/features/wb-mcp-server @@ -0,0 +1 @@ +../../../features/src/wb-mcp-server \ No newline at end of file diff --git a/src/workbench-jupyter-with-llm/.devcontainer/features/workbench-tools b/src/workbench-jupyter-with-llm/.devcontainer/features/workbench-tools new file mode 120000 index 000000000..157a0cbfa --- /dev/null +++ b/src/workbench-jupyter-with-llm/.devcontainer/features/workbench-tools @@ -0,0 +1 @@ 
+../../../features/src/workbench-tools \ No newline at end of file diff --git a/src/workbench-jupyter-with-llm/README.md b/src/workbench-jupyter-with-llm/README.md index ce1ca7d5f..0823620e6 100644 --- a/src/workbench-jupyter-with-llm/README.md +++ b/src/workbench-jupyter-with-llm/README.md @@ -22,7 +22,7 @@ This template includes the following integrated features: - **Gemini CLI** - Google Gemini AI assistant with MCP support - **Claude CLI** - Anthropic Claude AI assistant (from ghcr.io/anthropics/devcontainer-features/claude-code:1.0) - **WB MCP Server** - Workbench Model Context Protocol server for AI tool integration with workspace context -- **LLM Context Generator** - Automatically generates `~/CLAUDE.md` with workspace context for Claude Code +- **LLM Context Generator** - Devcontainer feature that auto-generates `~/CLAUDE.md` with workspace context for Claude Code All AI assistants are pre-configured to work with the Workbench MCP server for enhanced workspace awareness. diff --git a/src/workbench-jupyter-with-llm/startupscript b/src/workbench-jupyter-with-llm/startupscript new file mode 120000 index 000000000..44271b0fb --- /dev/null +++ b/src/workbench-jupyter-with-llm/startupscript @@ -0,0 +1 @@ +../../startupscript \ No newline at end of file From ccf7cafe02dbd79cf21e693963991ce0279a58c7 Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Wed, 11 Feb 2026 13:57:34 -0500 Subject: [PATCH 10/86] Align llm-context feature with other feature patterns - Updated install.sh to match workbench-tools/gemini/wb-mcp-server patterns: - Added WORKDIR with cleanup trap - Added apt_get_update() and check_packages() helpers - Consistent variable naming and structure - Banner-style output messages - Updated devcontainer-feature.json: - Removed containerEnv (not used by other features) - Updated userHomeDir default to match pattern - Bumped version to 1.2.0 - Updated README.md: - Added Options table matching gemini feature format - Added MCP Integration section - Added 
File Locations table - Consistent structure with other features --- features/src/llm-context/README.md | 97 +++++++++++-------- .../src/llm-context/devcontainer-feature.json | 15 ++- features/src/llm-context/install.sh | 94 ++++++++++-------- 3 files changed, 118 insertions(+), 88 deletions(-) diff --git a/features/src/llm-context/README.md b/features/src/llm-context/README.md index 31443d692..1cd416e37 100644 --- a/features/src/llm-context/README.md +++ b/features/src/llm-context/README.md @@ -1,70 +1,74 @@ -# LLM Context Generator (`llm-context`) +# LLM Context Generator (llm-context) -A devcontainer feature that generates `~/CLAUDE.md` for LLMs (Claude Code, Gemini CLI, etc.) with Workbench workspace context. +Generates `~/CLAUDE.md` context file for LLMs (Claude Code, Gemini CLI, etc.) with Workbench workspace information. Claude Code auto-discovers this file on startup. -## What It Does - -When installed, this feature: - -1. **Generates `~/CLAUDE.md`** - Claude Code auto-discovers this file on startup -2. **Provides workspace context** - Name, ID, role, resources, cloud paths -3. **Includes skill files** - Detailed guides (e.g., custom app creation) -4. 
**Sets up aliases** - `generate-llm-context`, `refresh-context` - -## Usage - -### In `.devcontainer.json` +## Example Usage ```json -{ - "features": { - "ghcr.io/aculotti-verily/wb-app-mcp-and-context/llm-context:latest": { - "username": "jupyter", - "userHomeDir": "/home/jupyter" +"features": { + "ghcr.io/verily-src/workbench-app-devcontainers/llm-context:1": { + "username": "jupyter", + "userHomeDir": "/home/jupyter" } - } } ``` Or for local development: ```json -{ - "features": { +"features": { "./.devcontainer/features/llm-context": { - "username": "jupyter", - "userHomeDir": "/home/jupyter" + "username": "jupyter", + "userHomeDir": "/home/jupyter" } - } } ``` -### Options +## Options -| Option | Type | Default | Description | -|--------|------|---------|-------------| -| `username` | string | `root` | Container user to install for | -| `userHomeDir` | string | auto | Home directory (auto-detected from username) | +| Options Id | Description | Type | Default Value | +|-----|-----|-----|-----| +| username | Username of the container user | string | root | +| userHomeDir | Home directory of the container user | string | /root | -## When Context Gets Generated +## What It Does -1. **On first terminal open** - Via `.bashrc` trigger (runs in background) -2. **Manually** - Run `generate-llm-context` or `refresh-context` +When installed, this feature: + +1. **Generates `~/CLAUDE.md`** - Claude Code auto-discovers this file on startup +2. **Provides workspace context** - Name, ID, role, resources, cloud paths +3. **Includes skill files** - Detailed guides (e.g., custom app creation) in `~/.workbench/skills/` +4. **Sets up aliases** - `generate-llm-context`, `refresh-context` ## What's in `~/CLAUDE.md` - **Quick Rules** - When to use this file vs. MCP/CLI - **Current Workspace** - Name, ID, description, role, cloud platform -- **Resource Paths** - JSON lookup for all resources +- **Resource Paths** - JSON lookup for all resources (GCS, BigQuery, etc.) 
- **Data Persistence** - Warning + save commands - **Data Exploration** - Common BigQuery/GCS commands - **MCP Tools** - Available tools and CLI equivalents - **Skills** - Links to detailed guides -## Dependencies +## When Context Gets Generated + +1. **On first terminal open** - Via `.bashrc` trigger (runs in background) +2. **Manually** - Run `generate-llm-context` or `refresh-context` + +## MCP Integration + +This feature works well alongside the `wb-mcp-server` feature: +- **`llm-context`** provides static context (workspace info, resource paths) +- **`wb-mcp-server`** provides dynamic tools (search, create, modify) -- **Workbench CLI (`wb`)** - Must be installed and authenticated -- **jq** - Installed automatically if missing +For optimal LLM experience, use both: + +```json +"features": { + "./.devcontainer/features/llm-context": {}, + "./.devcontainer/features/wb-mcp-server": {} +} +``` ## Troubleshooting @@ -96,7 +100,18 @@ head ~/CLAUDE.md | File | Purpose | |------|---------| -| `/opt/llm-context/generate-context.sh` | Main script | -| `~/.workbench/CLAUDE.md` | Generated context | -| `~/CLAUDE.md` | Symlink (for auto-discovery) | -| `~/.workbench/skills/` | Skill files | +| `/opt/llm-context/generate-context.sh` | Main generation script | +| `/opt/llm-context/run-context-generator.sh` | Auto-run wrapper | +| `~/.workbench/CLAUDE.md` | Generated context (primary) | +| `~/CLAUDE.md` | Symlink for auto-discovery | +| `~/.workbench/skills/` | Skill files (e.g., CUSTOM_APP.md) | + +## Notes + +- This feature requires the Workbench CLI (`wb`) to be installed +- `jq` is automatically installed if not present +- Context is only generated if a workspace is set (`wb workspace describe` succeeds) + +--- + +_Note: This feature is automatically configured to work with the `wb-mcp-server` feature if both are installed._ diff --git a/features/src/llm-context/devcontainer-feature.json b/features/src/llm-context/devcontainer-feature.json index 117a524b3..e052c3936 100644 
--- a/features/src/llm-context/devcontainer-feature.json +++ b/features/src/llm-context/devcontainer-feature.json @@ -1,25 +1,22 @@ { "id": "llm-context", - "version": "1.1.0", + "version": "1.2.0", "name": "LLM Context Generator", "description": "Generates ~/CLAUDE.md context file for LLMs (Claude Code, Gemini, etc.) with Workbench workspace information. Claude Code auto-discovers this file on startup.", - "documentationURL": "https://github.com/aculotti-verily/wb-app-mcp-and-context/tree/llm-context-feature/features/src/llm-context", "options": { "username": { "type": "string", "default": "root", - "description": "Username for the container user" + "description": "Username of the container user." }, "userHomeDir": { "type": "string", - "default": "", - "description": "Home directory for the container user (auto-detected if empty)" + "default": "/root", + "description": "Home directory of the container user." } }, - "containerEnv": { - "LLM_CONTEXT_ENABLED": "true" - }, "installsAfter": [ - "ghcr.io/verily-src/workbench-app-devcontainers/workbench-tools" + "ghcr.io/devcontainers/features/common-utils", + "./.devcontainer/features/workbench-tools" ] } diff --git a/features/src/llm-context/install.sh b/features/src/llm-context/install.sh index 388e43228..00f2fab96 100644 --- a/features/src/llm-context/install.sh +++ b/features/src/llm-context/install.sh @@ -1,7 +1,6 @@ #!/usr/bin/env bash -# install.sh - Installs the LLM Context Generator for Workbench apps -# +# install.sh installs the LLM Context Generator in the devcontainer. # This feature generates a CLAUDE.md file that provides LLMs (like Claude Code) # with context about the current Workbench workspace, resources, and tools. # Claude Code auto-discovers ~/CLAUDE.md on startup. 
@@ -13,41 +12,59 @@ set -o xtrace # Options from devcontainer-feature.json (converted to uppercase) readonly USERNAME="${USERNAME:-"root"}" -USER_HOME_DIR="${USERHOMEDIR:-""}" -if [[ -z "${USER_HOME_DIR}" ]]; then - if [[ "${USERNAME}" == "root" ]]; then - USER_HOME_DIR="/root" - else - USER_HOME_DIR="/home/${USERNAME}" - fi +USER_HOME_DIR="${USERHOMEDIR:-"/home/${USERNAME}"}" +if [[ "${USER_HOME_DIR}" == "/home/root" ]]; then + USER_HOME_DIR="/root" fi readonly USER_HOME_DIR -echo "Installing LLM Context Generator for user: ${USERNAME} (home: ${USER_HOME_DIR})" - export DEBIAN_FRONTEND=noninteractive export TZ=Etc/UTC +WORKDIR="$(mktemp -d)" +readonly WORKDIR + readonly LLM_CONTEXT_DIR="/opt/llm-context" readonly GENERATE_SCRIPT="${LLM_CONTEXT_DIR}/generate-context.sh" +function cleanup() { + rm -rf "${WORKDIR:?}" + rm -rf /var/lib/apt/lists/* +} + +trap 'cleanup' EXIT + +function apt_get_update() { + if [ "$(find /var/lib/apt/lists/* | wc -l)" = "0" ]; then + echo "Running apt-get update..." + apt-get update -y + fi +} + +# Checks if packages are installed and installs them if not +function check_packages() { + if ! dpkg -s "$@" > /dev/null 2>&1; then + apt_get_update + apt-get -y install --no-install-recommends "$@" + fi +} + +echo "Starting LLM Context Generator installation..." +echo "User: ${USERNAME}, Home: ${USER_HOME_DIR}" + # Save the directory where the feature files are located FEATURE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" readonly FEATURE_DIR -echo "Starting LLM Context Generator installation..." -echo "Feature directory: ${FEATURE_DIR}" - -# Install jq if not present (required for JSON processing) -if ! command -v jq &> /dev/null; then - echo "Installing jq..." - if command -v apt-get &> /dev/null; then - apt-get update && apt-get install -y --no-install-recommends jq - elif command -v apk &> /dev/null; then - apk add --no-cache jq - else - echo "WARNING: Could not install jq. Please install it manually." 
- fi +# Check for supported package manager +if type apt-get &>/dev/null; then + # Install jq if not present (required for JSON processing) + check_packages jq +elif type apk &>/dev/null; then + # Alpine Linux + apk add --no-cache jq +else + echo "Warning: Could not install jq. Please install it manually." fi # Create installation directory @@ -92,20 +109,21 @@ chown -R "${USERNAME}:" "${LLM_CONTEXT_DIR}" 2>/dev/null || true chown -R "${USERNAME}:" "${USER_WORKBENCH_DIR}" 2>/dev/null || true # Add aliases and auto-run trigger to bashrc -cat >> "${USER_HOME_DIR}/.bashrc" << 'BASHRC_EOF' - -# LLM Context Generator -export LLM_CONTEXT_ENABLED=true -alias generate-llm-context='/opt/llm-context/generate-context.sh' -alias refresh-context='/opt/llm-context/generate-context.sh' - -# Auto-generate context on first interactive shell (if not already done) -if [[ -z "${LLM_CONTEXT_GENERATED:-}" ]] && [[ -f /opt/llm-context/run-context-generator.sh ]]; then - export LLM_CONTEXT_GENERATED=1 - /opt/llm-context/run-context-generator.sh & -fi -BASHRC_EOF - +{ + echo "" + echo "# LLM Context Generator" + echo "export LLM_CONTEXT_ENABLED=true" + echo "alias generate-llm-context='${GENERATE_SCRIPT}'" + echo "alias refresh-context='${GENERATE_SCRIPT}'" + echo "" + echo "# Auto-generate context on first interactive shell (if not already done)" + echo 'if [[ -z "${LLM_CONTEXT_GENERATED:-}" ]] && [[ -f /opt/llm-context/run-context-generator.sh ]]; then' + echo ' export LLM_CONTEXT_GENERATED=1' + echo ' /opt/llm-context/run-context-generator.sh &' + echo 'fi' +} >> "${USER_HOME_DIR}/.bashrc" + +# Make sure the login user is the owner of their .bashrc chown "${USERNAME}:" "${USER_HOME_DIR}/.bashrc" 2>/dev/null || true echo "" From 6c871a4b41ccd048a4af5a2830595e83380eeae2 Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Wed, 11 Feb 2026 14:08:54 -0500 Subject: [PATCH 11/86] Fix feature paths - move symlinks to repo root .devcontainer/features/ Features referenced as 
./.devcontainer/features/xxx in .devcontainer.json are resolved from repo root, not the app folder. This matches the original NavidZ repo structure. - Created .devcontainer/features/ at repo root with symlinks to features/src/ - Removed .devcontainer/features/ from app folder (was incorrect location) --- src/workbench-jupyter-with-llm/.devcontainer/features/gemini | 1 - .../.devcontainer/features/llm-context | 1 - .../.devcontainer/features/wb-mcp-server | 1 - .../.devcontainer/features/workbench-tools | 1 - 4 files changed, 4 deletions(-) delete mode 120000 src/workbench-jupyter-with-llm/.devcontainer/features/gemini delete mode 120000 src/workbench-jupyter-with-llm/.devcontainer/features/llm-context delete mode 120000 src/workbench-jupyter-with-llm/.devcontainer/features/wb-mcp-server delete mode 120000 src/workbench-jupyter-with-llm/.devcontainer/features/workbench-tools diff --git a/src/workbench-jupyter-with-llm/.devcontainer/features/gemini b/src/workbench-jupyter-with-llm/.devcontainer/features/gemini deleted file mode 120000 index e4b40bcad..000000000 --- a/src/workbench-jupyter-with-llm/.devcontainer/features/gemini +++ /dev/null @@ -1 +0,0 @@ -../../../features/src/gemini \ No newline at end of file diff --git a/src/workbench-jupyter-with-llm/.devcontainer/features/llm-context b/src/workbench-jupyter-with-llm/.devcontainer/features/llm-context deleted file mode 120000 index 02cc572ec..000000000 --- a/src/workbench-jupyter-with-llm/.devcontainer/features/llm-context +++ /dev/null @@ -1 +0,0 @@ -../../../features/src/llm-context \ No newline at end of file diff --git a/src/workbench-jupyter-with-llm/.devcontainer/features/wb-mcp-server b/src/workbench-jupyter-with-llm/.devcontainer/features/wb-mcp-server deleted file mode 120000 index 15cfa125b..000000000 --- a/src/workbench-jupyter-with-llm/.devcontainer/features/wb-mcp-server +++ /dev/null @@ -1 +0,0 @@ -../../../features/src/wb-mcp-server \ No newline at end of file diff --git 
a/src/workbench-jupyter-with-llm/.devcontainer/features/workbench-tools b/src/workbench-jupyter-with-llm/.devcontainer/features/workbench-tools deleted file mode 120000 index 157a0cbfa..000000000 --- a/src/workbench-jupyter-with-llm/.devcontainer/features/workbench-tools +++ /dev/null @@ -1 +0,0 @@ -../../../features/src/workbench-tools \ No newline at end of file From 21c35cbdece4818d520a78c5c16b2b20a0918e28 Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Wed, 11 Feb 2026 15:33:42 -0500 Subject: [PATCH 12/86] Fix bucket mounting and context generation timing Issues fixed: 1. Bucket mounting: Removed startupscript symlink from app folder. Paths like ./startupscript/ resolve from repo root, not app folder. 2. Context generation timing: Now runs via postStartCommand AFTER startup scripts complete (auth + workspace setup done first). Removed .bashrc auto-trigger which ran too early. Changes: - Removed src/workbench-jupyter-with-llm/startupscript symlink - Updated .devcontainer.json postStartCommand to run context generation - Simplified install.sh (aliases only, no bashrc auto-trigger) - Updated README with correct integration instructions --- features/src/llm-context/README.md | 14 +++++++++++++- features/src/llm-context/install.sh | 13 +++---------- src/workbench-jupyter-with-llm/.devcontainer.json | 10 ++++------ src/workbench-jupyter-with-llm/startupscript | 1 - 4 files changed, 20 insertions(+), 18 deletions(-) delete mode 120000 src/workbench-jupyter-with-llm/startupscript diff --git a/features/src/llm-context/README.md b/features/src/llm-context/README.md index 1cd416e37..8d619b29f 100644 --- a/features/src/llm-context/README.md +++ b/features/src/llm-context/README.md @@ -52,9 +52,21 @@ When installed, this feature: ## When Context Gets Generated -1. **On first terminal open** - Via `.bashrc` trigger (runs in background) +1. **Automatically on app start** - Via `postStartCommand` (after bucket mounting completes) 2. 
**Manually** - Run `generate-llm-context` or `refresh-context` +**Important**: Add the context generation to your `postStartCommand` in `.devcontainer.json`: + +```json +"postStartCommand": [ + "bash", + "-c", + "./startupscript/remount-on-restart.sh jupyter /home/jupyter \"${templateOption:cloud}\" \"${templateOption:login}\" && /opt/llm-context/generate-context.sh" +] +``` + +This ensures context is generated AFTER authentication and workspace setup complete. + ## MCP Integration This feature works well alongside the `wb-mcp-server` feature: diff --git a/features/src/llm-context/install.sh b/features/src/llm-context/install.sh index 00f2fab96..227f16cab 100644 --- a/features/src/llm-context/install.sh +++ b/features/src/llm-context/install.sh @@ -108,19 +108,13 @@ chmod +x "${LLM_CONTEXT_DIR}/run-context-generator.sh" chown -R "${USERNAME}:" "${LLM_CONTEXT_DIR}" 2>/dev/null || true chown -R "${USERNAME}:" "${USER_WORKBENCH_DIR}" 2>/dev/null || true -# Add aliases and auto-run trigger to bashrc +# Add aliases to bashrc (context generation is triggered by postStartCommand, not bashrc) { echo "" echo "# LLM Context Generator" echo "export LLM_CONTEXT_ENABLED=true" echo "alias generate-llm-context='${GENERATE_SCRIPT}'" echo "alias refresh-context='${GENERATE_SCRIPT}'" - echo "" - echo "# Auto-generate context on first interactive shell (if not already done)" - echo 'if [[ -z "${LLM_CONTEXT_GENERATED:-}" ]] && [[ -f /opt/llm-context/run-context-generator.sh ]]; then' - echo ' export LLM_CONTEXT_GENERATED=1' - echo ' /opt/llm-context/run-context-generator.sh &' - echo 'fi' } >> "${USER_HOME_DIR}/.bashrc" # Make sure the login user is the owner of their .bashrc @@ -134,9 +128,8 @@ echo "" echo "Installed to: ${LLM_CONTEXT_DIR}" echo "User home: ${USER_HOME_DIR}" echo "" -echo "Context will auto-generate when:" -echo " 1. A terminal is opened (via .bashrc)" -echo " 2. 
You run 'generate-llm-context' or 'refresh-context'" +echo "Context will be generated via postStartCommand after startup completes." +echo "Manual refresh: run 'generate-llm-context' or 'refresh-context'" echo "" echo "Claude Code will auto-discover ~/CLAUDE.md" echo "==========================================" diff --git a/src/workbench-jupyter-with-llm/.devcontainer.json b/src/workbench-jupyter-with-llm/.devcontainer.json index 201aceb82..16a52e542 100644 --- a/src/workbench-jupyter-with-llm/.devcontainer.json +++ b/src/workbench-jupyter-with-llm/.devcontainer.json @@ -10,13 +10,11 @@ "-c", "./startupscript/post-startup.sh jupyter /home/jupyter \"${templateOption:cloud}\" \"${templateOption:login}\" && echo '' >> /home/jupyter/.bashrc && echo '# Unset GOOGLE_CLOUD_PROJECT for Gemini CLI compatibility' >> /home/jupyter/.bashrc && echo 'unset GOOGLE_CLOUD_PROJECT' >> /home/jupyter/.bashrc" ], - // re-mount bucket files on container start up + // re-mount bucket files on container start up, then generate LLM context "postStartCommand": [ - "./startupscript/remount-on-restart.sh", - "jupyter", - "/home/jupyter", - "${templateOption:cloud}", - "${templateOption:login}" + "bash", + "-c", + "./startupscript/remount-on-restart.sh jupyter /home/jupyter \"${templateOption:cloud}\" \"${templateOption:login}\" && /opt/llm-context/generate-context.sh" ], "features": { "./.devcontainer/features/workbench-tools": { diff --git a/src/workbench-jupyter-with-llm/startupscript b/src/workbench-jupyter-with-llm/startupscript deleted file mode 120000 index 44271b0fb..000000000 --- a/src/workbench-jupyter-with-llm/startupscript +++ /dev/null @@ -1 +0,0 @@ -../../startupscript \ No newline at end of file From d402ce8607deb5a154f93e000df1375cb997e4ef Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Wed, 11 Feb 2026 16:25:55 -0500 Subject: [PATCH 13/86] Fix CLAUDE.md being created in wrong home directory Bug: postStartCommand runs as root, so $HOME = /root. 
The script was creating /root/.workbench/CLAUDE.md instead of /home/jupyter/.workbench/CLAUDE.md Fix: generate-context.sh now accepts home directory as first argument. Priority: 1) $LLM_CONTEXT_HOME, 2) first arg, 3) /home/jupyter fallback, 4) $HOME Updated: - generate-context.sh: Accept home dir arg, smart fallback to /home/jupyter - .devcontainer.json: Pass /home/jupyter to generate-context.sh - install.sh: Set LLM_CONTEXT_HOME env var, aliases pass home dir - README.md: Document the home directory argument --- features/src/llm-context/README.md | 4 +++- features/src/llm-context/generate-context.sh | 19 +++++++++++++++++-- features/src/llm-context/install.sh | 7 ++++--- .../.devcontainer.json | 2 +- 4 files changed, 25 insertions(+), 7 deletions(-) diff --git a/features/src/llm-context/README.md b/features/src/llm-context/README.md index 8d619b29f..2c04a695b 100644 --- a/features/src/llm-context/README.md +++ b/features/src/llm-context/README.md @@ -61,10 +61,12 @@ When installed, this feature: "postStartCommand": [ "bash", "-c", - "./startupscript/remount-on-restart.sh jupyter /home/jupyter \"${templateOption:cloud}\" \"${templateOption:login}\" && /opt/llm-context/generate-context.sh" + "./startupscript/remount-on-restart.sh jupyter /home/jupyter \"${templateOption:cloud}\" \"${templateOption:login}\" && /opt/llm-context/generate-context.sh /home/jupyter" ] ``` +**Note**: Pass the user home directory (e.g., `/home/jupyter`) as an argument because `postStartCommand` runs as root, not as the container user. + This ensures context is generated AFTER authentication and workspace setup complete. 
## MCP Integration diff --git a/features/src/llm-context/generate-context.sh b/features/src/llm-context/generate-context.sh index f53bc5a5d..9b3a8c155 100755 --- a/features/src/llm-context/generate-context.sh +++ b/features/src/llm-context/generate-context.sh @@ -48,12 +48,27 @@ set -e +# Determine user home directory +# Priority: 1) $LLM_CONTEXT_HOME, 2) first arg, 3) $HOME +if [[ -n "${LLM_CONTEXT_HOME:-}" ]]; then + USER_HOME="${LLM_CONTEXT_HOME}" +elif [[ -n "${1:-}" ]]; then + USER_HOME="$1" +else + # Find the primary non-root user's home (typically jupyter) + if [[ -d "/home/jupyter" ]]; then + USER_HOME="/home/jupyter" + else + USER_HOME="${HOME}" + fi +fi + # Configuration -CONTEXT_DIR="${HOME}/.workbench" +CONTEXT_DIR="${USER_HOME}/.workbench" SKILLS_DIR="${CONTEXT_DIR}/skills" CLAUDE_FILE="${CONTEXT_DIR}/CLAUDE.md" # Visible symlink in home directory for Claude Code auto-discovery -VISIBLE_CLAUDE_SYMLINK="${HOME}/CLAUDE.md" +VISIBLE_CLAUDE_SYMLINK="${USER_HOME}/CLAUDE.md" # Colors for output RED='\033[0;31m' diff --git a/features/src/llm-context/install.sh b/features/src/llm-context/install.sh index 227f16cab..c541a69c6 100644 --- a/features/src/llm-context/install.sh +++ b/features/src/llm-context/install.sh @@ -108,13 +108,14 @@ chmod +x "${LLM_CONTEXT_DIR}/run-context-generator.sh" chown -R "${USERNAME}:" "${LLM_CONTEXT_DIR}" 2>/dev/null || true chown -R "${USERNAME}:" "${USER_WORKBENCH_DIR}" 2>/dev/null || true -# Add aliases to bashrc (context generation is triggered by postStartCommand, not bashrc) +# Add aliases and environment to bashrc (context generation is triggered by postStartCommand, not bashrc) { echo "" echo "# LLM Context Generator" echo "export LLM_CONTEXT_ENABLED=true" - echo "alias generate-llm-context='${GENERATE_SCRIPT}'" - echo "alias refresh-context='${GENERATE_SCRIPT}'" + echo "export LLM_CONTEXT_HOME=\"${USER_HOME_DIR}\"" + echo "alias generate-llm-context='${GENERATE_SCRIPT} ${USER_HOME_DIR}'" + echo "alias 
refresh-context='${GENERATE_SCRIPT} ${USER_HOME_DIR}'" } >> "${USER_HOME_DIR}/.bashrc" # Make sure the login user is the owner of their .bashrc diff --git a/src/workbench-jupyter-with-llm/.devcontainer.json b/src/workbench-jupyter-with-llm/.devcontainer.json index 16a52e542..b7e9723ca 100644 --- a/src/workbench-jupyter-with-llm/.devcontainer.json +++ b/src/workbench-jupyter-with-llm/.devcontainer.json @@ -14,7 +14,7 @@ "postStartCommand": [ "bash", "-c", - "./startupscript/remount-on-restart.sh jupyter /home/jupyter \"${templateOption:cloud}\" \"${templateOption:login}\" && /opt/llm-context/generate-context.sh" + "./startupscript/remount-on-restart.sh jupyter /home/jupyter \"${templateOption:cloud}\" \"${templateOption:login}\" && /opt/llm-context/generate-context.sh /home/jupyter" ], "features": { "./.devcontainer/features/workbench-tools": { From d68b7e64df42baad1fd7d86e03f85a8ed52faae5 Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Wed, 11 Feb 2026 21:43:58 -0500 Subject: [PATCH 14/86] Fix: Use semicolon instead of && to ensure context script runs The remount-on-restart.sh script may return non-zero exit code even when successful, which breaks the && chain. 
Changed to: - Use ; instead of && (run regardless of previous exit code) - Added || true to prevent postStartCommand failure --- src/workbench-jupyter-with-llm/.devcontainer.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/workbench-jupyter-with-llm/.devcontainer.json b/src/workbench-jupyter-with-llm/.devcontainer.json index b7e9723ca..2aafedcb3 100644 --- a/src/workbench-jupyter-with-llm/.devcontainer.json +++ b/src/workbench-jupyter-with-llm/.devcontainer.json @@ -14,7 +14,7 @@ "postStartCommand": [ "bash", "-c", - "./startupscript/remount-on-restart.sh jupyter /home/jupyter \"${templateOption:cloud}\" \"${templateOption:login}\" && /opt/llm-context/generate-context.sh /home/jupyter" + "./startupscript/remount-on-restart.sh jupyter /home/jupyter \"${templateOption:cloud}\" \"${templateOption:login}\"; /opt/llm-context/generate-context.sh /home/jupyter || true" ], "features": { "./.devcontainer/features/workbench-tools": { From bdf72fb653dcb657ddf7d60169dc401f80d5e988 Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Wed, 11 Feb 2026 22:03:27 -0500 Subject: [PATCH 15/86] Enhance CUSTOM_APP.md skill with comprehensive validation checklist Added: - Detailed validation table with specific files and what to check - Common mistakes section - Validation commands to run before deploy - LLM response template for consistent validation output - Clear instruction for LLM to verify ALL items before suggesting deploy --- features/src/llm-context/generate-context.sh | 66 +++++++++++++++++--- 1 file changed, 57 insertions(+), 9 deletions(-) diff --git a/features/src/llm-context/generate-context.sh b/features/src/llm-context/generate-context.sh index 9b3a8c155..261741b3d 100755 --- a/features/src/llm-context/generate-context.sh +++ b/features/src/llm-context/generate-context.sh @@ -148,15 +148,63 @@ Do you need Jupyter notebooks? 
## ✅ Pre-Deploy Checklist -Before deploying, verify: - -- [ ] Container is named `application-server` -- [ ] Connected to `app-network` (external: true) -- [ ] HTTP server binds to `0.0.0.0` (not `localhost`) -- [ ] Port is exposed (usually 8080) -- [ ] No syntax errors in `.devcontainer.json` (valid JSON, no trailing commas) -- [ ] `devcontainer-template.json` exists with valid `id` and `name` -- [ ] Test locally with `docker compose up` before deploying +**🤖 LLM INSTRUCTION: Before suggesting deployment, verify ALL items below. Read the user's files and check each one.** + +### Critical (App Won't Start Without These) + +| Check | File | What to Look For | +|-------|------|------------------| +| ✅ Container name | `docker-compose.yaml` | `container_name: "application-server"` | +| ✅ Network config | `docker-compose.yaml` | `networks: [app-network]` and `app-network: {external: true}` | +| ✅ Port exposed | `Dockerfile` | `EXPOSE 8080` (or your port) | +| ✅ Port mapped | `docker-compose.yaml` | `ports: ["8080:8080"]` | +| ✅ HTTP binds correctly | App code | `host='0.0.0.0'` (NOT `localhost` or `127.0.0.1`) | +| ✅ Template exists | `devcontainer-template.json` | Has `id` and `name` fields | + +### Common Mistakes (Check These Too) + +| Check | File | Issue | +|-------|------|-------| +| ⚠️ Valid JSON | `.devcontainer.json` | No trailing commas, no comments in JSON | +| ⚠️ Build context | `docker-compose.yaml` | `context: ../..` if app is in `src/app-name/` | +| ⚠️ Dockerfile path | `docker-compose.yaml` | `dockerfile: src/YOUR-APP/Dockerfile` | +| ⚠️ Long-running process | `Dockerfile` CMD | Must run server, not a script that exits | +| ⚠️ Dependencies installed | `Dockerfile` | All pip/npm packages in requirements | + +### Validation Commands (Run Before Deploy) + +```bash +# 1. Check JSON syntax +python3 -c "import json; json.load(open('.devcontainer.json'))" && echo "✅ Valid JSON" + +# 2. 
Check docker-compose syntax +docker compose config > /dev/null && echo "✅ Valid docker-compose" + +# 3. Test locally +docker network create app-network 2>/dev/null || true +docker compose build && docker compose up +``` + +### 🤖 LLM Response Template + +When user asks to create/validate an app, respond with: + +``` +## App Validation Results + +| Check | Status | Details | +|-------|--------|---------| +| Container name | ✅/❌ | Found: "xxx" | +| Network config | ✅/❌ | ... | +| Port exposed | ✅/❌ | ... | +| ... | ... | ... | + +### Issues Found +1. [Issue description and fix] + +### Ready to Deploy? +[Yes/No and next steps] +``` --- From 98c6d2b5d36a96b19ab5b327c268997abdcb1201 Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Wed, 11 Feb 2026 22:59:24 -0500 Subject: [PATCH 16/86] Move context generation to postCreateCommand (after auth) The postStartCommand was running but context generation wasn't working. Moving it to postCreateCommand ensures it runs AFTER post-startup.sh completes authentication and workspace setup. Also added echo statements to make it visible in logs. 
--- src/workbench-jupyter-with-llm/.devcontainer.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/workbench-jupyter-with-llm/.devcontainer.json b/src/workbench-jupyter-with-llm/.devcontainer.json index 2aafedcb3..a2bd78ac3 100644 --- a/src/workbench-jupyter-with-llm/.devcontainer.json +++ b/src/workbench-jupyter-with-llm/.devcontainer.json @@ -8,7 +8,7 @@ "postCreateCommand": [ "bash", "-c", - "./startupscript/post-startup.sh jupyter /home/jupyter \"${templateOption:cloud}\" \"${templateOption:login}\" && echo '' >> /home/jupyter/.bashrc && echo '# Unset GOOGLE_CLOUD_PROJECT for Gemini CLI compatibility' >> /home/jupyter/.bashrc && echo 'unset GOOGLE_CLOUD_PROJECT' >> /home/jupyter/.bashrc" + "./startupscript/post-startup.sh jupyter /home/jupyter \"${templateOption:cloud}\" \"${templateOption:login}\" && echo '' >> /home/jupyter/.bashrc && echo '# Unset GOOGLE_CLOUD_PROJECT for Gemini CLI compatibility' >> /home/jupyter/.bashrc && echo 'unset GOOGLE_CLOUD_PROJECT' >> /home/jupyter/.bashrc && echo '[LLM Context] Generating context...' && /opt/llm-context/generate-context.sh /home/jupyter && echo '[LLM Context] Done!'" ], // re-mount bucket files on container start up, then generate LLM context "postStartCommand": [ From d68d4559bbef9d376d1d6fa93a01268b0a4a57b1 Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Thu, 12 Feb 2026 00:25:06 -0500 Subject: [PATCH 17/86] Create stub CLAUDE.md during feature install (like wb-mcp-server) Learning from wb-mcp-server: create the file during install.sh so Claude finds it immediately. The stub: - Tells Claude it's in Workbench - Instructs to run 'generate-llm-context' for full context - Lists available MCP tools - Provides basic CLI commands postCreateCommand still tries to generate full context, but if it fails, Claude has the stub to work with. This ensures ~/CLAUDE.md exists as soon as the container starts. 
--- features/src/llm-context/install.sh | 54 ++++++++++++++++++- .../.devcontainer.json | 2 +- 2 files changed, 54 insertions(+), 2 deletions(-) diff --git a/features/src/llm-context/install.sh b/features/src/llm-context/install.sh index c541a69c6..753904a65 100644 --- a/features/src/llm-context/install.sh +++ b/features/src/llm-context/install.sh @@ -108,7 +108,7 @@ chmod +x "${LLM_CONTEXT_DIR}/run-context-generator.sh" chown -R "${USERNAME}:" "${LLM_CONTEXT_DIR}" 2>/dev/null || true chown -R "${USERNAME}:" "${USER_WORKBENCH_DIR}" 2>/dev/null || true -# Add aliases and environment to bashrc (context generation is triggered by postStartCommand, not bashrc) +# Add aliases and environment to bashrc { echo "" echo "# LLM Context Generator" @@ -121,6 +121,58 @@ chown -R "${USERNAME}:" "${USER_WORKBENCH_DIR}" 2>/dev/null || true # Make sure the login user is the owner of their .bashrc chown "${USERNAME}:" "${USER_HOME_DIR}/.bashrc" 2>/dev/null || true +# Create a stub CLAUDE.md that Claude Code will find immediately +# This tells Claude basic info and how to get full context +cat > "${USER_HOME_DIR}/CLAUDE.md" << 'STUB_EOF' +# Workbench Context (Stub) + +You are in a **Verily Workbench** app. Full workspace context is not yet generated. 
+ +## ⚡ FIRST: Generate Full Context + +Run this command to get complete workspace information: + +```bash +generate-llm-context +``` + +This will create the full context with: +- Workspace name, ID, and your role +- All resources (buckets, datasets, repos) +- Resource paths for quick lookup +- Data persistence guidance +- Available MCP tools + +## While You Wait + +You can also use these tools: + +```bash +# Check current workspace +wb workspace describe + +# List resources +wb resource list + +# Check authentication +wb auth status +``` + +## MCP Tools Available + +The Workbench MCP server is available with tools like: +- `list_resources` - List workspace resources +- `get_resource` - Get resource details +- `query_bigquery` - Run SQL queries + +--- + +*Run `generate-llm-context` to replace this stub with full context.* +STUB_EOF + +chown "${USERNAME}:" "${USER_HOME_DIR}/CLAUDE.md" 2>/dev/null || true +echo "Created stub CLAUDE.md at ${USER_HOME_DIR}/CLAUDE.md" + echo "" echo "==========================================" echo "LLM Context Generator installation complete!" diff --git a/src/workbench-jupyter-with-llm/.devcontainer.json b/src/workbench-jupyter-with-llm/.devcontainer.json index a2bd78ac3..b7d04e765 100644 --- a/src/workbench-jupyter-with-llm/.devcontainer.json +++ b/src/workbench-jupyter-with-llm/.devcontainer.json @@ -8,7 +8,7 @@ "postCreateCommand": [ "bash", "-c", - "./startupscript/post-startup.sh jupyter /home/jupyter \"${templateOption:cloud}\" \"${templateOption:login}\" && echo '' >> /home/jupyter/.bashrc && echo '# Unset GOOGLE_CLOUD_PROJECT for Gemini CLI compatibility' >> /home/jupyter/.bashrc && echo 'unset GOOGLE_CLOUD_PROJECT' >> /home/jupyter/.bashrc && echo '[LLM Context] Generating context...' 
&& /opt/llm-context/generate-context.sh /home/jupyter && echo '[LLM Context] Done!'" + "./startupscript/post-startup.sh jupyter /home/jupyter \"${templateOption:cloud}\" \"${templateOption:login}\" && echo '' >> /home/jupyter/.bashrc && echo '# Unset GOOGLE_CLOUD_PROJECT for Gemini CLI compatibility' >> /home/jupyter/.bashrc && echo 'unset GOOGLE_CLOUD_PROJECT' >> /home/jupyter/.bashrc && /opt/llm-context/generate-context.sh /home/jupyter 2>/dev/null || true" ], // re-mount bucket files on container start up, then generate LLM context "postStartCommand": [ From 3a9c06c80815fc4d47248cc977e29fff59f93a8d Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Thu, 12 Feb 2026 00:29:05 -0500 Subject: [PATCH 18/86] Add LLM context generation to post-startup.sh (uses feature) Architecture: - llm-context feature installs script to /opt/llm-context/ - post-startup.sh checks if feature is installed - If yes, runs generate-context.sh AFTER auth is complete - Uses RUN_AS_LOGIN_USER for correct file ownership This ensures context generation runs at the right time (after auth) while still using the devcontainer feature for installation. 
Changes: - startupscript/post-startup.sh: Added LLM context generation section - Removed stub CLAUDE.md from install.sh (not needed) - Simplified postCreateCommand - postStartCommand still runs context gen for app restarts --- features/src/llm-context/install.sh | 52 ------------------- .../.devcontainer.json | 2 +- 2 files changed, 1 insertion(+), 53 deletions(-) diff --git a/features/src/llm-context/install.sh b/features/src/llm-context/install.sh index 753904a65..54685df37 100644 --- a/features/src/llm-context/install.sh +++ b/features/src/llm-context/install.sh @@ -121,58 +121,6 @@ chown -R "${USERNAME}:" "${USER_WORKBENCH_DIR}" 2>/dev/null || true # Make sure the login user is the owner of their .bashrc chown "${USERNAME}:" "${USER_HOME_DIR}/.bashrc" 2>/dev/null || true -# Create a stub CLAUDE.md that Claude Code will find immediately -# This tells Claude basic info and how to get full context -cat > "${USER_HOME_DIR}/CLAUDE.md" << 'STUB_EOF' -# Workbench Context (Stub) - -You are in a **Verily Workbench** app. Full workspace context is not yet generated. 
- -## ⚡ FIRST: Generate Full Context - -Run this command to get complete workspace information: - -```bash -generate-llm-context -``` - -This will create the full context with: -- Workspace name, ID, and your role -- All resources (buckets, datasets, repos) -- Resource paths for quick lookup -- Data persistence guidance -- Available MCP tools - -## While You Wait - -You can also use these tools: - -```bash -# Check current workspace -wb workspace describe - -# List resources -wb resource list - -# Check authentication -wb auth status -``` - -## MCP Tools Available - -The Workbench MCP server is available with tools like: -- `list_resources` - List workspace resources -- `get_resource` - Get resource details -- `query_bigquery` - Run SQL queries - ---- - -*Run `generate-llm-context` to replace this stub with full context.* -STUB_EOF - -chown "${USERNAME}:" "${USER_HOME_DIR}/CLAUDE.md" 2>/dev/null || true -echo "Created stub CLAUDE.md at ${USER_HOME_DIR}/CLAUDE.md" - echo "" echo "==========================================" echo "LLM Context Generator installation complete!" 
diff --git a/src/workbench-jupyter-with-llm/.devcontainer.json b/src/workbench-jupyter-with-llm/.devcontainer.json index b7d04e765..2aafedcb3 100644 --- a/src/workbench-jupyter-with-llm/.devcontainer.json +++ b/src/workbench-jupyter-with-llm/.devcontainer.json @@ -8,7 +8,7 @@ "postCreateCommand": [ "bash", "-c", - "./startupscript/post-startup.sh jupyter /home/jupyter \"${templateOption:cloud}\" \"${templateOption:login}\" && echo '' >> /home/jupyter/.bashrc && echo '# Unset GOOGLE_CLOUD_PROJECT for Gemini CLI compatibility' >> /home/jupyter/.bashrc && echo 'unset GOOGLE_CLOUD_PROJECT' >> /home/jupyter/.bashrc && /opt/llm-context/generate-context.sh /home/jupyter 2>/dev/null || true" + "./startupscript/post-startup.sh jupyter /home/jupyter \"${templateOption:cloud}\" \"${templateOption:login}\" && echo '' >> /home/jupyter/.bashrc && echo '# Unset GOOGLE_CLOUD_PROJECT for Gemini CLI compatibility' >> /home/jupyter/.bashrc && echo 'unset GOOGLE_CLOUD_PROJECT' >> /home/jupyter/.bashrc" ], // re-mount bucket files on container start up, then generate LLM context "postStartCommand": [ From 27c7b986d1cef54f756c953967dc267aefa9b3db Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Thu, 12 Feb 2026 16:18:55 -0500 Subject: [PATCH 19/86] Add app templates and APP_TEMPLATES skill Based on working llm-context-feature branch, adds: - 4 app templates (flask-api, streamlit-dashboard, rshiny-dashboard, file-processor) - APP_TEMPLATES.md skill for template selection guidance - Updated generate-context.sh with both skills embedded - Updated CLAUDE.md template with decision flow for app creation --- features/src/llm-context/generate-context.sh | 489 +++++++++++------- .../src/llm-context/skills/APP_TEMPLATES.md | 261 ++++++++++ features/src/llm-context/templates/README.md | 100 ++++ .../file-processor/.devcontainer.json | 35 ++ .../templates/file-processor/Dockerfile | 24 + .../templates/file-processor/README.md | 64 +++ .../templates/file-processor/app/main.py | 450 
++++++++++++++++ .../file-processor/app/requirements.txt | 8 + .../file-processor/devcontainer-template.json | 23 + .../file-processor/docker-compose.yaml | 32 ++ .../templates/file-processor/manifest.yaml | 33 ++ .../templates/flask-api/.devcontainer.json | 35 ++ .../templates/flask-api/Dockerfile | 32 ++ .../llm-context/templates/flask-api/README.md | 48 ++ .../templates/flask-api/app/main.py | 201 +++++++ .../templates/flask-api/app/requirements.txt | 6 + .../flask-api/devcontainer-template.json | 23 + .../templates/flask-api/docker-compose.yaml | 30 ++ .../templates/flask-api/manifest.yaml | 41 ++ .../rshiny-dashboard/.devcontainer.json | 35 ++ .../templates/rshiny-dashboard/Dockerfile | 40 ++ .../templates/rshiny-dashboard/README.md | 72 +++ .../templates/rshiny-dashboard/app/app.R | 222 ++++++++ .../devcontainer-template.json | 23 + .../rshiny-dashboard/docker-compose.yaml | 29 ++ .../templates/rshiny-dashboard/manifest.yaml | 39 ++ .../rshiny-dashboard/shiny-server.conf | 14 + .../streamlit-dashboard/.devcontainer.json | 35 ++ .../templates/streamlit-dashboard/Dockerfile | 19 + .../templates/streamlit-dashboard/README.md | 43 ++ .../templates/streamlit-dashboard/app/main.py | 183 +++++++ .../streamlit-dashboard/app/requirements.txt | 7 + .../devcontainer-template.json | 23 + .../streamlit-dashboard/docker-compose.yaml | 31 ++ .../streamlit-dashboard/manifest.yaml | 39 ++ 35 files changed, 2589 insertions(+), 200 deletions(-) create mode 100644 features/src/llm-context/skills/APP_TEMPLATES.md create mode 100644 features/src/llm-context/templates/README.md create mode 100644 features/src/llm-context/templates/file-processor/.devcontainer.json create mode 100644 features/src/llm-context/templates/file-processor/Dockerfile create mode 100644 features/src/llm-context/templates/file-processor/README.md create mode 100644 features/src/llm-context/templates/file-processor/app/main.py create mode 100644 
features/src/llm-context/templates/file-processor/app/requirements.txt create mode 100644 features/src/llm-context/templates/file-processor/devcontainer-template.json create mode 100644 features/src/llm-context/templates/file-processor/docker-compose.yaml create mode 100644 features/src/llm-context/templates/file-processor/manifest.yaml create mode 100644 features/src/llm-context/templates/flask-api/.devcontainer.json create mode 100644 features/src/llm-context/templates/flask-api/Dockerfile create mode 100644 features/src/llm-context/templates/flask-api/README.md create mode 100644 features/src/llm-context/templates/flask-api/app/main.py create mode 100644 features/src/llm-context/templates/flask-api/app/requirements.txt create mode 100644 features/src/llm-context/templates/flask-api/devcontainer-template.json create mode 100644 features/src/llm-context/templates/flask-api/docker-compose.yaml create mode 100644 features/src/llm-context/templates/flask-api/manifest.yaml create mode 100644 features/src/llm-context/templates/rshiny-dashboard/.devcontainer.json create mode 100644 features/src/llm-context/templates/rshiny-dashboard/Dockerfile create mode 100644 features/src/llm-context/templates/rshiny-dashboard/README.md create mode 100644 features/src/llm-context/templates/rshiny-dashboard/app/app.R create mode 100644 features/src/llm-context/templates/rshiny-dashboard/devcontainer-template.json create mode 100644 features/src/llm-context/templates/rshiny-dashboard/docker-compose.yaml create mode 100644 features/src/llm-context/templates/rshiny-dashboard/manifest.yaml create mode 100644 features/src/llm-context/templates/rshiny-dashboard/shiny-server.conf create mode 100644 features/src/llm-context/templates/streamlit-dashboard/.devcontainer.json create mode 100644 features/src/llm-context/templates/streamlit-dashboard/Dockerfile create mode 100644 features/src/llm-context/templates/streamlit-dashboard/README.md create mode 100644 
features/src/llm-context/templates/streamlit-dashboard/app/main.py create mode 100644 features/src/llm-context/templates/streamlit-dashboard/app/requirements.txt create mode 100644 features/src/llm-context/templates/streamlit-dashboard/devcontainer-template.json create mode 100644 features/src/llm-context/templates/streamlit-dashboard/docker-compose.yaml create mode 100644 features/src/llm-context/templates/streamlit-dashboard/manifest.yaml diff --git a/features/src/llm-context/generate-context.sh b/features/src/llm-context/generate-context.sh index 261741b3d..593c51bcd 100755 --- a/features/src/llm-context/generate-context.sh +++ b/features/src/llm-context/generate-context.sh @@ -48,27 +48,12 @@ set -e -# Determine user home directory -# Priority: 1) $LLM_CONTEXT_HOME, 2) first arg, 3) $HOME -if [[ -n "${LLM_CONTEXT_HOME:-}" ]]; then - USER_HOME="${LLM_CONTEXT_HOME}" -elif [[ -n "${1:-}" ]]; then - USER_HOME="$1" -else - # Find the primary non-root user's home (typically jupyter) - if [[ -d "/home/jupyter" ]]; then - USER_HOME="/home/jupyter" - else - USER_HOME="${HOME}" - fi -fi - # Configuration -CONTEXT_DIR="${USER_HOME}/.workbench" +CONTEXT_DIR="${HOME}/.workbench" SKILLS_DIR="${CONTEXT_DIR}/skills" CLAUDE_FILE="${CONTEXT_DIR}/CLAUDE.md" # Visible symlink in home directory for Claude Code auto-discovery -VISIBLE_CLAUDE_SYMLINK="${USER_HOME}/CLAUDE.md" +VISIBLE_CLAUDE_SYMLINK="${HOME}/CLAUDE.md" # Colors for output RED='\033[0;31m' @@ -131,82 +116,10 @@ install_skills() { cat > "${SKILLS_DIR}/CUSTOM_APP.md" << 'SKILL_EOF' # Creating Custom Workbench Apps -## ⚡ Which Approach Do You Need? - -``` -Do you need Jupyter notebooks? -├── YES → Use workbench-jupyter base image (see "Full-Featured" below) -└── NO - └── Do you need Workbench CLI (wb) or gcloud? - ├── YES → Use workbench-tools feature (see "Full-Featured" below) - └── NO → Use MINIMAL PATTERN (this guide) ✅ -``` - -**Most custom apps should use the MINIMAL PATTERN.** It's simpler and less error-prone. 
- ---- - -## ✅ Pre-Deploy Checklist - -**🤖 LLM INSTRUCTION: Before suggesting deployment, verify ALL items below. Read the user's files and check each one.** - -### Critical (App Won't Start Without These) - -| Check | File | What to Look For | -|-------|------|------------------| -| ✅ Container name | `docker-compose.yaml` | `container_name: "application-server"` | -| ✅ Network config | `docker-compose.yaml` | `networks: [app-network]` and `app-network: {external: true}` | -| ✅ Port exposed | `Dockerfile` | `EXPOSE 8080` (or your port) | -| ✅ Port mapped | `docker-compose.yaml` | `ports: ["8080:8080"]` | -| ✅ HTTP binds correctly | App code | `host='0.0.0.0'` (NOT `localhost` or `127.0.0.1`) | -| ✅ Template exists | `devcontainer-template.json` | Has `id` and `name` fields | - -### Common Mistakes (Check These Too) - -| Check | File | Issue | -|-------|------|-------| -| ⚠️ Valid JSON | `.devcontainer.json` | No trailing commas, no comments in JSON | -| ⚠️ Build context | `docker-compose.yaml` | `context: ../..` if app is in `src/app-name/` | -| ⚠️ Dockerfile path | `docker-compose.yaml` | `dockerfile: src/YOUR-APP/Dockerfile` | -| ⚠️ Long-running process | `Dockerfile` CMD | Must run server, not a script that exits | -| ⚠️ Dependencies installed | `Dockerfile` | All pip/npm packages in requirements | - -### Validation Commands (Run Before Deploy) - -```bash -# 1. Check JSON syntax -python3 -c "import json; json.load(open('.devcontainer.json'))" && echo "✅ Valid JSON" - -# 2. Check docker-compose syntax -docker compose config > /dev/null && echo "✅ Valid docker-compose" - -# 3. 
Test locally -docker network create app-network 2>/dev/null || true -docker compose build && docker compose up -``` - -### 🤖 LLM Response Template - -When user asks to create/validate an app, respond with: - -``` -## App Validation Results +**Practical guide for creating simple, reliable Workbench apps.** -| Check | Status | Details | -|-------|--------|---------| -| Container name | ✅/❌ | Found: "xxx" | -| Network config | ✅/❌ | ... | -| Port exposed | ✅/❌ | ... | -| ... | ... | ... | - -### Issues Found -1. [Issue description and fix] - -### Ready to Deploy? -[Yes/No and next steps] -``` - ---- +> **When to use this guide:** For simple apps (Flask APIs, static sites, custom tools). +> For apps needing Workbench CLI, gcloud, or Jupyter, see the [full-featured approach](https://github.com/verily-src/workbench-app-devcontainers). ## TL;DR - The Minimal Pattern That Works @@ -469,6 +382,117 @@ Everything else is optional convenience that often breaks. **When in doubt, simplify.** SKILL_EOF + + # Create APP_TEMPLATES.md skill (full version, embedded) + log_info "Creating APP_TEMPLATES.md skill..." + cat > "${SKILLS_DIR}/APP_TEMPLATES.md" << 'TEMPLATES_SKILL_EOF' +# App Templates for Workbench + +**Pre-built, ready-to-deploy application templates with workspace resource integration.** + +> **When to use this:** User wants an app that visualizes data, serves an API, processes files, or creates dashboards using their workspace resources. 
+ +--- + +## Available Templates + +| Template | Best For | Port | Key Features | +|----------|----------|------|--------------| +| **flask-api** | REST APIs, backend services, data processing | 8080 | JSON endpoints, file upload, BQ queries | +| **streamlit-dashboard** | Data visualization, interactive exploration | 8501 | Charts, file browser, BigQuery explorer | +| **rshiny-dashboard** | R statistical analysis, R-based visualizations | 3838 | Shiny UI, plotly, ggplot2, tidyverse | +| **file-processor** | File upload, validation, transformation | 8080 | Drag-drop UI, auto-save to GCS, schema validation | + +--- + +## Template Selection Guide + +### Quick Decision Matrix + +| User Says... | Recommend | +|--------------|-----------| +| "dashboard", "visualize", "charts", "explore data" | `streamlit-dashboard` | +| "API", "endpoint", "backend", "REST", "service" | `flask-api` | +| "R", "statistical", "ggplot", "tidyverse" | `rshiny-dashboard` | +| "upload", "process files", "validate", "CSV" | `file-processor` | +| "something custom", "from scratch" | → Use `CUSTOM_APP.md` skill | + +--- + +## Template Location + +All templates are at: +``` +https://github.com/aculotti-verily/wb-app-mcp-and-context/tree/templates-only/src/templates/ +``` + +--- + +## How to Use a Template + +### Option 1: Deploy Directly +``` +Repository: https://github.com/aculotti-verily/wb-app-mcp-and-context.git +Branch: templates-only +Folder: src/templates/ +``` + +### Option 2: Copy and Customize +1. Copy the template folder to user's repo +2. Modify application code in `app/` +3. Update `devcontainer-template.json` with new name/description +4. 
Push to GitHub and deploy + +--- + +## Template Summaries + +### flask-api (Port 8080) +- REST API with Flask +- Pre-built endpoints: `/health`, `/resources`, `/buckets//files`, `/bigquery/query` +- Easy to add custom endpoints + +### streamlit-dashboard (Port 8501) +- Interactive dashboard with tabs +- GCS file browser, BigQuery explorer, visualization +- Easy to add new tabs/charts + +### rshiny-dashboard (Port 3838) +- R-based Shiny dashboard +- Includes: shiny, shinydashboard, plotly, ggplot2, dplyr, tidyr +- bigrquery and googleCloudStorageR for data access + +### file-processor (Port 8080) +- Drag-drop file upload UI +- Processes CSV, JSON, Excel +- Auto-save to GCS buckets +- Schema validation + +--- + +## Workspace Resource Integration + +All templates auto-detect workspace resources via environment variables: + +```python +# Python +import os +bucket = os.environ.get("WORKBENCH_my_bucket") +``` + +```r +# R +bucket <- Sys.getenv("WORKBENCH_my_bucket") +``` + +--- + +## When Templates Don't Fit + +If no template matches: +1. Check if a template can be extended (usually yes) +2. If truly custom, read `~/.workbench/skills/CUSTOM_APP.md` +TEMPLATES_SKILL_EOF } # Fetch workspace information @@ -602,25 +626,13 @@ generate_claude_md() { cat > "${CLAUDE_FILE}" << EOF # Workbench Context -You are working inside **Verily Workbench**, a secure cloud-based research environment. - ---- - -## ⚡ Quick Rules (Read This First) - -| If the user asks... | Do this | -|---------------------|---------| -| About the workspace (name, ID, role, description) | **Use this file** → See "Current Workspace" below | -| For a resource path (bucket, dataset) | **Use this file** → See "Resource Paths" below | -| To query data, list files, or run operations | **Use MCP tools** or CLI | - -**Simple rule:** Static info → this file. Actions → MCP/CLI. +You are working inside **Verily Workbench**, a secure cloud-based research environment for biomedical data analysis. 
--- ## What is Verily Workbench? -Verily Workbench enables researchers to: +Verily Workbench is a platform that enables researchers to: - Access and analyze biomedical data (clinical, genomics, wearables, imaging) - Run computational workflows at scale (WDL, Nextflow) - Collaborate securely with governance and policy enforcement @@ -628,89 +640,21 @@ Verily Workbench enables researchers to: --- -## 📍 Current Workspace - -> **Answer "What workspace am I in?" with this section.** +## Current Workspace | Property | Value | |----------|-------| | **Name** | ${ws_name} | | **ID** | \`${ws_id}\` | -| **Description** | ${ws_desc} | -| **Cloud** | ${ws_cloud} | -| **Project** | \`${project_display}\` | +| **Cloud Platform** | ${ws_cloud} | +| **Project/Account** | \`${project_display}\` | | **Your Role** | ${ws_role} | | **User** | ${ws_user} | | **Organization** | ${ws_org:-"—"} | | **Server** | ${ws_server:-"—"} | -**Example response:** *"You're in **${ws_name}** (\`${ws_id}\`), a ${ws_cloud} workspace where you have ${ws_role} access."* - ---- - -## 🗂️ Resource Paths (Use for "What's the path for X?") - -\`\`\`json -${embedded_json} -\`\`\` - -**How to use:** -- \`resourcePaths["my-bucket"]\` → \`gs://actual-bucket-name\` -- Environment variable: \`\$WORKBENCH_my_bucket\` - ---- - -## ⚠️ Data Persistence Warning - -> **LOCAL FILES ARE LOST WHEN THE APP STOPS.** Always save important work to cloud buckets. 
- -### Available Buckets -${bucket_list} - -### Quick Save Commands -\`\`\`bash -gsutil cp file.ipynb gs://BUCKET/notebooks/ # Single file -gsutil -m cp -r ./results/ gs://BUCKET/results/ # Directory -\`\`\` - -**🤖 Proactively ask users:** *"Want me to save this to a bucket so it persists?"* - ---- - -## 🔍 Data Exploration (Most Common Tasks) - -### Find Resources -\`\`\`bash -wb resource list # List all -wb resource describe # Details -env | grep WORKBENCH_ # Environment variables -\`\`\` - -### Preview BigQuery Data -\`\`\`bash -bq ls PROJECT:DATASET # List tables -bq show --schema PROJECT:DATASET.TABLE # Schema -bq head -n 10 PROJECT:DATASET.TABLE # Sample rows -\`\`\` - -### Browse GCS Files -\`\`\`bash -gsutil ls gs://BUCKET/ # List -gsutil cat gs://BUCKET/file.txt | head # Preview -\`\`\` - ---- - -## 🔧 MCP Tools vs CLI - -| Use MCP Tools For | Use CLI For | -|-------------------|-------------| -| \`list_resources\`, \`get_resource\` | Complex operations | -| \`query_bigquery\` | \`wb workflow logs\` | -| \`run_workflow\` | \`wb resource delete\` | -| Structured responses | Full feature coverage | - -**Prefer MCP when available** — it's faster and returns structured data. +### Description +${ws_desc} --- @@ -834,20 +778,100 @@ gs://your-bucket/ --- -## Python Examples +## 🔍 Data Exploration Cheatsheet + +This is the **most important section** for quickly discovering and accessing data. + +### Step 1: Find Your Resources +\`\`\`bash +wb resource list --format=json | jq '.[] | {name: .id, type: .resourceType}' +\`\`\` + +### Step 2: Use Environment Variables (Easiest!) 
+Every resource is available as an environment variable: +\`\`\`bash +# Pattern: \$WORKBENCH_<resource_name> +echo \$WORKBENCH_my_bucket # → gs://actual-bucket-name +env | grep WORKBENCH_ # List all +\`\`\` + +### Step 3: Get Cloud Paths +\`\`\`bash +wb resource describe <resource-name> --format=json +# Look for: bucketName, projectId, datasetId, gitRepoUrl +\`\`\` + +### Step 4: Preview Data Quickly + +**BigQuery:** +\`\`\`bash +bq head -n 10 <project>:<dataset>.<table>
# Quick preview +bq show --schema <project>:<dataset>.<table>
# Column names/types +bq show --format=prettyjson <project>:<dataset>.<table>
| jq '{rows: .numRows}' # Row count +\`\`\` + +**GCS:** +\`\`\`bash +gsutil ls gs://<bucket>/ # List files +gsutil cat -r 0-1024 gs://<bucket>/file.csv # Preview first 1KB +\`\`\` + +### 🤖 LLM Quick Patterns + +| Question | Command | +|----------|---------| +| "What data is available?" | \`wb resource list\` | +| "What tables in dataset?" | \`bq ls <project>:<dataset>\` | +| "What columns in table?" | \`bq show --schema <project>:<dataset>.<table>
\` | +| "How big is this table?" | \`bq show --format=prettyjson ... \\| jq '{rows: .numRows}'\` | +| "Show sample data" | \`bq head -n 5 <project>:<dataset>.<table>
\` | + +--- + +## How to Discover Data (Detailed) + +### List Resources +\`\`\`bash +wb resource list +wb resource list --format=json +wb resource describe <resource-name> +\`\`\` + +### Explore GCS Buckets +\`\`\`bash +gsutil ls gs://<bucket>/ +gsutil ls -l gs://<bucket>/path/ +gsutil cat gs://<bucket>/path/file.txt +\`\`\` + +### Explore BigQuery +\`\`\`bash +bq ls <project>:<dataset> +bq show <project>:<dataset>.<table>
+bq query --use_legacy_sql=false 'SELECT * FROM \`project.dataset.table\` LIMIT 10' +\`\`\` + +--- + +## How to Query Data +### BigQuery (CLI) +\`\`\`bash +bq query --use_legacy_sql=false 'SELECT * FROM \`project.dataset.table\` LIMIT 100' +\`\`\` + +### BigQuery (Python) \`\`\`python -# BigQuery from google.cloud import bigquery client = bigquery.Client() df = client.query("SELECT * FROM \\\`project.dataset.table\\\` LIMIT 100").to_dataframe() +\`\`\` -# GCS Files +### GCS Files (Python) +\`\`\`python import pandas as pd df = pd.read_parquet('gs://bucket/path/file.parquet') - -# Save to GCS -df.to_parquet('gs://bucket/output.parquet') +df = pd.read_csv('gs://bucket/path/file.csv') \`\`\` --- @@ -885,6 +909,33 @@ wb resource add-ref gcs-bucket --name external-data --bucket-name existing-bucke --- +## MCP vs CLI: When to Use Each + +This app has **two interfaces** to Workbench functionality: + +| Interface | Best For | Pros | Cons | +|-----------|----------|------|------| +| **MCP Tools** | LLM operations | Structured responses, no shell needed, faster | Limited tool set | +| **CLI (\`wb\`)** | Complex operations, fallback | Full feature coverage, human-friendly | Requires shell execution, text parsing | + +### 🤖 LLM Decision Guide + +1. **Prefer MCP tools** when the operation is supported — they return structured data and don't require shell execution +2. **Fall back to CLI** when MCP doesn't have the tool, or for complex/chained operations +3. 
**Use cloud CLIs directly** (\`gsutil\`, \`bq\`, \`gcloud\`) for low-level cloud operations + +### Example: Same Operation, Two Ways + +**List resources:** +- MCP: Use \`list_resources\` tool → returns JSON array +- CLI: Run \`wb resource list --format=json\` → parse stdout + +**Query BigQuery:** +- MCP: Use \`query_bigquery\` tool with SQL parameter → returns results +- CLI: Run \`bq query --use_legacy_sql=false 'SELECT ...'\` → parse output + +--- + ## MCP Tools Available The Workbench MCP server exposes these tools for programmatic LLM access: @@ -954,21 +1005,39 @@ wb app describe # App details ## Creating Custom Apps -**Two approaches depending on complexity:** +> **IMPORTANT: When a user asks to create an app, turn code into an app, or build something deployable, follow this decision process:** -### Simple Apps (Recommended) -Workbench custom apps need exactly **three things**: -1. Container named \`application-server\` -2. Connected to \`app-network\` (external Docker network) -3. HTTP server on a port +### Step 1: Check Against Templates First -⚠️ **Avoid complexity:** Devcontainer features and startup scripts often fail. +**Read \`~/.workbench/skills/APP_TEMPLATES.md\`** and ask: +- Does a pre-built template match their needs? +- Can a template be easily extended? 
-**📖 For detailed guide:** \`Read ~/.workbench/skills/CUSTOM_APP.md\` +| User's Goal | Recommended Template | +|-------------|---------------------| +| REST API, backend service | \`flask-api\` | +| Data dashboard, visualization | \`streamlit-dashboard\` | +| R analysis, statistical work | \`rshiny-dashboard\` | +| File upload, processing | \`file-processor\` | -### Full-Featured Apps -For apps needing Workbench CLI, gcloud, etc.: -📦 https://github.com/verily-src/workbench-app-devcontainers +### Step 2: If No Template Fits + +**Read \`~/.workbench/skills/CUSTOM_APP.md\`** for: +- Building from scratch +- Minimal working pattern +- Common pitfalls to avoid + +### Step 3: Present Options to User + +Always explain: +1. **Template option**: "There's a pre-built X template that does Y. We can customize it." +2. **From-scratch option**: "Or we can build something custom from the ground up." + +Let the user decide based on their specific needs. + +### Quick Reference +- **Templates**: https://github.com/aculotti-verily/wb-app-mcp-and-context/tree/templates-only/src/templates/ +- **Full-featured apps**: https://github.com/verily-src/workbench-app-devcontainers --- @@ -976,11 +1045,31 @@ For apps needing Workbench CLI, gcloud, etc.: When users ask about specific topics, **read these skill files** for detailed guidance: -| Topic | Skill File | -|-------|------------| -| Creating custom apps | \`~/.workbench/skills/CUSTOM_APP.md\` | +| Topic | Skill File | When to Use | +|-------|------------|-------------| +| Pre-built app templates | \`~/.workbench/skills/APP_TEMPLATES.md\` | User wants dashboard, API, file processor | +| Building apps from scratch | \`~/.workbench/skills/CUSTOM_APP.md\` | User needs full control or custom solution | + +**Always read BOTH skills when app creation comes up**, then recommend the best approach. -**How to use:** When the topic comes up, read the skill file first. 
+--- + +## Quick Reference (Machine-Readable) + +Use this JSON for exact resource paths and environment variables: + +\`\`\`json +${embedded_json} +\`\`\` + +**Usage:** +- \`resourcePaths["my-bucket"]\` → exact GCS/BQ path +- \`envVars["WORKBENCH_my_bucket"]\` → environment variable value + +To refresh after workspace changes: +\`\`\`bash +~/.workbench/generate-context.sh +\`\`\` --- diff --git a/features/src/llm-context/skills/APP_TEMPLATES.md b/features/src/llm-context/skills/APP_TEMPLATES.md new file mode 100644 index 000000000..1cb04722c --- /dev/null +++ b/features/src/llm-context/skills/APP_TEMPLATES.md @@ -0,0 +1,261 @@ +# App Templates for Workbench + +**Pre-built, ready-to-deploy application templates with workspace resource integration.** + +> **When to use this:** User wants an app that visualizes data, serves an API, processes files, or creates dashboards using their workspace resources. + +--- + +## Available Templates + +| Template | Best For | Port | Key Features | +|----------|----------|------|--------------| +| **flask-api** | REST APIs, backend services, data processing | 8080 | JSON endpoints, file upload, BQ queries | +| **streamlit-dashboard** | Data visualization, interactive exploration | 8501 | Charts, file browser, BigQuery explorer | +| **rshiny-dashboard** | R statistical analysis, R-based visualizations | 3838 | Shiny UI, plotly, ggplot2, tidyverse | +| **file-processor** | File upload, validation, transformation | 8080 | Drag-drop UI, auto-save to GCS, schema validation | + +--- + +## Template Selection Guide + +### Ask the user these questions: + +1. **What language/framework preference?** + - Python → `flask-api`, `streamlit-dashboard`, `file-processor` + - R → `rshiny-dashboard` + +2. **What's the primary purpose?** + - API/Backend service → `flask-api` + - Interactive dashboard → `streamlit-dashboard` or `rshiny-dashboard` + - Process/upload files → `file-processor` + +3. 
**What workspace resources do they need?** + - All templates support GCS buckets and BigQuery + +### Quick Decision Matrix + +| User Says... | Recommend | +|--------------|-----------| +| "dashboard", "visualize", "charts", "explore data" | `streamlit-dashboard` | +| "API", "endpoint", "backend", "REST", "service" | `flask-api` | +| "R", "statistical", "ggplot", "tidyverse" | `rshiny-dashboard` | +| "upload", "process files", "validate", "CSV" | `file-processor` | +| "something custom", "from scratch" | → Use `CUSTOM_APP.md` skill | + +--- + +## Template Locations + +All templates are in: +``` +https://github.com/aculotti-verily/wb-app-mcp-and-context/tree/templates-only/src/templates/ +``` + +Each template contains: +- `manifest.yaml` - Capabilities and inputs +- `.devcontainer.json` - Devcontainer config +- `docker-compose.yaml` - Container setup +- `Dockerfile` - Build instructions +- `app/` - Application code +- `README.md` - Documentation + +--- + +## How to Use a Template + +### Option 1: Deploy Directly +``` +Repository: https://github.com/aculotti-verily/wb-app-mcp-and-context.git +Branch: templates-only +Folder: src/templates/ +``` + +### Option 2: Copy and Customize +1. Copy the template folder to user's repo +2. Modify application code in `app/` +3. Update `devcontainer-template.json` with new name/description +4. Push to GitHub +5. Deploy from user's repo + +--- + +## Template Details + +### 1. Flask API (`flask-api`) + +**Capabilities:** REST API, JSON, file upload, BigQuery, GCS + +**Pre-built endpoints:** +- `GET /health` - Health check +- `GET /resources` - List workspace resources +- `GET /buckets//files` - List bucket files +- `POST /buckets//upload` - Upload to bucket +- `POST /bigquery/query` - Run BQ query +- `GET /bigquery/tables/` - List tables +- `POST /process` - Custom processing (user extends this) + +**Customization points:** +- Add endpoints in `app/main.py` +- Add dependencies in `app/requirements.txt` + +--- + +### 2. 
Streamlit Dashboard (`streamlit-dashboard`) + +**Capabilities:** Interactive UI, charts, data exploration, BigQuery, GCS + +**Pre-built features:** +- GCS file browser with CSV preview +- BigQuery query interface +- Data visualization (line, bar, scatter) +- Workspace resource sidebar + +**Customization points:** +- Add tabs/pages in `app/main.py` +- Add visualizations with plotly/altair +- Add additional data sources + +--- + +### 3. RShiny Dashboard (`rshiny-dashboard`) + +**Capabilities:** R analysis, Shiny UI, plotly, statistical visualization + +**Pre-built features:** +- Dashboard layout with shinydashboard +- Data upload and exploration +- Interactive charts with plotly +- Workspace resource viewer + +**R packages included:** +- shiny, shinydashboard, DT +- plotly, ggplot2 +- dplyr, tidyr +- bigrquery, googleCloudStorageR + +**Customization points:** +- Modify UI in `app/app.R` +- Add R packages in Dockerfile +- Add statistical analysis functions + +--- + +### 4. File Processor (`file-processor`) + +**Capabilities:** File upload, validation, transformation, GCS storage + +**Pre-built features:** +- Drag-and-drop upload UI +- CSV, JSON, Excel processing +- Auto-save to GCS bucket +- Schema validation endpoint + +**Supported formats:** +- CSV → Row/column analysis, schema detection +- JSON → Structure analysis, schema validation +- Excel → Sheet parsing, data extraction + +**Customization points:** +- Add processing logic in `app/main.py` +- Add validation schemas +- Add transformation pipelines + +--- + +## Workspace Resource Integration + +All templates automatically detect workspace resources: + +### Python Templates +```python +import os + +# All resources as dict +resources = { + k.replace("WORKBENCH_", ""): v + for k, v in os.environ.items() + if k.startswith("WORKBENCH_") +} + +# Specific resource +bucket = os.environ.get("WORKBENCH_my_bucket") +``` + +### R Template +```r +# All resources +resources <- Sys.getenv() +wb_vars <- 
resources[grepl("^WORKBENCH_", names(resources))] + +# Specific resource +bucket <- Sys.getenv("WORKBENCH_my_bucket") +``` + +--- + +## When Templates Don't Fit + +If the user's requirements don't match any template: + +1. **Check if a template can be extended** + - Most templates are customizable + - Adding endpoints to flask-api is easy + - Adding tabs to streamlit is easy + +2. **If truly custom, use CUSTOM_APP.md skill** + - Minimal from-scratch pattern + - Avoid common pitfalls + - Full control over everything + +--- + +## Common Customizations + +### Add a new endpoint (Flask) +```python +@app.route("/my-endpoint", methods=["POST"]) +def my_endpoint(): + data = request.get_json() + # Your logic here + return jsonify({"result": "success"}) +``` + +### Add a new tab (Streamlit) +```python +tab1, tab2, tab3, tab4 = st.tabs(["Existing", "Tabs", "Here", "New Tab"]) + +with tab4: + st.header("My New Feature") + # Your code here +``` + +### Add R packages (RShiny) +```dockerfile +# In Dockerfile, add to install.packages(): +RUN R -e "install.packages(c('existingpkgs', 'newpackage'))" +``` + +--- + +## Deployment Checklist + +Before deploying any template: + +- [ ] Container name is `application-server` +- [ ] Network is `app-network` with `external: true` +- [ ] Port is exposed and mapped correctly +- [ ] `devcontainer-template.json` has unique `id` +- [ ] Application binds to `0.0.0.0` (not `localhost`) + +--- + +## Summary + +| Need | Template | Customization Effort | +|------|----------|---------------------| +| Quick API | flask-api | Low - add endpoints | +| Data dashboard | streamlit-dashboard | Low - add tabs | +| R analysis | rshiny-dashboard | Low - modify app.R | +| File processing | file-processor | Low - add processors | +| Something else | CUSTOM_APP.md | Medium - from scratch | diff --git a/features/src/llm-context/templates/README.md b/features/src/llm-context/templates/README.md new file mode 100644 index 000000000..eaec81d60 --- /dev/null +++ 
b/features/src/llm-context/templates/README.md @@ -0,0 +1,100 @@ +# Workbench App Templates + +Pre-built application templates for Verily Workbench with workspace resource integration. + +## Available Templates + +| Template | Description | Port | Complexity | +|----------|-------------|------|------------| +| [flask-api](./flask-api/) | REST API with Flask for data processing | 8080 | Simple | +| [streamlit-dashboard](./streamlit-dashboard/) | Interactive data dashboard with Streamlit | 8501 | Simple | +| [rshiny-dashboard](./rshiny-dashboard/) | R-based interactive dashboard with Shiny | 3838 | Simple | +| [file-processor](./file-processor/) | File upload, validation, and GCS storage | 8080 | Simple | + +## Features + +All templates include: + +- ✅ **Workspace Integration**: Auto-discovery of GCS buckets and BigQuery datasets +- ✅ **Environment Variables**: `WORKBENCH_` for all resources +- ✅ **LLM Context**: Compatible with `llm-context` feature for Claude/Gemini +- ✅ **Standard Structure**: Consistent devcontainer configuration +- ✅ **Documentation**: README with usage examples + +## Quick Start + +1. Choose a template that matches your use case +2. Copy the template folder to your repository +3. Customize the application code +4. 
Deploy to Workbench + +## Template Structure + +Each template follows this structure: + +``` +template-name/ +├── manifest.yaml # Template metadata & capabilities +├── devcontainer-template.json # Workbench UI registration +├── .devcontainer.json # Devcontainer configuration +├── docker-compose.yaml # Container setup +├── Dockerfile # Build instructions +├── app/ # Application code +│ ├── main.py (or app.R) +│ └── requirements.txt +└── README.md # Usage documentation +``` + +## Workspace Resource Access + +### Python + +```python +import os + +# Get all workspace resources +resources = { + k.replace("WORKBENCH_", ""): v + for k, v in os.environ.items() + if k.startswith("WORKBENCH_") +} + +# Access specific resource +bucket_path = os.environ.get("WORKBENCH_my_bucket") +``` + +### R + +```r +# Get all workspace resources +resources <- Sys.getenv() +workbench_vars <- resources[grepl("^WORKBENCH_", names(resources))] + +# Access specific resource +bucket_path <- Sys.getenv("WORKBENCH_my_bucket") +``` + +## Customization + +1. **Add Dependencies**: Edit `requirements.txt` (Python) or `Dockerfile` (R packages) +2. **Change Port**: Update `docker-compose.yaml` and `.devcontainer.json` +3. **Add Features**: Include additional devcontainer features in `.devcontainer.json` + +## Deployment + +### Via Workbench UI + +1. Push your customized template to a GitHub repository +2. In Workbench, create a new app → Custom App +3. Enter repository URL, branch, and folder path +4. Launch the app + +### Template Manifest + +Each template includes a `manifest.yaml` with: +- **capabilities**: What the template can do +- **inputs**: Configuration options +- **complexity**: Simple, Medium, or Advanced +- **port**: Default exposed port + +This manifest can be used by LLMs to select appropriate templates based on user requirements. 
diff --git a/features/src/llm-context/templates/file-processor/.devcontainer.json b/features/src/llm-context/templates/file-processor/.devcontainer.json new file mode 100644 index 000000000..123061e79 --- /dev/null +++ b/features/src/llm-context/templates/file-processor/.devcontainer.json @@ -0,0 +1,35 @@ +{ + "name": "File Processor", + "dockerComposeFile": "docker-compose.yaml", + "service": "app", + "shutdownAction": "none", + "workspaceFolder": "/workspace", + "postCreateCommand": [ + "bash", "-c", + "./startupscript/post-startup.sh appuser /home/appuser \"${templateOption:cloud}\" \"${templateOption:login}\"" + ], + "postStartCommand": [ + "bash", "-c", + "./startupscript/remount-on-restart.sh appuser /home/appuser \"${templateOption:cloud}\" \"${templateOption:login}\"" + ], + "features": { + "./.devcontainer/features/workbench-tools": { + "cloud": "${templateOption:cloud}", + "username": "appuser", + "userHomeDir": "/home/appuser" + }, + "./.devcontainer/features/llm-context": { + "username": "appuser", + "userHomeDir": "/home/appuser" + } + }, + "remoteUser": "root", + "customizations": { + "workbench": { + "proxyTargetPort": 8080, + "opens": { + "extensions": [".py", ".json", ".yaml", ".md", ".csv"] + } + } + } +} diff --git a/features/src/llm-context/templates/file-processor/Dockerfile b/features/src/llm-context/templates/file-processor/Dockerfile new file mode 100644 index 000000000..de0660167 --- /dev/null +++ b/features/src/llm-context/templates/file-processor/Dockerfile @@ -0,0 +1,24 @@ +FROM python:3.11-slim + +RUN groupadd -r appuser && useradd -r -g appuser -d /home/appuser -m appuser + +RUN apt-get update && apt-get install -y --no-install-recommends \ + curl fuse \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +# Create directories for file processing +RUN mkdir -p /app/uploads /app/processed /app/schemas && \ + chown -R appuser:appuser /app + +COPY app/requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY app/ . 
+RUN chown -R appuser:appuser /app /home/appuser + +EXPOSE 8080 +USER appuser + +CMD ["python", "main.py"] diff --git a/features/src/llm-context/templates/file-processor/README.md b/features/src/llm-context/templates/file-processor/README.md new file mode 100644 index 000000000..7add92064 --- /dev/null +++ b/features/src/llm-context/templates/file-processor/README.md @@ -0,0 +1,64 @@ +# File Processor Template + +A file upload and processing template for Verily Workbench with GCS integration. + +## Features + +- **Drag & Drop Upload**: Easy file upload interface +- **Multi-format Support**: CSV, JSON, Excel files +- **Auto-processing**: Extracts metadata, row counts, column info +- **GCS Integration**: Save processed files to workspace buckets +- **Schema Validation**: Validate JSON against schemas + +## Endpoints + +| Endpoint | Method | Description | +|----------|--------|-------------| +| `/` | GET | Web UI for file upload | +| `/health` | GET | Health check | +| `/buckets` | GET | List workspace buckets | +| `/upload` | POST | Upload and process file | +| `/validate` | POST | Validate file against schema | + +## Supported File Types + +| Type | Extensions | Processing | +|------|------------|------------| +| CSV | `.csv` | Row/column counts, schema, null detection | +| JSON | `.json` | Type detection, key enumeration | +| Excel | `.xlsx`, `.xls` | Row/column counts, schema | + +## Customization + +1. Edit `app/main.py` to add processing logic +2. Update `app/requirements.txt` for additional libraries +3. Add validation schemas to `/app/schemas/` + +## Local Testing + +```bash +cd app && pip install -r requirements.txt && python main.py +``` + +Open http://localhost:8080 in your browser. 
+ +## Workspace Resources + +Workspace buckets are auto-discovered: +- `WORKBENCH_` environment variables +- Displayed in the web UI sidebar +- Used for automatic file storage + +## API Usage + +```bash +# Upload a file +curl -X POST http://localhost:8080/upload \ + -F "file=@data.csv" \ + -F "save_to_gcs=true" + +# Validate JSON against schema +curl -X POST http://localhost:8080/validate \ + -F "file=@data.json" \ + -F 'schema={"type": "object", "required": ["id", "name"]}' +``` diff --git a/features/src/llm-context/templates/file-processor/app/main.py b/features/src/llm-context/templates/file-processor/app/main.py new file mode 100644 index 000000000..660a22622 --- /dev/null +++ b/features/src/llm-context/templates/file-processor/app/main.py @@ -0,0 +1,450 @@ +#!/usr/bin/env python3 +""" +File Processor Template for Verily Workbench + +Upload, validate, transform, and store files with GCS integration. +""" + +import os +import json +import uuid +from datetime import datetime +from pathlib import Path + +from flask import Flask, request, jsonify, render_template_string +from google.cloud import storage +import pandas as pd +from jsonschema import validate, ValidationError + +app = Flask(__name__) + +# Configuration +UPLOAD_FOLDER = Path("/app/uploads") +PROCESSED_FOLDER = Path("/app/processed") +SCHEMAS_FOLDER = Path("/app/schemas") +MAX_CONTENT_LENGTH = 100 * 1024 * 1024 # 100MB + +app.config["MAX_CONTENT_LENGTH"] = MAX_CONTENT_LENGTH + +# ============================================================================= +# HTML TEMPLATE +# ============================================================================= + +HTML_TEMPLATE = """ + + + + File Processor + + + +

📁 File Processor

+

Upload, validate, transform, and store files in your Workbench buckets

+ +
+
+
+

📤 Drag & drop a file here, or click to select

+ +

+
+ + + + + +
+ + + +
+

📦 Available Workspace Buckets

+
Loading...
+
+ + + + +""" + +# ============================================================================= +# WORKSPACE HELPERS +# ============================================================================= + +def get_workspace_buckets(): + """Get GCS bucket paths from workspace environment.""" + return { + k.replace("WORKBENCH_", ""): v + for k, v in os.environ.items() + if k.startswith("WORKBENCH_") and v.startswith("gs://") + } + + +def get_gcs_client(): + return storage.Client() + + +def upload_to_gcs(local_path: Path, bucket_name: str, blob_name: str): + """Upload a file to GCS.""" + client = get_gcs_client() + bucket = client.bucket(bucket_name.replace("gs://", "")) + blob = bucket.blob(blob_name) + blob.upload_from_filename(str(local_path)) + return f"gs://{bucket.name}/{blob_name}" + +# ============================================================================= +# PROCESSING FUNCTIONS +# ============================================================================= + +def process_csv(file_path: Path) -> dict: + """Process and validate CSV file.""" + df = pd.read_csv(file_path) + return { + "rows": len(df), + "columns": len(df.columns), + "column_names": list(df.columns), + "dtypes": {col: str(dtype) for col, dtype in df.dtypes.items()}, + "null_counts": df.isnull().sum().to_dict(), + "sample": df.head(5).to_dict(orient="records") + } + + +def process_json(file_path: Path) -> dict: + """Process and validate JSON file.""" + with open(file_path) as f: + data = json.load(f) + + if isinstance(data, list): + return { + "type": "array", + "length": len(data), + "sample": data[:5] if len(data) > 5 else data + } + else: + return { + "type": "object", + "keys": list(data.keys()), + "sample": data + } + + +def process_excel(file_path: Path) -> dict: + """Process Excel file.""" + df = pd.read_excel(file_path) + return { + "rows": len(df), + "columns": len(df.columns), + "column_names": list(df.columns), + "sample": df.head(5).to_dict(orient="records") + } + +# 
# =============================================================================
# ROUTES
# =============================================================================

@app.route("/")
def index():
    """Serve the upload page by rendering the inline HTML_TEMPLATE string."""
    return render_template_string(HTML_TEMPLATE)


@app.route("/health")
def health():
    """Health check: always answers with {"status": "healthy"}."""
    return jsonify({"status": "healthy"})


@app.route("/buckets")
def list_buckets():
    """List available workspace buckets as JSON (see get_workspace_buckets)."""
    return jsonify(get_workspace_buckets())


@app.route("/upload", methods=["POST"])
def upload_file():
    """Upload a file, summarize its contents, and optionally copy it to GCS.

    Request (multipart/form-data):
        file:        the file to process (.csv, .json, .xlsx or .xls).
        save_to_gcs: optional; the string "true" (case-insensitive) uploads
                     the saved file to the first workspace bucket under
                     "processed/<stored filename>".

    Returns:
        200 with the processor's summary dict plus "message" and "filename",
        and "gcs_path" (or "warning" when no bucket is configured) if
        save_to_gcs was requested.
        400 when no file part is sent, the file has no usable name, or the
        extension is not supported.
        500 with {"error": <exception text>} when saving, parsing or the GCS
        upload raises.
    """
    if "file" not in request.files:
        return jsonify({"error": "No file provided"}), 400

    file = request.files["file"]

    # The filename is supplied by the client. Keep only its last path
    # component (treating "\" like "/") so it cannot introduce directories
    # into the on-disk path or the GCS object name. A missing name is
    # handled the same way as an empty one.
    original_name = Path((file.filename or "").replace("\\", "/")).name
    if not original_name:
        return jsonify({"error": "No file selected"}), 400

    # Pick the processor *before* writing anything, so rejected uploads do not
    # accumulate in UPLOAD_FOLDER.
    suffix = Path(original_name).suffix.lower()
    if suffix == ".csv":
        processor = process_csv
    elif suffix == ".json":
        processor = process_json
    elif suffix in (".xlsx", ".xls"):
        processor = process_excel
    else:
        return jsonify({"error": f"Unsupported file type: {suffix}"}), 400

    # Timestamp + short random id keep concurrent uploads of the same name apart.
    file_id = str(uuid.uuid4())[:8]
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{timestamp}_{file_id}_{original_name}"
    file_path = UPLOAD_FOLDER / filename

    try:
        # Saving is inside the try so that I/O failures are reported through
        # the same JSON error response as processing failures.
        file.save(file_path)

        result = processor(file_path)
        result["message"] = f"Successfully processed {original_name}"
        result["filename"] = filename

        # Optionally save to GCS
        save_to_gcs = request.form.get("save_to_gcs", "false").lower() == "true"
        if save_to_gcs:
            buckets = get_workspace_buckets()
            if buckets:
                # Use first available bucket
                bucket_name = next(iter(buckets.values()))
                blob_name = f"processed/{filename}"
                result["gcs_path"] = upload_to_gcs(file_path, bucket_name, blob_name)
            else:
                result["warning"] = "No GCS buckets found in workspace"

        return jsonify(result)

    except Exception as e:
        return jsonify({"error": str(e)}), 500

+@app.route("/validate", methods=["POST"]) +def validate_file(): + """Validate file against a JSON schema.""" + if "file" not in request.files: + return jsonify({"error": "No file provided"}), 400 + + if "schema" not in request.form: + return jsonify({"error": "No schema provided"}), 400 + + file = request.files["file"] + schema = json.loads(request.form["schema"]) + + try: + data = json.load(file) + validate(instance=data, schema=schema) + return jsonify({"valid": True, "message": "Validation passed"}) + except ValidationError as e: + return jsonify({"valid": False, "error": str(e.message)}) + except Exception as e: + return jsonify({"error": str(e)}), 500 + +# ============================================================================= +# MAIN +# ============================================================================= + +if __name__ == "__main__": + UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True) + PROCESSED_FOLDER.mkdir(parents=True, exist_ok=True) + + port = int(os.environ.get("PORT", 8080)) + app.run(host="0.0.0.0", port=port, debug=False) diff --git a/features/src/llm-context/templates/file-processor/app/requirements.txt b/features/src/llm-context/templates/file-processor/app/requirements.txt new file mode 100644 index 000000000..314a27c6e --- /dev/null +++ b/features/src/llm-context/templates/file-processor/app/requirements.txt @@ -0,0 +1,8 @@ +flask==3.0.0 +gunicorn==21.2.0 +google-cloud-storage==2.14.0 +google-cloud-bigquery==3.14.0 +pandas==2.1.4 +jsonschema==4.20.0 +pyarrow==14.0.2 +openpyxl==3.1.2 diff --git a/features/src/llm-context/templates/file-processor/devcontainer-template.json b/features/src/llm-context/templates/file-processor/devcontainer-template.json new file mode 100644 index 000000000..b84fca2bd --- /dev/null +++ b/features/src/llm-context/templates/file-processor/devcontainer-template.json @@ -0,0 +1,23 @@ +{ + "id": "file-processor", + "version": "1.0.0", + "name": "File Processor", + "description": "Upload, validate, and 
transform files with GCS integration", + "documentationURL": "https://github.com/aculotti-verily/wb-app-mcp-and-context/tree/master/src/templates/file-processor", + "licenseURL": "https://github.com/aculotti-verily/wb-app-mcp-and-context/blob/master/LICENSE", + "options": { + "cloud": { + "type": "string", + "description": "VM cloud environment", + "proposals": ["gcp", "aws"], + "default": "gcp" + }, + "login": { + "type": "string", + "description": "Whether to log in to workbench CLI", + "proposals": ["true", "false"], + "default": "false" + } + }, + "platforms": ["Any"] +} diff --git a/features/src/llm-context/templates/file-processor/docker-compose.yaml b/features/src/llm-context/templates/file-processor/docker-compose.yaml new file mode 100644 index 000000000..505717e81 --- /dev/null +++ b/features/src/llm-context/templates/file-processor/docker-compose.yaml @@ -0,0 +1,32 @@ +services: + app: + container_name: "application-server" + build: + context: . + dockerfile: Dockerfile + restart: always + volumes: + - .:/workspace:cached + - uploads:/app/uploads + - processed:/app/processed + ports: + - "8080:8080" + environment: + - MAX_UPLOAD_SIZE=100MB + - PYTHONUNBUFFERED=1 + networks: + - app-network + cap_add: + - SYS_ADMIN + devices: + - /dev/fuse + security_opt: + - apparmor:unconfined + +volumes: + uploads: + processed: + +networks: + app-network: + external: true diff --git a/features/src/llm-context/templates/file-processor/manifest.yaml b/features/src/llm-context/templates/file-processor/manifest.yaml new file mode 100644 index 000000000..7246d50a5 --- /dev/null +++ b/features/src/llm-context/templates/file-processor/manifest.yaml @@ -0,0 +1,33 @@ +id: file-processor +name: File Processor +description: Upload, validate, and transform files with GCS integration +version: 1.0.0 + +capabilities: + - file-upload + - file-validation + - data-transformation + - gcs-access + - csv-processing + - json-processing + +inputs: + - name: app_name + type: string + 
required: true + default: "file-processor" + + - name: validation_schema + type: object + required: false + description: JSON schema for file validation + + - name: output_bucket + type: resource + resource_type: GCS_BUCKET + required: false + description: Bucket to store processed files + +complexity: simple +estimated_build_time: 3min +port: 8080 diff --git a/features/src/llm-context/templates/flask-api/.devcontainer.json b/features/src/llm-context/templates/flask-api/.devcontainer.json new file mode 100644 index 000000000..484a56c6a --- /dev/null +++ b/features/src/llm-context/templates/flask-api/.devcontainer.json @@ -0,0 +1,35 @@ +{ + "name": "Flask REST API", + "dockerComposeFile": "docker-compose.yaml", + "service": "app", + "shutdownAction": "none", + "workspaceFolder": "/workspace", + "postCreateCommand": [ + "bash", "-c", + "./startupscript/post-startup.sh appuser /home/appuser \"${templateOption:cloud}\" \"${templateOption:login}\"" + ], + "postStartCommand": [ + "bash", "-c", + "./startupscript/remount-on-restart.sh appuser /home/appuser \"${templateOption:cloud}\" \"${templateOption:login}\"" + ], + "features": { + "./.devcontainer/features/workbench-tools": { + "cloud": "${templateOption:cloud}", + "username": "appuser", + "userHomeDir": "/home/appuser" + }, + "./.devcontainer/features/llm-context": { + "username": "appuser", + "userHomeDir": "/home/appuser" + } + }, + "remoteUser": "root", + "customizations": { + "workbench": { + "proxyTargetPort": 8080, + "opens": { + "extensions": [".py", ".json", ".yaml", ".md"] + } + } + } +} diff --git a/features/src/llm-context/templates/flask-api/Dockerfile b/features/src/llm-context/templates/flask-api/Dockerfile new file mode 100644 index 000000000..5571806ca --- /dev/null +++ b/features/src/llm-context/templates/flask-api/Dockerfile @@ -0,0 +1,32 @@ +FROM python:3.11-slim + +# Create non-root user +RUN groupadd -r appuser && useradd -r -g appuser -d /home/appuser -m appuser + +# Install system dependencies 
+RUN apt-get update && apt-get install -y --no-install-recommends \ + curl \ + fuse \ + && rm -rf /var/lib/apt/lists/* + +# Set working directory +WORKDIR /app + +# Copy requirements first for caching +COPY app/requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application code +COPY app/ . + +# Set ownership +RUN chown -R appuser:appuser /app /home/appuser + +# Expose port +EXPOSE 8080 + +# Run as non-root user +USER appuser + +# Start application +CMD ["python", "main.py"] diff --git a/features/src/llm-context/templates/flask-api/README.md b/features/src/llm-context/templates/flask-api/README.md new file mode 100644 index 000000000..6e1b5cf1a --- /dev/null +++ b/features/src/llm-context/templates/flask-api/README.md @@ -0,0 +1,48 @@ +# Flask REST API Template + +A REST API template for Verily Workbench with built-in support for GCS and BigQuery. + +## Endpoints + +| Endpoint | Method | Description | +|----------|--------|-------------| +| `/health` | GET | Health check | +| `/resources` | GET | List workspace resources | +| `/buckets//files` | GET | List files in bucket | +| `/buckets//upload` | POST | Upload file to bucket | +| `/bigquery/query` | POST | Run BigQuery query | +| `/bigquery/tables/` | GET | List tables in dataset | +| `/process` | POST | Custom processing endpoint | + +## Customization + +1. Edit `app/main.py` to add your endpoints +2. Update `app/requirements.txt` for additional dependencies +3. 
Modify `docker-compose.yaml` for environment variables + +## Local Testing + +```bash +cd app && pip install -r requirements.txt && python main.py +``` + +## Workspace Resources + +Access workspace buckets and datasets via environment variables: +- `WORKBENCH_` contains the resource path +- Use `GET /resources` to see all available resources + +## Example Usage + +```bash +# Check health +curl http://localhost:8080/health + +# List resources +curl http://localhost:8080/resources + +# Query BigQuery +curl -X POST http://localhost:8080/bigquery/query \ + -H "Content-Type: application/json" \ + -d '{"query": "SELECT * FROM `project.dataset.table` LIMIT 10"}' +``` diff --git a/features/src/llm-context/templates/flask-api/app/main.py b/features/src/llm-context/templates/flask-api/app/main.py new file mode 100644 index 000000000..e0a9ab528 --- /dev/null +++ b/features/src/llm-context/templates/flask-api/app/main.py @@ -0,0 +1,201 @@ +#!/usr/bin/env python3 +""" +Flask REST API Template for Verily Workbench + +This template provides a starting point for building REST APIs that +integrate with workspace resources (GCS buckets, BigQuery tables). +""" + +import os +import json +from flask import Flask, request, jsonify +from google.cloud import storage, bigquery + +app = Flask(__name__) + +# ============================================================================= +# WORKSPACE RESOURCE HELPERS +# ============================================================================= + +def get_workspace_resources(): + """ + Get workspace resources from environment variables. + + Workbench automatically sets WORKBENCH_ environment variables + for each resource in the workspace. 
+ """ + resources = {} + for key, value in os.environ.items(): + if key.startswith("WORKBENCH_"): + resource_name = key.replace("WORKBENCH_", "").lower() + resources[resource_name] = value + return resources + + +def get_bucket_client(): + """Get a GCS client for workspace buckets.""" + return storage.Client() + + +def get_bigquery_client(): + """Get a BigQuery client for workspace datasets.""" + return bigquery.Client() + + +# ============================================================================= +# API ENDPOINTS +# ============================================================================= + +@app.route("/health", methods=["GET"]) +def health(): + """Health check endpoint.""" + return jsonify({ + "status": "healthy", + "service": "flask-api" + }) + + +@app.route("/resources", methods=["GET"]) +def list_resources(): + """List all workspace resources available to this app.""" + return jsonify({ + "resources": get_workspace_resources() + }) + + +@app.route("/buckets//files", methods=["GET"]) +def list_bucket_files(bucket_name: str): + """ + List files in a workspace bucket. + + Example: GET /buckets/my-bucket/files + """ + try: + # Remove gs:// prefix if present + bucket_name = bucket_name.replace("gs://", "") + + client = get_bucket_client() + bucket = client.bucket(bucket_name) + + prefix = request.args.get("prefix", "") + blobs = bucket.list_blobs(prefix=prefix) + + files = [{"name": blob.name, "size": blob.size} for blob in blobs] + + return jsonify({ + "bucket": bucket_name, + "files": files, + "count": len(files) + }) + except Exception as e: + return jsonify({"error": str(e)}), 500 + + +@app.route("/buckets//upload", methods=["POST"]) +def upload_file(bucket_name: str): + """ + Upload a file to a workspace bucket. 
+ + Example: POST /buckets/my-bucket/upload + Body: multipart/form-data with 'file' field + """ + try: + if "file" not in request.files: + return jsonify({"error": "No file provided"}), 400 + + file = request.files["file"] + dest_path = request.form.get("path", file.filename) + + bucket_name = bucket_name.replace("gs://", "") + client = get_bucket_client() + bucket = client.bucket(bucket_name) + blob = bucket.blob(dest_path) + + blob.upload_from_file(file) + + return jsonify({ + "success": True, + "path": f"gs://{bucket_name}/{dest_path}" + }) + except Exception as e: + return jsonify({"error": str(e)}), 500 + + +@app.route("/bigquery/query", methods=["POST"]) +def run_query(): + """ + Run a BigQuery query. + + Example: POST /bigquery/query + Body: {"query": "SELECT * FROM `project.dataset.table` LIMIT 10"} + """ + try: + data = request.get_json() + query = data.get("query") + + if not query: + return jsonify({"error": "No query provided"}), 400 + + client = get_bigquery_client() + result = client.query(query).to_dataframe() + + return jsonify({ + "columns": list(result.columns), + "rows": result.to_dict(orient="records"), + "count": len(result) + }) + except Exception as e: + return jsonify({"error": str(e)}), 500 + + +@app.route("/bigquery/tables/", methods=["GET"]) +def list_tables(dataset: str): + """ + List tables in a BigQuery dataset. + + Example: GET /bigquery/tables/my-project.my-dataset + """ + try: + client = get_bigquery_client() + tables = client.list_tables(dataset) + + table_list = [{"table_id": t.table_id, "table_type": t.table_type} for t in tables] + + return jsonify({ + "dataset": dataset, + "tables": table_list, + "count": len(table_list) + }) + except Exception as e: + return jsonify({"error": str(e)}), 500 + + +@app.route("/process", methods=["POST"]) +def process_data(): + """ + Example data processing endpoint. + + Customize this endpoint for your specific use case. 
+ """ + try: + data = request.get_json() + + # TODO: Add your processing logic here + result = { + "input": data, + "processed": True, + "message": "Processing complete" + } + + return jsonify(result) + except Exception as e: + return jsonify({"error": str(e)}), 500 + + +# ============================================================================= +# MAIN +# ============================================================================= + +if __name__ == "__main__": + port = int(os.environ.get("PORT", 8080)) + app.run(host="0.0.0.0", port=port, debug=False) diff --git a/features/src/llm-context/templates/flask-api/app/requirements.txt b/features/src/llm-context/templates/flask-api/app/requirements.txt new file mode 100644 index 000000000..f283a3b96 --- /dev/null +++ b/features/src/llm-context/templates/flask-api/app/requirements.txt @@ -0,0 +1,6 @@ +flask==3.0.0 +gunicorn==21.2.0 +google-cloud-storage==2.14.0 +google-cloud-bigquery==3.14.0 +pandas==2.1.4 +pyarrow==14.0.2 diff --git a/features/src/llm-context/templates/flask-api/devcontainer-template.json b/features/src/llm-context/templates/flask-api/devcontainer-template.json new file mode 100644 index 000000000..c82d57371 --- /dev/null +++ b/features/src/llm-context/templates/flask-api/devcontainer-template.json @@ -0,0 +1,23 @@ +{ + "id": "flask-api", + "version": "1.0.0", + "name": "Flask REST API", + "description": "REST API with Flask for data processing and backend services", + "documentationURL": "https://github.com/aculotti-verily/wb-app-mcp-and-context/tree/master/src/templates/flask-api", + "licenseURL": "https://github.com/aculotti-verily/wb-app-mcp-and-context/blob/master/LICENSE", + "options": { + "cloud": { + "type": "string", + "description": "VM cloud environment", + "proposals": ["gcp", "aws"], + "default": "gcp" + }, + "login": { + "type": "string", + "description": "Whether to log in to workbench CLI", + "proposals": ["true", "false"], + "default": "false" + } + }, + "platforms": ["Any"] +} diff 
--git a/features/src/llm-context/templates/flask-api/docker-compose.yaml b/features/src/llm-context/templates/flask-api/docker-compose.yaml new file mode 100644 index 000000000..b442a1c87 --- /dev/null +++ b/features/src/llm-context/templates/flask-api/docker-compose.yaml @@ -0,0 +1,30 @@ +services: + app: + container_name: "application-server" + build: + context: . + dockerfile: Dockerfile + restart: always + volumes: + - .:/workspace:cached + - app-data:/home/appuser/data + ports: + - "8080:8080" + environment: + - FLASK_ENV=production + - PYTHONUNBUFFERED=1 + networks: + - app-network + cap_add: + - SYS_ADMIN + devices: + - /dev/fuse + security_opt: + - apparmor:unconfined + +volumes: + app-data: + +networks: + app-network: + external: true diff --git a/features/src/llm-context/templates/flask-api/manifest.yaml b/features/src/llm-context/templates/flask-api/manifest.yaml new file mode 100644 index 000000000..c9003e95c --- /dev/null +++ b/features/src/llm-context/templates/flask-api/manifest.yaml @@ -0,0 +1,41 @@ +id: flask-api +name: Flask REST API +description: REST API with Flask for data processing and backend services +version: 1.0.0 + +capabilities: + - rest-api + - json-processing + - file-upload + - bigquery-access + - gcs-access + - authentication + +inputs: + - name: app_name + type: string + required: true + description: Name of the application + default: "my-api" + + - name: endpoints + type: list + required: false + description: API endpoints to create + default: ["/health", "/process"] + + - name: gcs_buckets + type: list[resource] + resource_type: GCS_BUCKET + required: false + description: GCS buckets to access + + - name: bq_datasets + type: list[resource] + resource_type: BQ_DATASET + required: false + description: BigQuery datasets to access + +complexity: simple +estimated_build_time: 3min +port: 8080 diff --git a/features/src/llm-context/templates/rshiny-dashboard/.devcontainer.json 
b/features/src/llm-context/templates/rshiny-dashboard/.devcontainer.json new file mode 100644 index 000000000..5f5d36caa --- /dev/null +++ b/features/src/llm-context/templates/rshiny-dashboard/.devcontainer.json @@ -0,0 +1,35 @@ +{ + "name": "RShiny Dashboard", + "dockerComposeFile": "docker-compose.yaml", + "service": "app", + "shutdownAction": "none", + "workspaceFolder": "/workspace", + "postCreateCommand": [ + "bash", "-c", + "./startupscript/post-startup.sh shiny /home/shiny \"${templateOption:cloud}\" \"${templateOption:login}\"" + ], + "postStartCommand": [ + "bash", "-c", + "./startupscript/remount-on-restart.sh shiny /home/shiny \"${templateOption:cloud}\" \"${templateOption:login}\"" + ], + "features": { + "./.devcontainer/features/workbench-tools": { + "cloud": "${templateOption:cloud}", + "username": "shiny", + "userHomeDir": "/home/shiny" + }, + "./.devcontainer/features/llm-context": { + "username": "shiny", + "userHomeDir": "/home/shiny" + } + }, + "remoteUser": "root", + "customizations": { + "workbench": { + "proxyTargetPort": 3838, + "opens": { + "extensions": [".R", ".Rmd", ".json", ".yaml", ".md", ".csv"] + } + } + } +} diff --git a/features/src/llm-context/templates/rshiny-dashboard/Dockerfile b/features/src/llm-context/templates/rshiny-dashboard/Dockerfile new file mode 100644 index 000000000..7ff70049a --- /dev/null +++ b/features/src/llm-context/templates/rshiny-dashboard/Dockerfile @@ -0,0 +1,40 @@ +FROM rocker/shiny:4.3.2 + +# Install system dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + libcurl4-openssl-dev \ + libssl-dev \ + libxml2-dev \ + fuse \ + && rm -rf /var/lib/apt/lists/* + +# Install R packages +RUN R -e "install.packages(c( \ + 'shiny', \ + 'shinydashboard', \ + 'DT', \ + 'plotly', \ + 'ggplot2', \ + 'dplyr', \ + 'tidyr', \ + 'bigrquery', \ + 'googleCloudStorageR' \ +), repos='https://cran.rstudio.com/')" + +# Create app directory +RUN mkdir -p /srv/shiny-server/app + +# Copy application 
+COPY app/ /srv/shiny-server/ + +# Copy Shiny server config +COPY shiny-server.conf /etc/shiny-server/shiny-server.conf + +# Set permissions +RUN chown -R shiny:shiny /srv/shiny-server + +EXPOSE 3838 + +USER shiny + +CMD ["/usr/bin/shiny-server"] diff --git a/features/src/llm-context/templates/rshiny-dashboard/README.md b/features/src/llm-context/templates/rshiny-dashboard/README.md new file mode 100644 index 000000000..69757368f --- /dev/null +++ b/features/src/llm-context/templates/rshiny-dashboard/README.md @@ -0,0 +1,72 @@ +# RShiny Dashboard Template + +An interactive R-based dashboard template for Verily Workbench with Shiny. + +## Features + +- **Data Explorer**: Upload and explore CSV files +- **Visualization**: Create interactive charts with plotly +- **Workspace Resources**: View connected buckets and datasets +- **R Statistical Analysis**: Full R environment for data analysis + +## Tabs + +| Tab | Description | +|-----|-------------| +| Overview | Dashboard summary with resource counts | +| Data Explorer | Upload CSV files, view data tables | +| Visualization | Create scatter, line, bar, histogram charts | +| Resources | View all workspace resources | + +## R Packages Included + +- `shiny` & `shinydashboard` - UI framework +- `DT` - Interactive data tables +- `plotly` & `ggplot2` - Visualization +- `dplyr` & `tidyr` - Data manipulation +- `bigrquery` - BigQuery integration +- `googleCloudStorageR` - GCS integration + +## Customization + +1. Edit `app/app.R` to add new features +2. Modify `Dockerfile` to add R packages +3. 
Update dashboard layout in the UI section + +## Local Testing + +```bash +R -e "shiny::runApp('app', port=3838)" +``` + +## Workspace Resources + +Access workspace resources via environment variables: +- `WORKBENCH_` contains the resource path +- Use `Sys.getenv()` to access in R code + +## BigQuery Access Example + +```r +library(bigrquery) + +# Run a query +query <- "SELECT * FROM `project.dataset.table` LIMIT 100" +result <- bq_project_query("your-project", query) +df <- bq_table_download(result) +``` + +## GCS Access Example + +```r +library(googleCloudStorageR) + +# Set bucket +gcs_global_bucket("your-bucket-name") + +# List objects +objects <- gcs_list_objects() + +# Download file +gcs_get_object("path/to/file.csv", saveToDisk = "local_file.csv") +``` diff --git a/features/src/llm-context/templates/rshiny-dashboard/app/app.R b/features/src/llm-context/templates/rshiny-dashboard/app/app.R new file mode 100644 index 000000000..8a607b69f --- /dev/null +++ b/features/src/llm-context/templates/rshiny-dashboard/app/app.R @@ -0,0 +1,222 @@ +# ============================================================================= +# RShiny Dashboard Template for Verily Workbench +# ============================================================================= + +library(shiny) +library(shinydashboard) +library(DT) +library(plotly) +library(ggplot2) +library(dplyr) + +# ============================================================================= +# WORKSPACE HELPERS +# ============================================================================= + +get_workspace_resources <- function() { + env_vars <- Sys.getenv() + workbench_vars <- env_vars[grepl("^WORKBENCH_", names(env_vars))] + names(workbench_vars) <- gsub("^WORKBENCH_", "", names(workbench_vars)) + as.list(workbench_vars) +} + +# Get workspace resources +resources <- get_workspace_resources() + +# ============================================================================= +# UI +# 
============================================================================= + +ui <- dashboardPage( + dashboardHeader(title = "Workbench Dashboard"), + + dashboardSidebar( + sidebarMenu( + menuItem("Overview", tabName = "overview", icon = icon("dashboard")), + menuItem("Data Explorer", tabName = "data", icon = icon("table")), + menuItem("Visualization", tabName = "viz", icon = icon("chart-line")), + menuItem("Resources", tabName = "resources", icon = icon("cloud")) + ) + ), + + dashboardBody( + tabItems( + # Overview Tab + tabItem( + tabName = "overview", + fluidRow( + box( + title = "Welcome to Your Workbench Dashboard", + status = "primary", + solidHeader = TRUE, + width = 12, + p("This RShiny template integrates with your Workbench workspace resources."), + p("Use the sidebar to navigate between data exploration and visualization.") + ) + ), + fluidRow( + valueBoxOutput("resource_count"), + valueBoxOutput("bucket_count"), + valueBoxOutput("dataset_count") + ) + ), + + # Data Explorer Tab + tabItem( + tabName = "data", + fluidRow( + box( + title = "Upload Data", + status = "info", + solidHeader = TRUE, + width = 4, + fileInput("file_upload", "Choose CSV File", accept = ".csv"), + actionButton("load_data", "Load Data", class = "btn-primary") + ), + box( + title = "Data Preview", + status = "success", + solidHeader = TRUE, + width = 8, + DTOutput("data_table") + ) + ) + ), + + # Visualization Tab + tabItem( + tabName = "viz", + fluidRow( + box( + title = "Chart Settings", + status = "warning", + solidHeader = TRUE, + width = 3, + selectInput("x_var", "X Variable", choices = NULL), + selectInput("y_var", "Y Variable", choices = NULL), + selectInput("chart_type", "Chart Type", + choices = c("Scatter", "Line", "Bar", "Histogram")), + actionButton("create_chart", "Create Chart", class = "btn-success") + ), + box( + title = "Chart", + status = "primary", + solidHeader = TRUE, + width = 9, + plotlyOutput("main_chart", height = "500px") + ) + ) + ), + + # Resources Tab 
+ tabItem( + tabName = "resources", + fluidRow( + box( + title = "Workspace Resources", + status = "info", + solidHeader = TRUE, + width = 12, + DTOutput("resources_table") + ) + ) + ) + ) + ) +) + +# ============================================================================= +# SERVER +# ============================================================================= + +server <- function(input, output, session) { + + # Reactive values + data <- reactiveVal(NULL) + + # Load data from file upload + observeEvent(input$load_data, { + req(input$file_upload) + df <- read.csv(input$file_upload$datapath) + data(df) + + # Update variable selectors + updateSelectInput(session, "x_var", choices = names(df)) + updateSelectInput(session, "y_var", choices = names(df)) + }) + + # Data table output + output$data_table <- renderDT({ + req(data()) + datatable(data(), options = list(pageLength = 10, scrollX = TRUE)) + }) + + # Value boxes + output$resource_count <- renderValueBox({ + valueBox( + length(resources), + "Workspace Resources", + icon = icon("folder"), + color = "blue" + ) + }) + + output$bucket_count <- renderValueBox({ + bucket_count <- sum(grepl("^gs://", unlist(resources))) + valueBox( + bucket_count, + "GCS Buckets", + icon = icon("cloud"), + color = "green" + ) + }) + + output$dataset_count <- renderValueBox({ + dataset_count <- sum(grepl("bigquery://", unlist(resources))) + valueBox( + dataset_count, + "BigQuery Datasets", + icon = icon("database"), + color = "purple" + ) + }) + + # Resources table + output$resources_table <- renderDT({ + df <- data.frame( + Name = names(resources), + Path = unlist(resources), + stringsAsFactors = FALSE + ) + datatable(df, options = list(pageLength = 20)) + }) + + # Create chart + observeEvent(input$create_chart, { + req(data(), input$x_var, input$y_var) + + df <- data() + + output$main_chart <- renderPlotly({ + p <- switch( + input$chart_type, + "Scatter" = ggplot(df, aes_string(x = input$x_var, y = input$y_var)) + + 
geom_point(alpha = 0.6), + "Line" = ggplot(df, aes_string(x = input$x_var, y = input$y_var)) + + geom_line(), + "Bar" = ggplot(df, aes_string(x = input$x_var, y = input$y_var)) + + geom_bar(stat = "identity"), + "Histogram" = ggplot(df, aes_string(x = input$x_var)) + + geom_histogram(bins = 30) + ) + + ggplotly(p + theme_minimal()) + }) + }) +} + +# ============================================================================= +# RUN APP +# ============================================================================= + +shinyApp(ui = ui, server = server) diff --git a/features/src/llm-context/templates/rshiny-dashboard/devcontainer-template.json b/features/src/llm-context/templates/rshiny-dashboard/devcontainer-template.json new file mode 100644 index 000000000..e2947a096 --- /dev/null +++ b/features/src/llm-context/templates/rshiny-dashboard/devcontainer-template.json @@ -0,0 +1,23 @@ +{ + "id": "rshiny-dashboard", + "version": "1.0.0", + "name": "RShiny Dashboard", + "description": "Interactive R-based dashboard with Shiny for statistical analysis and visualization", + "documentationURL": "https://github.com/aculotti-verily/wb-app-mcp-and-context/tree/master/src/templates/rshiny-dashboard", + "licenseURL": "https://github.com/aculotti-verily/wb-app-mcp-and-context/blob/master/LICENSE", + "options": { + "cloud": { + "type": "string", + "description": "VM cloud environment", + "proposals": ["gcp", "aws"], + "default": "gcp" + }, + "login": { + "type": "string", + "description": "Whether to log in to workbench CLI", + "proposals": ["true", "false"], + "default": "false" + } + }, + "platforms": ["Any"] +} diff --git a/features/src/llm-context/templates/rshiny-dashboard/docker-compose.yaml b/features/src/llm-context/templates/rshiny-dashboard/docker-compose.yaml new file mode 100644 index 000000000..7802142d7 --- /dev/null +++ b/features/src/llm-context/templates/rshiny-dashboard/docker-compose.yaml @@ -0,0 +1,29 @@ +services: + app: + container_name: 
"application-server" + build: + context: . + dockerfile: Dockerfile + restart: always + volumes: + - .:/workspace:cached + - app-data:/home/shiny/data + ports: + - "3838:3838" + environment: + - SHINY_LOG_LEVEL=TRACE + networks: + - app-network + cap_add: + - SYS_ADMIN + devices: + - /dev/fuse + security_opt: + - apparmor:unconfined + +volumes: + app-data: + +networks: + app-network: + external: true diff --git a/features/src/llm-context/templates/rshiny-dashboard/manifest.yaml b/features/src/llm-context/templates/rshiny-dashboard/manifest.yaml new file mode 100644 index 000000000..9d69bfda3 --- /dev/null +++ b/features/src/llm-context/templates/rshiny-dashboard/manifest.yaml @@ -0,0 +1,39 @@ +id: rshiny-dashboard +name: RShiny Dashboard +description: Interactive R-based dashboard with Shiny +version: 1.0.0 + +capabilities: + - data-visualization + - interactive-ui + - statistical-analysis + - r-language + - bigquery-access + - gcs-access + +inputs: + - name: app_name + type: string + required: true + default: "my-shiny-app" + + - name: dashboard_title + type: string + required: false + default: "R Shiny Dashboard" + + - name: gcs_buckets + type: list[resource] + resource_type: GCS_BUCKET + required: false + description: GCS buckets to access + + - name: bq_datasets + type: list[resource] + resource_type: BQ_DATASET + required: false + description: BigQuery datasets to access + +complexity: simple +estimated_build_time: 5min +port: 3838 diff --git a/features/src/llm-context/templates/rshiny-dashboard/shiny-server.conf b/features/src/llm-context/templates/rshiny-dashboard/shiny-server.conf new file mode 100644 index 000000000..b09f57a12 --- /dev/null +++ b/features/src/llm-context/templates/rshiny-dashboard/shiny-server.conf @@ -0,0 +1,14 @@ +# Define the user we should use when spawning R Shiny processes +run_as shiny; + +# Define a top-level server which will listen on a port +server { + listen 3838; + + # Define the location available at the base URL + location / 
{ + site_dir /srv/shiny-server; + log_dir /var/log/shiny-server; + directory_index on; + } +} diff --git a/features/src/llm-context/templates/streamlit-dashboard/.devcontainer.json b/features/src/llm-context/templates/streamlit-dashboard/.devcontainer.json new file mode 100644 index 000000000..e4bda8a3c --- /dev/null +++ b/features/src/llm-context/templates/streamlit-dashboard/.devcontainer.json @@ -0,0 +1,35 @@ +{ + "name": "Streamlit Dashboard", + "dockerComposeFile": "docker-compose.yaml", + "service": "app", + "shutdownAction": "none", + "workspaceFolder": "/workspace", + "postCreateCommand": [ + "bash", "-c", + "./startupscript/post-startup.sh appuser /home/appuser \"${templateOption:cloud}\" \"${templateOption:login}\"" + ], + "postStartCommand": [ + "bash", "-c", + "./startupscript/remount-on-restart.sh appuser /home/appuser \"${templateOption:cloud}\" \"${templateOption:login}\"" + ], + "features": { + "./.devcontainer/features/workbench-tools": { + "cloud": "${templateOption:cloud}", + "username": "appuser", + "userHomeDir": "/home/appuser" + }, + "./.devcontainer/features/llm-context": { + "username": "appuser", + "userHomeDir": "/home/appuser" + } + }, + "remoteUser": "root", + "customizations": { + "workbench": { + "proxyTargetPort": 8501, + "opens": { + "extensions": [".py", ".json", ".yaml", ".md", ".csv"] + } + } + } +} diff --git a/features/src/llm-context/templates/streamlit-dashboard/Dockerfile b/features/src/llm-context/templates/streamlit-dashboard/Dockerfile new file mode 100644 index 000000000..d0fbbb7d0 --- /dev/null +++ b/features/src/llm-context/templates/streamlit-dashboard/Dockerfile @@ -0,0 +1,19 @@ +FROM python:3.11-slim + +RUN groupadd -r appuser && useradd -r -g appuser -d /home/appuser -m appuser + +RUN apt-get update && apt-get install -y --no-install-recommends \ + curl fuse \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app +COPY app/requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY app/ . 
+RUN chown -R appuser:appuser /app /home/appuser + +EXPOSE 8501 +USER appuser + +CMD ["streamlit", "run", "main.py", "--server.port=8501", "--server.address=0.0.0.0"] diff --git a/features/src/llm-context/templates/streamlit-dashboard/README.md b/features/src/llm-context/templates/streamlit-dashboard/README.md new file mode 100644 index 000000000..afe1f5e63 --- /dev/null +++ b/features/src/llm-context/templates/streamlit-dashboard/README.md @@ -0,0 +1,43 @@ +# Streamlit Dashboard Template + +An interactive data dashboard template for Verily Workbench with GCS and BigQuery integration. + +## Features + +- **GCS File Browser**: Browse and preview files from workspace buckets +- **BigQuery Explorer**: Run SQL queries and view results +- **Data Visualization**: Create charts from uploaded CSV or query results +- **Workspace Resources**: Auto-discovery of workspace buckets and datasets + +## Tabs + +| Tab | Description | +|-----|-------------| +| GCS Files | Browse bucket contents, preview CSV files | +| BigQuery | Run SQL queries, view results in tables | +| Visualize | Create line, bar, or scatter charts | + +## Customization + +1. Edit `app/main.py` to add new visualizations +2. Update `app/requirements.txt` for additional libraries +3. Add new tabs for custom functionality + +## Local Testing + +```bash +cd app && pip install -r requirements.txt && streamlit run main.py +``` + +## Workspace Resources + +Access workspace resources via environment variables: +- Each `WORKBENCH_<resource_name>` variable contains that resource's path (for example, a `gs://` bucket URL) +- Resources are auto-displayed in the sidebar + +## Example Usage + +1. Select a bucket in the GCS Files tab +2. Browse files and preview CSVs +3. Run BigQuery queries in the BigQuery tab +4. 
Visualize data in the Visualize tab diff --git a/features/src/llm-context/templates/streamlit-dashboard/app/main.py b/features/src/llm-context/templates/streamlit-dashboard/app/main.py new file mode 100644 index 000000000..775a26f65 --- /dev/null +++ b/features/src/llm-context/templates/streamlit-dashboard/app/main.py @@ -0,0 +1,183 @@ +#!/usr/bin/env python3 +""" +Streamlit Dashboard Template for Verily Workbench + +Interactive data visualization with workspace resource integration. +""" + +import os +import streamlit as st +import pandas as pd +from google.cloud import storage, bigquery + +# ============================================================================= +# PAGE CONFIG +# ============================================================================= + +st.set_page_config( + page_title="Workbench Dashboard", + page_icon="📊", + layout="wide" +) + +# ============================================================================= +# WORKSPACE HELPERS +# ============================================================================= + +@st.cache_resource +def get_gcs_client(): + return storage.Client() + +@st.cache_resource +def get_bq_client(): + return bigquery.Client() + +def get_workspace_resources(): + """Get all WORKBENCH_ environment variables.""" + return { + k.replace("WORKBENCH_", ""): v + for k, v in os.environ.items() + if k.startswith("WORKBENCH_") + } + +# ============================================================================= +# SIDEBAR: RESOURCE BROWSER +# ============================================================================= + +st.sidebar.title("🗂️ Workspace Resources") + +resources = get_workspace_resources() +if resources: + st.sidebar.markdown("**Available Resources:**") + for name, path in resources.items(): + st.sidebar.code(f"{name}: {path}") +else: + st.sidebar.info("No workspace resources found") + +# ============================================================================= +# MAIN CONTENT +# 
============================================================================= + +st.title("📊 Data Dashboard") +st.markdown("Interactive data exploration for your Workbench workspace") + +# Tabs for different data sources +tab1, tab2, tab3 = st.tabs(["📁 GCS Files", "📊 BigQuery", "📈 Visualize"]) + +# ----------------------------------------------------------------------------- +# TAB 1: GCS FILE BROWSER +# ----------------------------------------------------------------------------- + +with tab1: + st.header("Cloud Storage Browser") + + # Get buckets from workspace resources + buckets = [v for k, v in resources.items() if v.startswith("gs://")] + + if buckets: + selected_bucket = st.selectbox("Select Bucket", buckets) + + if selected_bucket: + bucket_name = selected_bucket.replace("gs://", "") + + try: + client = get_gcs_client() + bucket = client.bucket(bucket_name) + blobs = list(bucket.list_blobs(max_results=100)) + + if blobs: + files_df = pd.DataFrame([ + {"Name": b.name, "Size (KB)": b.size / 1024, "Updated": b.updated} + for b in blobs + ]) + st.dataframe(files_df, use_container_width=True) + + # File preview + csv_files = [b.name for b in blobs if b.name.endswith('.csv')] + if csv_files: + selected_file = st.selectbox("Preview CSV", csv_files) + if st.button("Load File"): + blob = bucket.blob(selected_file) + data = blob.download_as_text() + df = pd.read_csv(pd.io.common.StringIO(data)) + st.dataframe(df.head(100)) + else: + st.info("Bucket is empty") + except Exception as e: + st.error(f"Error accessing bucket: {e}") + else: + st.info("No GCS buckets found in workspace resources") + +# ----------------------------------------------------------------------------- +# TAB 2: BIGQUERY EXPLORER +# ----------------------------------------------------------------------------- + +with tab2: + st.header("BigQuery Explorer") + + query = st.text_area( + "Enter SQL Query", + value="SELECT * FROM `your-project.your-dataset.your-table` LIMIT 100", + height=150 + ) + + if 
st.button("Run Query"): + try: + client = get_bq_client() + with st.spinner("Running query..."): + df = client.query(query).to_dataframe() + + st.success(f"Query returned {len(df)} rows") + st.dataframe(df, use_container_width=True) + + # Store in session state for visualization + st.session_state["query_result"] = df + except Exception as e: + st.error(f"Query error: {e}") + +# ----------------------------------------------------------------------------- +# TAB 3: VISUALIZATION +# ----------------------------------------------------------------------------- + +with tab3: + st.header("Data Visualization") + + # File uploader for local CSV + uploaded_file = st.file_uploader("Upload CSV", type=["csv"]) + + if uploaded_file: + df = pd.read_csv(uploaded_file) + st.session_state["viz_data"] = df + + # Use query results or uploaded data + if "viz_data" in st.session_state: + df = st.session_state["viz_data"] + elif "query_result" in st.session_state: + df = st.session_state["query_result"] + else: + st.info("Upload a CSV or run a BigQuery query to visualize data") + st.stop() + + # Column selection + col1, col2 = st.columns(2) + with col1: + x_col = st.selectbox("X Axis", df.columns) + with col2: + y_col = st.selectbox("Y Axis", [c for c in df.columns if c != x_col]) + + chart_type = st.radio("Chart Type", ["Line", "Bar", "Scatter"], horizontal=True) + + # Create chart + if chart_type == "Line": + st.line_chart(df.set_index(x_col)[y_col]) + elif chart_type == "Bar": + st.bar_chart(df.set_index(x_col)[y_col]) + else: + st.scatter_chart(df, x=x_col, y=y_col) + +# ============================================================================= +# FOOTER +# ============================================================================= + +st.markdown("---") +st.caption("Powered by Streamlit | Verily Workbench") diff --git a/features/src/llm-context/templates/streamlit-dashboard/app/requirements.txt b/features/src/llm-context/templates/streamlit-dashboard/app/requirements.txt new 
file mode 100644 index 000000000..cf28aae9a --- /dev/null +++ b/features/src/llm-context/templates/streamlit-dashboard/app/requirements.txt @@ -0,0 +1,7 @@ +streamlit==1.29.0 +google-cloud-storage==2.14.0 +google-cloud-bigquery==3.14.0 +pandas==2.1.4 +plotly==5.18.0 +altair==5.2.0 +pyarrow==14.0.2 diff --git a/features/src/llm-context/templates/streamlit-dashboard/devcontainer-template.json b/features/src/llm-context/templates/streamlit-dashboard/devcontainer-template.json new file mode 100644 index 000000000..6333709c4 --- /dev/null +++ b/features/src/llm-context/templates/streamlit-dashboard/devcontainer-template.json @@ -0,0 +1,23 @@ +{ + "id": "streamlit-dashboard", + "version": "1.0.0", + "name": "Streamlit Dashboard", + "description": "Interactive data dashboard with Streamlit for visualization and exploration", + "documentationURL": "https://github.com/aculotti-verily/wb-app-mcp-and-context/tree/master/src/templates/streamlit-dashboard", + "licenseURL": "https://github.com/aculotti-verily/wb-app-mcp-and-context/blob/master/LICENSE", + "options": { + "cloud": { + "type": "string", + "description": "VM cloud environment", + "proposals": ["gcp", "aws"], + "default": "gcp" + }, + "login": { + "type": "string", + "description": "Whether to log in to workbench CLI", + "proposals": ["true", "false"], + "default": "false" + } + }, + "platforms": ["Any"] +} diff --git a/features/src/llm-context/templates/streamlit-dashboard/docker-compose.yaml b/features/src/llm-context/templates/streamlit-dashboard/docker-compose.yaml new file mode 100644 index 000000000..5eb6b6cac --- /dev/null +++ b/features/src/llm-context/templates/streamlit-dashboard/docker-compose.yaml @@ -0,0 +1,31 @@ +services: + app: + container_name: "application-server" + build: + context: . 
+ dockerfile: Dockerfile + restart: always + volumes: + - .:/workspace:cached + - app-data:/home/appuser/data + ports: + - "8501:8501" + environment: + - STREAMLIT_SERVER_PORT=8501 + - STREAMLIT_SERVER_ADDRESS=0.0.0.0 + - STREAMLIT_SERVER_HEADLESS=true + networks: + - app-network + cap_add: + - SYS_ADMIN + devices: + - /dev/fuse + security_opt: + - apparmor:unconfined + +volumes: + app-data: + +networks: + app-network: + external: true diff --git a/features/src/llm-context/templates/streamlit-dashboard/manifest.yaml b/features/src/llm-context/templates/streamlit-dashboard/manifest.yaml new file mode 100644 index 000000000..cfbc11f86 --- /dev/null +++ b/features/src/llm-context/templates/streamlit-dashboard/manifest.yaml @@ -0,0 +1,39 @@ +id: streamlit-dashboard +name: Streamlit Dashboard +description: Interactive data dashboard with Streamlit +version: 1.0.0 + +capabilities: + - data-visualization + - interactive-ui + - file-upload + - bigquery-access + - gcs-access + - charts-and-graphs + +inputs: + - name: app_name + type: string + required: true + default: "my-dashboard" + + - name: dashboard_title + type: string + required: false + default: "Data Dashboard" + + - name: gcs_buckets + type: list[resource] + resource_type: GCS_BUCKET + required: false + description: GCS buckets to access + + - name: bq_datasets + type: list[resource] + resource_type: BQ_DATASET + required: false + description: BigQuery datasets to access + +complexity: simple +estimated_build_time: 3min +port: 8501 From 1307503cc57c1c89ac7708e857cb51eb6d89a75c Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Wed, 18 Feb 2026 12:06:45 -0500 Subject: [PATCH 20/86] Add critical URL formatting guidance for Workbench apps When generating URLs for apps/proxies inside Workbench, must use: https://workbench.verily.com/app/[UUID]/proxy/[PORT]/[PATH] Common mistake: Using localhost or custom domain patterns which fail with 'Bad Request' error. 
--- features/src/llm-context/generate-context.sh | 39 +++ features/src/llm-context/skills/CUSTOM_APP.md | 291 ++++++++++++++++++ 2 files changed, 330 insertions(+) create mode 100644 features/src/llm-context/skills/CUSTOM_APP.md diff --git a/features/src/llm-context/generate-context.sh b/features/src/llm-context/generate-context.sh index 593c51bcd..b5a68eca7 100755 --- a/features/src/llm-context/generate-context.sh +++ b/features/src/llm-context/generate-context.sh @@ -1003,6 +1003,45 @@ wb app describe # App details --- +## ⚠️ Workbench App URLs (CRITICAL) + +**When generating URLs for apps, proxies, or any web content running inside this Workbench app, you MUST use this exact format:** + +\`\`\` +https://workbench.verily.com/app/[APP_UUID]/proxy/[PORT]/[PATH] +\`\`\` + +### Correct Examples +\`\`\` +https://workbench.verily.com/app/abc123-def456/proxy/8080/ +https://workbench.verily.com/app/abc123-def456/proxy/8501/index.html +https://workbench.verily.com/app/abc123-def456/proxy/3000/api/data +\`\`\` + +### ❌ WRONG Formats (These will fail with "Bad Request") +\`\`\` +https://abc123-def456.workbench-app.verily.com/ ← WRONG +https://workbench-app.verily.com/abc123-def456/ ← WRONG +http://localhost:8080/ ← WRONG (not accessible externally) +https://abc123-def456/workbench.verily.com/ ← WRONG +\`\`\` + +### How to Get the Current App UUID +\`\`\`bash +# The app UUID is in your current URL or can be found via: +echo \$WORKBENCH_APP_ID # If available as env var +\`\`\` + +### When to Use This +- Opening HTML files in a new tab +- Running Flask/Streamlit/Shiny apps +- Any code that serves content on a port +- Generating clickable links for users + +**Always construct URLs using the proxy format above, never localhost or custom domain patterns.** + +--- + ## Creating Custom Apps > **IMPORTANT: When a user asks to create an app, turn code into an app, or build something deployable, follow this decision process:** diff --git a/features/src/llm-context/skills/CUSTOM_APP.md 
b/features/src/llm-context/skills/CUSTOM_APP.md new file mode 100644 index 000000000..5906dd4ec --- /dev/null +++ b/features/src/llm-context/skills/CUSTOM_APP.md @@ -0,0 +1,291 @@ +# Creating Custom Workbench Apps + +**Practical guide for creating simple, reliable Workbench apps.** + +> **When to use this guide:** For simple apps (Flask APIs, static sites, custom tools). +> For apps needing Workbench CLI, gcloud, or Jupyter, see the [full-featured approach](https://github.com/verily-src/workbench-app-devcontainers). + +## TL;DR - The Minimal Pattern That Works + +Workbench custom apps need exactly **three things**: +1. Container named `application-server` +2. Connected to `app-network` (external Docker network) +3. HTTP server on a port + +**That's it.** Everything else is optional (and often causes problems). + +--- + +## The Minimal Working Pattern (Copy This) + +### File 1: `.devcontainer.json` +```json +{ + "name": "Your App Name", + "dockerComposeFile": "docker-compose.yaml", + "service": "app", + "shutdownAction": "none", + "workspaceFolder": "/workspace", + "remoteUser": "root" +} +``` + +### File 2: `docker-compose.yaml` +```yaml +services: + app: + container_name: "application-server" + build: + context: ../.. + dockerfile: src/YOUR-APP-NAME/Dockerfile + restart: always + ports: + - "8080:8080" + networks: + - app-network + +networks: + app-network: + external: true +``` + +### File 3: `Dockerfile` +```dockerfile +FROM python:3.11-slim + +WORKDIR /app + +COPY src/YOUR-APP-NAME/app/requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY src/YOUR-APP-NAME/app/ . 
+ +EXPOSE 8080 + +CMD ["python", "your_app.py"] +``` + +### File 4: `devcontainer-template.json` +```json +{ + "id": "your-app-name", + "description": "Your app description", + "version": "1.0.0", + "name": "Your App Name", + "options": {}, + "platforms": ["Any"] +} +``` + +--- + +## Directory Structure + +``` +src/YOUR-APP-NAME/ +├── .devcontainer.json +├── devcontainer-template.json +├── docker-compose.yaml +├── Dockerfile +├── README.md +└── app/ + ├── your_app.py + ├── requirements.txt + └── (other files) +``` + +--- + +## What NOT To Do (Lessons Learned) + +### DON'T use complex base images unless needed +❌ `workbench-jupyter` base image - Has its own startup config that conflicts with CMD overrides +✅ `python:3.11-slim` - Clean, simple, no surprises + +### DON'T use devcontainer features +❌ Features like `ghcr.io/dhoeric/features/google-cloud-cli` - Uses deprecated `apt-key`, fails on newer Debian +❌ Features like `workbench-tools` - Expect specific system packages +✅ Install what you need directly in the Dockerfile + +### DON'T use postCreateCommand/postStartCommand +❌ `./startupscript/post-startup.sh` - Expects specific user/home structure, may fail +✅ Self-contained Dockerfile with everything built in + +### DON'T use supervisor for multiple processes (unless truly needed) +❌ Supervisor + Jupyter + Flask - Complex, many failure points +✅ Single process serving everything (Flask can serve static files) + +### DON'T fight with Jupyter config +❌ Overriding CMD on workbench-jupyter image - Causes `root_dir`/`file_to_run` conflicts +✅ Don't use Jupyter at all if you don't need it + +--- + +## Flask App: Serve Static Files Directly + +If your app has a Flask backend + static HTML, just have Flask serve everything: + +```python +import os +from flask import Flask +from flask_cors import CORS + +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +app = Flask(__name__, static_folder=SCRIPT_DIR, static_url_path='/static') +CORS(app) + +@app.route('/') +def 
serve_index(): + return app.send_static_file('index.html') + +# ... your other routes ... + +if __name__ == '__main__': + app.run(host='0.0.0.0', port=8080, debug=False, threaded=True) +``` + +**No separate HTTP server needed. No supervisor. One process.** + +--- + +## Common Errors and Fixes + +### Error: `apt-key: command not found` +**Cause:** Devcontainer feature uses deprecated apt-key on newer Debian +**Fix:** Remove the feature from .devcontainer.json, install directly in Dockerfile if needed + +### Error: `root_dir and file_to_run are incompatible` +**Cause:** Overriding CMD on workbench-jupyter base image conflicts with its config +**Fix:** Don't use workbench-jupyter. Use python:3.11-slim instead + +### Error: `supports_credentials in conjunction with origin '*'` +**Cause:** Flask-CORS config conflict +**Fix:** Just use `CORS(app)` with no options + +### Error: Container restart loop +**Cause:** Main process exits immediately +**Fix:** Make sure your CMD runs a long-lived process (Flask server, not a script that exits) + +### Error: `Application-server port is empty` +**Cause:** Container not exposing port correctly, or app crashing before binding +**Fix:** Check `docker logs application-server` to see the actual error + +--- + +## Deployment + +### Deploy to Workbench +In Workbench UI, create custom app with: +- **Repository:** `git@github.com:YOUR-ORG/YOUR-REPO.git` +- **Branch:** `your-branch` +- **Folder:** `src/YOUR-APP-NAME` + +### For faster deploys (optional): Push to GAR +```bash +# Build +cd src/YOUR-APP-NAME +docker compose build + +# Tag +export TAG="us-central1-docker.pkg.dev/PROJECT/REPO/NAME:$(date +'%Y%m%d')" +docker tag YOUR-APP-NAME-app:latest ${TAG} + +# Push +docker push ${TAG} + +# Update docker-compose.yaml to use image: instead of build: +``` + +--- + +## ⚠️ Workbench App URLs (CRITICAL) + +**When accessing your app or generating URLs for users, you MUST use this format:** + +``` 
+https://workbench.verily.com/app/[APP_UUID]/proxy/[PORT]/[PATH] +``` + +### Correct Examples +``` +https://workbench.verily.com/app/abc123-def456/proxy/8080/ +https://workbench.verily.com/app/abc123-def456/proxy/8501/dashboard +``` + +### ❌ WRONG Formats (Will fail with "Bad Request") +``` +https://abc123-def456.workbench-app.verily.com/ ← WRONG +http://localhost:8080/ ← WRONG +``` + +**Always use the proxy URL format. Never use localhost or custom domain patterns.** + +--- + +## Local Testing + +```bash +# Create required network +docker network create app-network + +# Build and run +cd src/YOUR-APP-NAME +docker compose build +docker compose up + +# Access at http://localhost:8080 +``` + +--- + +## Debugging on VM + +```bash +# SSH to VM, then: +docker logs application-server --tail 100 +docker exec -it application-server /bin/sh +docker ps -a +``` + +--- + +## Reference Implementations + +All examples are in the public repo: https://github.com/verily-src/workbench-app-devcontainers + +| App | Description | Complexity | +|-----|-------------|------------| +| `src/playground/` | Multi-service app with Caddy | Simple | +| `src/vscode/` | VS Code Server on port 8443 | Pre-built image | +| `src/r-analysis/` | RStudio on port 8787 | Pre-built image | +| `src/workbench-jupyter/` | JupyterLab with Workbench tools | Full-featured | + +--- + +## When DO You Need Features? + +Sometimes you genuinely need the full-featured approach: + +| Need | Solution | +|------|----------| +| Workbench CLI (`wb`) | Use `workbench-tools` feature | +| LLM/MCP integration | Use `wb-mcp-server` feature | +| Pre-authenticated gcloud | Use `workbench-tools` feature | +| Jupyter notebooks | Use `workbench-jupyter` base image | + +**If you need these, accept the complexity.** But test thoroughly. + +--- + +## Key Insight + +The old guides suggested using `workbench-jupyter` base image + devcontainer features + startup scripts. This adds complexity that causes failures. 
+ +The **playground pattern** proves you only need: +1. A container named `application-server` +2. On the `app-network` network +3. Serving HTTP on a port + +Everything else is optional convenience that often breaks. + +**When in doubt, simplify.** From 4c7907c0d86d9d6bee749ebff6557c448adc4057 Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Wed, 18 Feb 2026 14:32:52 -0500 Subject: [PATCH 21/86] Add workspace_list_data_collections MCP tool New tool that answers: 'What data collections exist and what resources belong to them?' Implementation: 1. Gets all resources and identifies their sourceWorkspaceId 2. Looks up each source workspace to get the actual data collection name 3. Groups resources by their source data collection 4. Shows resources created directly in this workspace (no source) Returns structured JSON with: - dataCollections: map of collection name -> {sourceWorkspaceId, resources[]} - localResources: resources created in this workspace - summary: statistics This eliminates the need for LLMs to manually piece together resource lineage information. --- features/src/wb-mcp-server/main.go | 141 +++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) diff --git a/features/src/wb-mcp-server/main.go b/features/src/wb-mcp-server/main.go index 82298eadc..d1b6c1c4a 100644 --- a/features/src/wb-mcp-server/main.go +++ b/features/src/wb-mcp-server/main.go @@ -373,6 +373,27 @@ var wbTools = []Tool{ Properties: map[string]interface{}{}, }, }, + { + Name: "workspace_list_data_collections", + Description: `List all data collections in the current workspace and their associated resources. + +Use this when a user asks: +- "What data collections exist in my workspace?" +- "Show me resources grouped by data collection" +- "Which resources came from which data collections?" + +This tool automatically: +1. Gets all resources and identifies their sourceWorkspaceId (where they were cloned from) +2. 
Looks up each source workspace to get the actual data collection name +3. Groups resources by their source data collection +4. Shows resources created directly in this workspace (no source) + +Returns a structured list of data collections with their resources, types, and cloud paths.`, + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{}, + }, + }, { Name: "group_create", @@ -2404,6 +2425,126 @@ func handleCallTool(params CallToolParams) CallToolResult { case "folder_list_tree": output, err = executeWbCommand([]string{"folder", "tree"}) + case "workspace_list_data_collections": + // Get all resources with their metadata (includes sourceWorkspaceId) + resourcesOutput, resourcesErr := executeWbCommand([]string{"resource", "list", "--format=json"}) + if resourcesErr != nil { + err = fmt.Errorf("failed to list resources: %w", resourcesErr) + break + } + + // Parse resources + var resources []map[string]interface{} + if jsonErr := json.Unmarshal([]byte(resourcesOutput), &resources); jsonErr != nil { + // Try parsing as object with "resources" key + var resourceResponse map[string]interface{} + if err2 := json.Unmarshal([]byte(resourcesOutput), &resourceResponse); err2 == nil { + if r, ok := resourceResponse["resources"].([]interface{}); ok { + for _, item := range r { + if m, ok := item.(map[string]interface{}); ok { + resources = append(resources, m) + } + } + } + } + } + + // Collect unique sourceWorkspaceIds + sourceWorkspaceIds := make(map[string]bool) + for _, resource := range resources { + if sourceId, ok := resource["sourceWorkspaceId"].(string); ok && sourceId != "" { + sourceWorkspaceIds[sourceId] = true + } + } + + // Look up each source workspace to get the data collection name + dataCollectionNames := make(map[string]string) // sourceWorkspaceId -> display name + for sourceId := range sourceWorkspaceIds { + wsOutput, wsErr := executeWbCommand([]string{"workspace", "describe", "--workspace=" + sourceId, "--format=json"}) + if 
wsErr == nil { + var wsInfo map[string]interface{} + if json.Unmarshal([]byte(wsOutput), &wsInfo) == nil { + // Try to get display name, fall back to id + if displayName, ok := wsInfo["displayName"].(string); ok && displayName != "" { + dataCollectionNames[sourceId] = displayName + } else if name, ok := wsInfo["name"].(string); ok && name != "" { + dataCollectionNames[sourceId] = name + } else if id, ok := wsInfo["id"].(string); ok { + dataCollectionNames[sourceId] = id + } else { + dataCollectionNames[sourceId] = sourceId + } + } else { + dataCollectionNames[sourceId] = sourceId + } + } else { + // If we can't access the source workspace, use the ID + dataCollectionNames[sourceId] = sourceId + " (inaccessible)" + } + } + + // Group resources by data collection (sourceWorkspaceId) + dataCollections := make(map[string]map[string]interface{}) + localResources := []map[string]interface{}{} + + for _, resource := range resources { + resourceInfo := map[string]interface{}{ + "name": resource["name"], + "type": resource["resourceType"], + } + + // Add cloud path if available + if metadata, ok := resource["metadata"].(map[string]interface{}); ok { + if bucket, ok := metadata["bucketName"].(string); ok { + resourceInfo["path"] = "gs://" + bucket + } else if dataset, ok := metadata["datasetId"].(string); ok { + if project, ok := metadata["projectId"].(string); ok { + resourceInfo["path"] = project + ":" + dataset + } + } else if gcsBucket, ok := metadata["gcsBucketName"].(string); ok { + resourceInfo["path"] = "gs://" + gcsBucket + } + } + + // Check if resource came from a data collection (has sourceWorkspaceId) + if sourceId, ok := resource["sourceWorkspaceId"].(string); ok && sourceId != "" { + collectionName := dataCollectionNames[sourceId] + if dataCollections[collectionName] == nil { + dataCollections[collectionName] = map[string]interface{}{ + "sourceWorkspaceId": sourceId, + "resources": []map[string]interface{}{}, + } + } + resources := 
dataCollections[collectionName]["resources"].([]map[string]interface{}) + dataCollections[collectionName]["resources"] = append(resources, resourceInfo) + } else { + localResources = append(localResources, resourceInfo) + } + } + + // Count resources in collections + resourcesInCollections := 0 + for _, dc := range dataCollections { + if res, ok := dc["resources"].([]map[string]interface{}); ok { + resourcesInCollections += len(res) + } + } + + // Build output + result := map[string]interface{}{ + "dataCollections": dataCollections, + "localResources": localResources, + "summary": map[string]interface{}{ + "totalDataCollections": len(dataCollections), + "totalResources": len(resources), + "resourcesFromCollections": resourcesInCollections, + "resourcesCreatedLocally": len(localResources), + }, + } + + outputBytes, _ := json.MarshalIndent(result, "", " ") + output = string(outputBytes) + case "group_create": groupId := params.Arguments["groupId"].(string) name := params.Arguments["name"].(string) From 86eba95b9518ff5e9081a03fbdd0e6d3e8fc63e0 Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Wed, 18 Feb 2026 15:59:26 -0500 Subject: [PATCH 22/86] Add dashboard serving guidance to CLAUDE.md and update CUSTOM_APP.md CLAUDE.md changes: - Renamed section: 'Workbench URLs, Dashboards & Interactive Content' - Explains why file:// URLs don't work (JS blocked by browser) - Shows how to use Python HTTP server for HTML dashboards - Quick recipe for building interactive visualizations - Common ports table and pro tips CUSTOM_APP.md changes: - Added file:// to list of wrong URL formats - Added reference to CLAUDE.md for dashboard guidance This helps users who ask to 'build a dashboard' or 'visualize data' understand that they need an HTTP server, not direct file access. 
--- features/src/llm-context/generate-context.sh | 104 +++++++++++++++--- features/src/llm-context/skills/CUSTOM_APP.md | 3 + 2 files changed, 89 insertions(+), 18 deletions(-) diff --git a/features/src/llm-context/generate-context.sh b/features/src/llm-context/generate-context.sh index b5a68eca7..a5b4d3612 100755 --- a/features/src/llm-context/generate-context.sh +++ b/features/src/llm-context/generate-context.sh @@ -1003,42 +1003,110 @@ wb app describe # App details --- -## ⚠️ Workbench App URLs (CRITICAL) +## ⚠️ Workbench URLs, Dashboards & Interactive Content (CRITICAL) -**When generating URLs for apps, proxies, or any web content running inside this Workbench app, you MUST use this exact format:** +**Use this section when a user wants to:** +- **Build a dashboard** or data visualization +- **Create interactive charts** (Plotly, D3.js, Bokeh, Chart.js) +- **Run HTML files** with JavaScript +- **Launch web apps** (Flask, Streamlit, Shiny, FastAPI) +- **Display any content** that needs to run in a browser + +### The Correct URL Format + +**All web content MUST be accessed via the Workbench proxy URL:** \`\`\` https://workbench.verily.com/app/[APP_UUID]/proxy/[PORT]/[PATH] \`\`\` -### Correct Examples +### ✅ Correct Examples \`\`\` https://workbench.verily.com/app/abc123-def456/proxy/8080/ https://workbench.verily.com/app/abc123-def456/proxy/8501/index.html -https://workbench.verily.com/app/abc123-def456/proxy/3000/api/data +https://workbench.verily.com/app/abc123-def456/proxy/8000/dashboard.html \`\`\` -### ❌ WRONG Formats (These will fail with "Bad Request") +### ❌ WRONG Formats (These will fail) \`\`\` -https://abc123-def456.workbench-app.verily.com/ ← WRONG -https://workbench-app.verily.com/abc123-def456/ ← WRONG -http://localhost:8080/ ← WRONG (not accessible externally) -https://abc123-def456/workbench.verily.com/ ← WRONG +https://abc123-def456.workbench-app.verily.com/ ← WRONG: Bad Request error +http://localhost:8080/ ← WRONG: Not accessible externally 
+file:///home/jupyter/dashboard.html ← WRONG: JavaScript blocked by browser \`\`\` -### How to Get the Current App UUID +### Why \`file://\` URLs Don't Work for Interactive Content + +**You cannot open HTML files directly using \`file://\` paths.** The browser blocks JavaScript execution for security reasons. This affects: +- HTML dashboards with charts (Plotly, D3.js, Chart.js) +- Interactive visualizations +- Any HTML with \` + + +

Data Dashboard

+
+ + + +''' + +with open('dashboard.html', 'w') as f: + f.write(html_content) + +print("✅ Dashboard created! Now run in terminal:") +print(" python3 -m http.server 8000") +print("") +print("Then access at:") +print(" https://workbench.verily.com/app/[APP_UUID]/proxy/8000/dashboard.html") +\`\`\` -**Always construct URLs using the proxy format above, never localhost or custom domain patterns.** +### Common Ports by Use Case +| Content Type | Suggested Port | Command | +|--------------|---------------|---------| +| HTML dashboards | 8000 | \`python3 -m http.server 8000\` | +| Streamlit apps | 8501 | \`streamlit run app.py\` | +| Flask/FastAPI | 8080 | \`flask run --port 8080\` | +| Shiny apps | 3838 | (configured in app) | + +### Pro Tips +1. **Keep server running** - The HTTP server must stay running in a terminal +2. **Use background mode** - \`python3 -m http.server 8000 &\` to run in background +3. **Check if port is in use** - \`lsof -i :8000\` before starting +4. **Kill existing server** - \`kill \$(lsof -t -i :8000)\` if port is occupied +5. **Get App UUID** - Check your browser URL or run \`echo \$WORKBENCH_APP_ID\` --- diff --git a/features/src/llm-context/skills/CUSTOM_APP.md b/features/src/llm-context/skills/CUSTOM_APP.md index 5906dd4ec..94648e764 100644 --- a/features/src/llm-context/skills/CUSTOM_APP.md +++ b/features/src/llm-context/skills/CUSTOM_APP.md @@ -216,10 +216,13 @@ https://workbench.verily.com/app/abc123-def456/proxy/8501/dashboard ``` https://abc123-def456.workbench-app.verily.com/ ← WRONG http://localhost:8080/ ← WRONG +file:///home/jupyter/dashboard.html ← WRONG (JS blocked) ``` **Always use the proxy URL format. Never use localhost or custom domain patterns.** +> **📊 Building dashboards or HTML visualizations?** See the "Workbench URLs, Dashboards & Interactive Content" section in `~/CLAUDE.md` for how to serve HTML files with JavaScript (requires HTTP server). 
+ --- ## Local Testing From 60d84a6181bf621816ad8bc70f30ba933eaf9216 Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Thu, 19 Feb 2026 11:54:03 -0500 Subject: [PATCH 23/86] Fix MCP tool references and add workspace_list_data_collections two-step API flow - Updated CLAUDE.md template to use correct MCP tool names: - get_resource -> workspace_list_data_collections - list_resources -> workspace_list_resources - query_bigquery -> bq_execute - run_workflow -> workflow_job_run - Enhanced workspace_list_data_collections to use two-step API approach: 1. List all resources to get IDs 2. Get detailed info for each resource (includes resourceLineage) 3. Look up source workspace names for data collection grouping - URL construction matches existing MCP tools pattern --- features/src/llm-context/generate-context.sh | 20 ++- features/src/wb-mcp-server/main.go | 180 ++++++++++++++----- 2 files changed, 147 insertions(+), 53 deletions(-) diff --git a/features/src/llm-context/generate-context.sh b/features/src/llm-context/generate-context.sh index a5b4d3612..f249742aa 100755 --- a/features/src/llm-context/generate-context.sh +++ b/features/src/llm-context/generate-context.sh @@ -687,12 +687,13 @@ Data collections are curated datasets in the Workbench catalog. When added to a Use the **MCP server** to find which data collection a resource came from: -1. **Use the MCP \`get_resource\` tool** to get full resource metadata including lineage -2. The \`resourceLineage\` array contains: +1. **Use the MCP \`workspace_list_data_collections\` tool** to get resources grouped by data collection +2. Or use \`workspace_list_resources\` with workspaceId to get full resource metadata +3. 
The \`resourceLineage\` object contains: - \`sourceWorkspaceId\`: UUID of the data collection - \`sourceResourceId\`: UUID of the original resource -**Example:** Ask "Use get_resource to get lineage for resource 'clinical-bq-dataset'" +**Example:** Ask "Use workspace_list_data_collections to show me which data collections my resources came from" The response includes: \`\`\`json @@ -927,11 +928,11 @@ This app has **two interfaces** to Workbench functionality: ### Example: Same Operation, Two Ways **List resources:** -- MCP: Use \`list_resources\` tool → returns JSON array +- MCP: Use \`workspace_list_resources\` tool → returns JSON array - CLI: Run \`wb resource list --format=json\` → parse stdout **Query BigQuery:** -- MCP: Use \`query_bigquery\` tool with SQL parameter → returns results +- MCP: Use \`bq_execute\` tool with query parameter → returns results - CLI: Run \`bq query --use_legacy_sql=false 'SELECT ...'\` → parse output --- @@ -942,10 +943,11 @@ The Workbench MCP server exposes these tools for programmatic LLM access: | MCP Tool | CLI Equivalent | Description | |----------|----------------|-------------| -| \`list_resources\` | \`wb resource list\` | List all resources in the workspace | -| \`get_resource\` | \`wb resource describe \` | Get details about a specific resource | -| \`query_bigquery\` | \`bq query\` | Run SQL queries against BigQuery | -| \`run_workflow\` | \`wb workflow run\` | Submit a WDL/Nextflow workflow | +| \`workspace_list_resources\` | \`wb resource list\` | List all resources in the workspace | +| \`workspace_list_data_collections\` | N/A | List data collections and their resources | +| \`resource_list_tree\` | \`wb resource list-tree\` | List resources organized by folder | +| \`bq_execute\` | \`bq query\` | Run SQL queries against BigQuery | +| \`workflow_job_run\` | \`wb workflow run\` | Submit a WDL/Nextflow workflow | | \`get_workflow_status\` | \`wb workflow describe\` | Check status of a workflow run | | \`build_cohort\` | 
*(UI only)* | Create a cohort using Data Explorer | | \`export_cohort\` | *(UI only)* | Export cohort data to a bucket | diff --git a/features/src/wb-mcp-server/main.go b/features/src/wb-mcp-server/main.go index d1b6c1c4a..b26a57978 100644 --- a/features/src/wb-mcp-server/main.go +++ b/features/src/wb-mcp-server/main.go @@ -2426,51 +2426,126 @@ func handleCallTool(params CallToolParams) CallToolResult { output, err = executeWbCommand([]string{"folder", "tree"}) case "workspace_list_data_collections": - // Get all resources with their metadata (includes sourceWorkspaceId) - resourcesOutput, resourcesErr := executeWbCommand([]string{"resource", "list", "--format=json"}) - if resourcesErr != nil { - err = fmt.Errorf("failed to list resources: %w", resourcesErr) + // Get current workspace UUID first + statusOutput, statusErr := executeWbCommand([]string{"status", "--format=json"}) + if statusErr != nil { + err = fmt.Errorf("failed to get workspace status: %w", statusErr) + break + } + var statusData map[string]interface{} + if jsonErr := json.Unmarshal([]byte(statusOutput), &statusData); jsonErr != nil { + err = fmt.Errorf("failed to parse status: %w", jsonErr) + break + } + workspace, ok := statusData["workspace"].(map[string]interface{}) + if !ok { + err = fmt.Errorf("no workspace set - run 'wb workspace set ' first") + break + } + workspaceUuid, ok := workspace["id"].(string) + if !ok { + err = fmt.Errorf("could not get workspace UUID") break } - // Parse resources - var resources []map[string]interface{} - if jsonErr := json.Unmarshal([]byte(resourcesOutput), &resources); jsonErr != nil { - // Try parsing as object with "resources" key - var resourceResponse map[string]interface{} - if err2 := json.Unmarshal([]byte(resourcesOutput), &resourceResponse); err2 == nil { - if r, ok := resourceResponse["resources"].([]interface{}); ok { - for _, item := range r { - if m, ok := item.(map[string]interface{}); ok { - resources = append(resources, m) - } + // Step 1: List 
all resources to get resource IDs + resourcesUrl := fmt.Sprintf("%s/api/workspaces/v1/%s/resources?offset=0&limit=1000", workspaceBaseURL, workspaceUuid) + resourcesResp, apiErr := makeAPIRequest("GET", resourcesUrl, nil) + if apiErr != nil { + err = fmt.Errorf("failed to list resources via API: %w", apiErr) + break + } + + // Parse resources list + var resourcesData map[string]interface{} + if jsonErr := json.Unmarshal(resourcesResp, &resourcesData); jsonErr != nil { + err = fmt.Errorf("failed to parse resources: %w", jsonErr) + break + } + resourcesList, ok := resourcesData["resources"].([]interface{}) + if !ok { + resourcesList = []interface{}{} + } + + // Step 2: For each resource, get detailed info with lineage + // Build a map of resource ID -> detailed resource info (with lineage) + detailedResources := []map[string]interface{}{} + sourceWorkspaceIds := make(map[string]bool) + + for _, r := range resourcesList { + resource, ok := r.(map[string]interface{}) + if !ok { + continue + } + + // Get resource metadata to find the resource ID + metadata, ok := resource["metadata"].(map[string]interface{}) + if !ok { + continue + } + resourceId, ok := metadata["resourceId"].(string) + if !ok { + // Try alternative field names + if rid, ok := metadata["id"].(string); ok { + resourceId = rid + } else if rid, ok := resource["id"].(string); ok { + resourceId = rid + } else { + continue + } + } + + // Get detailed resource info (includes lineage) + detailUrl := fmt.Sprintf("%s/api/workspaces/v1/%s/resources/%s", workspaceBaseURL, workspaceUuid, resourceId) + detailResp, detailErr := makeAPIRequest("GET", detailUrl, nil) + if detailErr != nil { + // If we can't get details, use what we have from the list + detailedResources = append(detailedResources, resource) + continue + } + + var detailedResource map[string]interface{} + if json.Unmarshal(detailResp, &detailedResource) == nil { + detailedResources = append(detailedResources, detailedResource) + + // Extract 
sourceWorkspaceId from resourceLineage + if lineage, ok := detailedResource["resourceLineage"].(map[string]interface{}); ok { + if sourceId, ok := lineage["sourceWorkspaceId"].(string); ok && sourceId != "" { + sourceWorkspaceIds[sourceId] = true } } + } else { + detailedResources = append(detailedResources, resource) } } - // Collect unique sourceWorkspaceIds - sourceWorkspaceIds := make(map[string]bool) - for _, resource := range resources { - if sourceId, ok := resource["sourceWorkspaceId"].(string); ok && sourceId != "" { - sourceWorkspaceIds[sourceId] = true + // Also check the original list for any lineage info we might have missed + for _, r := range resourcesList { + resource, ok := r.(map[string]interface{}) + if !ok { + continue + } + if lineage, ok := resource["resourceLineage"].(map[string]interface{}); ok { + if sourceId, ok := lineage["sourceWorkspaceId"].(string); ok && sourceId != "" { + sourceWorkspaceIds[sourceId] = true + } } } // Look up each source workspace to get the data collection name dataCollectionNames := make(map[string]string) // sourceWorkspaceId -> display name for sourceId := range sourceWorkspaceIds { - wsOutput, wsErr := executeWbCommand([]string{"workspace", "describe", "--workspace=" + sourceId, "--format=json"}) + // Use API to get workspace details + wsUrl := fmt.Sprintf("%s/api/workspaces/v1/%s", workspaceBaseURL, sourceId) + wsResp, wsErr := makeAPIRequest("GET", wsUrl, nil) if wsErr == nil { var wsInfo map[string]interface{} - if json.Unmarshal([]byte(wsOutput), &wsInfo) == nil { + if json.Unmarshal(wsResp, &wsInfo) == nil { // Try to get display name, fall back to id if displayName, ok := wsInfo["displayName"].(string); ok && displayName != "" { dataCollectionNames[sourceId] = displayName - } else if name, ok := wsInfo["name"].(string); ok && name != "" { - dataCollectionNames[sourceId] = name - } else if id, ok := wsInfo["id"].(string); ok { - dataCollectionNames[sourceId] = id + } else if userFacingId, ok := 
wsInfo["userFacingId"].(string); ok && userFacingId != "" { + dataCollectionNames[sourceId] = userFacingId } else { dataCollectionNames[sourceId] = sourceId } @@ -2483,31 +2558,48 @@ func handleCallTool(params CallToolParams) CallToolResult { } } - // Group resources by data collection (sourceWorkspaceId) + // Group resources by data collection (using resourceLineage.sourceWorkspaceId) dataCollections := make(map[string]map[string]interface{}) localResources := []map[string]interface{}{} - for _, resource := range resources { - resourceInfo := map[string]interface{}{ - "name": resource["name"], - "type": resource["resourceType"], - } + for _, resource := range detailedResources { + // Get metadata from the resource + metadata, _ := resource["metadata"].(map[string]interface{}) - // Add cloud path if available - if metadata, ok := resource["metadata"].(map[string]interface{}); ok { + resourceInfo := map[string]interface{}{} + + // Extract name and type from metadata + if metadata != nil { + if name, ok := metadata["name"].(string); ok { + resourceInfo["name"] = name + } + if resType, ok := metadata["resourceType"].(string); ok { + resourceInfo["type"] = resType + } + // GCS bucket path if bucket, ok := metadata["bucketName"].(string); ok { resourceInfo["path"] = "gs://" + bucket - } else if dataset, ok := metadata["datasetId"].(string); ok { + } else if gcsBucket, ok := metadata["gcsBucketName"].(string); ok { + resourceInfo["path"] = "gs://" + gcsBucket + } + // BigQuery dataset path + if dataset, ok := metadata["datasetId"].(string); ok { if project, ok := metadata["projectId"].(string); ok { resourceInfo["path"] = project + ":" + dataset } - } else if gcsBucket, ok := metadata["gcsBucketName"].(string); ok { - resourceInfo["path"] = "gs://" + gcsBucket } } - // Check if resource came from a data collection (has sourceWorkspaceId) - if sourceId, ok := resource["sourceWorkspaceId"].(string); ok && sourceId != "" { + // Check resourceLineage for source workspace ID 
+ var sourceId string + if lineage, ok := resource["resourceLineage"].(map[string]interface{}); ok { + if sid, ok := lineage["sourceWorkspaceId"].(string); ok && sid != "" { + sourceId = sid + } + } + + // Group by data collection or mark as local + if sourceId != "" { collectionName := dataCollectionNames[sourceId] if dataCollections[collectionName] == nil { dataCollections[collectionName] = map[string]interface{}{ @@ -2515,8 +2607,8 @@ func handleCallTool(params CallToolParams) CallToolResult { "resources": []map[string]interface{}{}, } } - resources := dataCollections[collectionName]["resources"].([]map[string]interface{}) - dataCollections[collectionName]["resources"] = append(resources, resourceInfo) + resList := dataCollections[collectionName]["resources"].([]map[string]interface{}) + dataCollections[collectionName]["resources"] = append(resList, resourceInfo) } else { localResources = append(localResources, resourceInfo) } @@ -2535,10 +2627,10 @@ func handleCallTool(params CallToolParams) CallToolResult { "dataCollections": dataCollections, "localResources": localResources, "summary": map[string]interface{}{ - "totalDataCollections": len(dataCollections), - "totalResources": len(resources), + "totalDataCollections": len(dataCollections), + "totalResources": len(detailedResources), "resourcesFromCollections": resourcesInCollections, - "resourcesCreatedLocally": len(localResources), + "resourcesCreatedLocally": len(localResources), }, } From 2dad97cf2c07f9796caa53ec7ddd98c753f52d7c Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Thu, 19 Feb 2026 13:38:12 -0500 Subject: [PATCH 24/86] Fix workspace_list_data_collections to use resolveWorkspaceId The wb status output may return userFacingId instead of UUID. Now uses resolveWorkspaceId() like other working tools to properly convert the workspace ID to UUID before making API calls. 
--- features/src/wb-mcp-server/main.go | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/features/src/wb-mcp-server/main.go b/features/src/wb-mcp-server/main.go index b26a57978..30f49c11d 100644 --- a/features/src/wb-mcp-server/main.go +++ b/features/src/wb-mcp-server/main.go @@ -2426,7 +2426,7 @@ func handleCallTool(params CallToolParams) CallToolResult { output, err = executeWbCommand([]string{"folder", "tree"}) case "workspace_list_data_collections": - // Get current workspace UUID first + // Get current workspace from wb status statusOutput, statusErr := executeWbCommand([]string{"status", "--format=json"}) if statusErr != nil { err = fmt.Errorf("failed to get workspace status: %w", statusErr) @@ -2442,9 +2442,20 @@ func handleCallTool(params CallToolParams) CallToolResult { err = fmt.Errorf("no workspace set - run 'wb workspace set ' first") break } - workspaceUuid, ok := workspace["id"].(string) - if !ok { - err = fmt.Errorf("could not get workspace UUID") + // Get either userFacingId or id from the workspace status + workspaceId := "" + if ufid, ok := workspace["userFacingId"].(string); ok && ufid != "" { + workspaceId = ufid + } else if id, ok := workspace["id"].(string); ok { + workspaceId = id + } else { + err = fmt.Errorf("could not get workspace ID from status") + break + } + // Resolve to UUID using the same method as other working tools + workspaceUuid, resolveErr := resolveWorkspaceId(workspaceId) + if resolveErr != nil { + err = fmt.Errorf("could not resolve workspace ID: %w", resolveErr) break } From f106dc8f89fd649a7428ba9856685c2f1c579f9b Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Thu, 19 Feb 2026 13:49:07 -0500 Subject: [PATCH 25/86] Fix workspace_list_data_collections: resourceLineage is an array inside metadata Key fixes: - resourceLineage is an ARRAY, not an object - resourceLineage is inside metadata, not at the top level - Simplified: removed two-step approach since list endpoint includes 
lineage - Now matches the working workspace_list_resources pattern --- features/src/wb-mcp-server/main.go | 114 ++++++++++------------------- 1 file changed, 38 insertions(+), 76 deletions(-) diff --git a/features/src/wb-mcp-server/main.go b/features/src/wb-mcp-server/main.go index 30f49c11d..3b0c0fc53 100644 --- a/features/src/wb-mcp-server/main.go +++ b/features/src/wb-mcp-server/main.go @@ -2459,7 +2459,7 @@ func handleCallTool(params CallToolParams) CallToolResult { break } - // Step 1: List all resources to get resource IDs + // List all resources (same API call as workspace_list_resources which works) resourcesUrl := fmt.Sprintf("%s/api/workspaces/v1/%s/resources?offset=0&limit=1000", workspaceBaseURL, workspaceUuid) resourcesResp, apiErr := makeAPIRequest("GET", resourcesUrl, nil) if apiErr != nil { @@ -2478,68 +2478,24 @@ func handleCallTool(params CallToolParams) CallToolResult { resourcesList = []interface{}{} } - // Step 2: For each resource, get detailed info with lineage - // Build a map of resource ID -> detailed resource info (with lineage) - detailedResources := []map[string]interface{}{} + // Extract sourceWorkspaceIds from resourceLineage (which is an ARRAY inside metadata) sourceWorkspaceIds := make(map[string]bool) - for _, r := range resourcesList { resource, ok := r.(map[string]interface{}) if !ok { continue } - - // Get resource metadata to find the resource ID metadata, ok := resource["metadata"].(map[string]interface{}) if !ok { continue } - resourceId, ok := metadata["resourceId"].(string) - if !ok { - // Try alternative field names - if rid, ok := metadata["id"].(string); ok { - resourceId = rid - } else if rid, ok := resource["id"].(string); ok { - resourceId = rid - } else { - continue - } - } - - // Get detailed resource info (includes lineage) - detailUrl := fmt.Sprintf("%s/api/workspaces/v1/%s/resources/%s", workspaceBaseURL, workspaceUuid, resourceId) - detailResp, detailErr := makeAPIRequest("GET", detailUrl, nil) - if detailErr != 
nil { - // If we can't get details, use what we have from the list - detailedResources = append(detailedResources, resource) - continue - } - - var detailedResource map[string]interface{} - if json.Unmarshal(detailResp, &detailedResource) == nil { - detailedResources = append(detailedResources, detailedResource) - - // Extract sourceWorkspaceId from resourceLineage - if lineage, ok := detailedResource["resourceLineage"].(map[string]interface{}); ok { - if sourceId, ok := lineage["sourceWorkspaceId"].(string); ok && sourceId != "" { + // resourceLineage is an array inside metadata + if lineageArray, ok := metadata["resourceLineage"].([]interface{}); ok && len(lineageArray) > 0 { + if firstLineage, ok := lineageArray[0].(map[string]interface{}); ok { + if sourceId, ok := firstLineage["sourceWorkspaceId"].(string); ok && sourceId != "" { sourceWorkspaceIds[sourceId] = true } } - } else { - detailedResources = append(detailedResources, resource) - } - } - - // Also check the original list for any lineage info we might have missed - for _, r := range resourcesList { - resource, ok := r.(map[string]interface{}) - if !ok { - continue - } - if lineage, ok := resource["resourceLineage"].(map[string]interface{}); ok { - if sourceId, ok := lineage["sourceWorkspaceId"].(string); ok && sourceId != "" { - sourceWorkspaceIds[sourceId] = true - } } } @@ -2569,43 +2525,49 @@ func handleCallTool(params CallToolParams) CallToolResult { } } - // Group resources by data collection (using resourceLineage.sourceWorkspaceId) + // Group resources by data collection (using resourceLineage array inside metadata) dataCollections := make(map[string]map[string]interface{}) localResources := []map[string]interface{}{} - for _, resource := range detailedResources { - // Get metadata from the resource + for _, r := range resourcesList { + resource, ok := r.(map[string]interface{}) + if !ok { + continue + } metadata, _ := resource["metadata"].(map[string]interface{}) + if metadata == nil { + 
continue + } resourceInfo := map[string]interface{}{} // Extract name and type from metadata - if metadata != nil { - if name, ok := metadata["name"].(string); ok { - resourceInfo["name"] = name - } - if resType, ok := metadata["resourceType"].(string); ok { - resourceInfo["type"] = resType - } - // GCS bucket path - if bucket, ok := metadata["bucketName"].(string); ok { - resourceInfo["path"] = "gs://" + bucket - } else if gcsBucket, ok := metadata["gcsBucketName"].(string); ok { - resourceInfo["path"] = "gs://" + gcsBucket - } - // BigQuery dataset path - if dataset, ok := metadata["datasetId"].(string); ok { - if project, ok := metadata["projectId"].(string); ok { - resourceInfo["path"] = project + ":" + dataset - } + if name, ok := metadata["name"].(string); ok { + resourceInfo["name"] = name + } + if resType, ok := metadata["resourceType"].(string); ok { + resourceInfo["type"] = resType + } + // GCS bucket path + if bucket, ok := metadata["bucketName"].(string); ok { + resourceInfo["path"] = "gs://" + bucket + } else if gcsBucket, ok := metadata["gcsBucketName"].(string); ok { + resourceInfo["path"] = "gs://" + gcsBucket + } + // BigQuery dataset path + if dataset, ok := metadata["datasetId"].(string); ok { + if project, ok := metadata["projectId"].(string); ok { + resourceInfo["path"] = project + ":" + dataset } } - // Check resourceLineage for source workspace ID + // Check resourceLineage array for source workspace ID var sourceId string - if lineage, ok := resource["resourceLineage"].(map[string]interface{}); ok { - if sid, ok := lineage["sourceWorkspaceId"].(string); ok && sid != "" { - sourceId = sid + if lineageArray, ok := metadata["resourceLineage"].([]interface{}); ok && len(lineageArray) > 0 { + if firstLineage, ok := lineageArray[0].(map[string]interface{}); ok { + if sid, ok := firstLineage["sourceWorkspaceId"].(string); ok && sid != "" { + sourceId = sid + } } } @@ -2639,7 +2601,7 @@ func handleCallTool(params CallToolParams) CallToolResult { 
"localResources": localResources, "summary": map[string]interface{}{ "totalDataCollections": len(dataCollections), - "totalResources": len(detailedResources), + "totalResources": len(resourcesList), "resourcesFromCollections": resourcesInCollections, "resourcesCreatedLocally": len(localResources), }, From 35f851e2a1fe87b4034fd5421f2e5ffe921e7393 Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Thu, 19 Feb 2026 15:10:00 -0500 Subject: [PATCH 26/86] Add explicit instructions to automatically get app UUID for URLs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Added 'How to Get the App UUID' section with command to get running app - Added ⚡ LLM INSTRUCTION to never ask user for UUID, always get it automatically - Updated Python dashboard example to show automatic UUID retrieval - Removed duplicate Pro Tip about UUID --- features/src/llm-context/generate-context.sh | 38 ++++++++++++++++++-- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/features/src/llm-context/generate-context.sh b/features/src/llm-context/generate-context.sh index f249742aa..f677b122e 100755 --- a/features/src/llm-context/generate-context.sh +++ b/features/src/llm-context/generate-context.sh @@ -1022,6 +1022,29 @@ wb app describe # App details https://workbench.verily.com/app/[APP_UUID]/proxy/[PORT]/[PATH] \`\`\` +### ⚠️ How to Get the App UUID (IMPORTANT) + +**You MUST automatically get the app UUID - NEVER ask the user for it.** + +**Option 1 (Preferred):** Run this command and use the output: +\`\`\`bash +wb app list --format=json | jq -r '.[] | select(.status == "RUNNING") | .id' | head -1 +\`\`\` + +**Option 2:** Check if environment variable is set: +\`\`\`bash +echo \$WORKBENCH_APP_ID +\`\`\` + +**Option 3:** Parse from browser URL (if user provides it): +The URL \`https://workbench.verily.com/app/abc123-def456/lab\` contains UUID \`abc123-def456\` + +**⚡ LLM INSTRUCTION:** When constructing dashboard/proxy URLs: +1. 
First run Option 1 command to get the running app UUID +2. Use that actual UUID in the URL you provide +3. Do NOT use placeholders like [APP_UUID] in your final response +4. Do NOT ask the user to find/replace the UUID themselves + ### ✅ Correct Examples \`\`\` https://workbench.verily.com/app/abc123-def456/proxy/8080/ @@ -1088,11 +1111,21 @@ html_content = ''' with open('dashboard.html', 'w') as f: f.write(html_content) -print("✅ Dashboard created! Now run in terminal:") +print("✅ Dashboard created!") + +# Get the app UUID automatically +import subprocess +result = subprocess.run( + ["bash", "-c", "wb app list --format=json | jq -r '.[] | select(.status == \"RUNNING\") | .id' | head -1"], + capture_output=True, text=True +) +app_uuid = result.stdout.strip() + +print("\\nNow run in terminal:") print(" python3 -m http.server 8000") print("") print("Then access at:") -print(" https://workbench.verily.com/app/[APP_UUID]/proxy/8000/dashboard.html") +print(f" https://workbench.verily.com/app/{app_uuid}/proxy/8000/dashboard.html") \`\`\` ### Common Ports by Use Case @@ -1108,7 +1141,6 @@ print(" https://workbench.verily.com/app/[APP_UUID]/proxy/8000/dashboard.html" 2. **Use background mode** - \`python3 -m http.server 8000 &\` to run in background 3. **Check if port is in use** - \`lsof -i :8000\` before starting 4. **Kill existing server** - \`kill \$(lsof -t -i :8000)\` if port is occupied -5. **Get App UUID** - Check your browser URL or run \`echo \$WORKBENCH_APP_ID\` --- From d8cb23b7a2e1f9ce2be464eef7a382ed0d5919a5 Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Thu, 19 Feb 2026 15:21:19 -0500 Subject: [PATCH 27/86] Fix templates: remove broken Workbench-specific dependencies Templates now use minimal devcontainer config without: - postCreateCommand referencing non-existent startupscript/ - Features referencing non-existent .devcontainer/features/ This makes templates truly standalone and deployable from any repo. 
Fixed templates: - streamlit-dashboard - rshiny-dashboard - file-processor (created .devcontainer.json) - flask-api (already minimal) --- .../file-processor/.devcontainer.json | 26 +- .../templates/flask-api/.devcontainer.json | 28 +- .../templates/flask-api/docker-compose.yaml | 14 +- .../rshiny-dashboard/.devcontainer.json | 26 +- .../streamlit-dashboard/.devcontainer.json | 26 +- .../streamlit-dashboard/docker-compose.yaml | 12 +- .../startupscript/post-startup.sh | 250 ++++++++++++++++++ .../startupscript/remount-on-restart.sh | 62 +++++ 8 files changed, 324 insertions(+), 120 deletions(-) create mode 100755 features/src/llm-context/templates/streamlit-dashboard/startupscript/post-startup.sh create mode 100755 features/src/llm-context/templates/streamlit-dashboard/startupscript/remount-on-restart.sh diff --git a/features/src/llm-context/templates/file-processor/.devcontainer.json b/features/src/llm-context/templates/file-processor/.devcontainer.json index 123061e79..c3a7c672b 100644 --- a/features/src/llm-context/templates/file-processor/.devcontainer.json +++ b/features/src/llm-context/templates/file-processor/.devcontainer.json @@ -3,33 +3,11 @@ "dockerComposeFile": "docker-compose.yaml", "service": "app", "shutdownAction": "none", - "workspaceFolder": "/workspace", - "postCreateCommand": [ - "bash", "-c", - "./startupscript/post-startup.sh appuser /home/appuser \"${templateOption:cloud}\" \"${templateOption:login}\"" - ], - "postStartCommand": [ - "bash", "-c", - "./startupscript/remount-on-restart.sh appuser /home/appuser \"${templateOption:cloud}\" \"${templateOption:login}\"" - ], - "features": { - "./.devcontainer/features/workbench-tools": { - "cloud": "${templateOption:cloud}", - "username": "appuser", - "userHomeDir": "/home/appuser" - }, - "./.devcontainer/features/llm-context": { - "username": "appuser", - "userHomeDir": "/home/appuser" - } - }, + "workspaceFolder": "/app", "remoteUser": "root", "customizations": { "workbench": { - 
"proxyTargetPort": 8080, - "opens": { - "extensions": [".py", ".json", ".yaml", ".md", ".csv"] - } + "proxyTargetPort": 8080 } } } diff --git a/features/src/llm-context/templates/flask-api/.devcontainer.json b/features/src/llm-context/templates/flask-api/.devcontainer.json index 484a56c6a..70b53c427 100644 --- a/features/src/llm-context/templates/flask-api/.devcontainer.json +++ b/features/src/llm-context/templates/flask-api/.devcontainer.json @@ -1,35 +1,13 @@ { - "name": "Flask REST API", + "name": "Flask API", "dockerComposeFile": "docker-compose.yaml", "service": "app", "shutdownAction": "none", - "workspaceFolder": "/workspace", - "postCreateCommand": [ - "bash", "-c", - "./startupscript/post-startup.sh appuser /home/appuser \"${templateOption:cloud}\" \"${templateOption:login}\"" - ], - "postStartCommand": [ - "bash", "-c", - "./startupscript/remount-on-restart.sh appuser /home/appuser \"${templateOption:cloud}\" \"${templateOption:login}\"" - ], - "features": { - "./.devcontainer/features/workbench-tools": { - "cloud": "${templateOption:cloud}", - "username": "appuser", - "userHomeDir": "/home/appuser" - }, - "./.devcontainer/features/llm-context": { - "username": "appuser", - "userHomeDir": "/home/appuser" - } - }, + "workspaceFolder": "/app", "remoteUser": "root", "customizations": { "workbench": { - "proxyTargetPort": 8080, - "opens": { - "extensions": [".py", ".json", ".yaml", ".md"] - } + "proxyTargetPort": 8080 } } } diff --git a/features/src/llm-context/templates/flask-api/docker-compose.yaml b/features/src/llm-context/templates/flask-api/docker-compose.yaml index b442a1c87..aedcc8ab2 100644 --- a/features/src/llm-context/templates/flask-api/docker-compose.yaml +++ b/features/src/llm-context/templates/flask-api/docker-compose.yaml @@ -6,24 +6,14 @@ services: dockerfile: Dockerfile restart: always volumes: - - .:/workspace:cached - - app-data:/home/appuser/data + - .:/app:cached ports: - "8080:8080" environment: - FLASK_ENV=production - - 
PYTHONUNBUFFERED=1 + - FLASK_APP=app/main.py networks: - app-network - cap_add: - - SYS_ADMIN - devices: - - /dev/fuse - security_opt: - - apparmor:unconfined - -volumes: - app-data: networks: app-network: diff --git a/features/src/llm-context/templates/rshiny-dashboard/.devcontainer.json b/features/src/llm-context/templates/rshiny-dashboard/.devcontainer.json index 5f5d36caa..790c2c976 100644 --- a/features/src/llm-context/templates/rshiny-dashboard/.devcontainer.json +++ b/features/src/llm-context/templates/rshiny-dashboard/.devcontainer.json @@ -3,33 +3,11 @@ "dockerComposeFile": "docker-compose.yaml", "service": "app", "shutdownAction": "none", - "workspaceFolder": "/workspace", - "postCreateCommand": [ - "bash", "-c", - "./startupscript/post-startup.sh shiny /home/shiny \"${templateOption:cloud}\" \"${templateOption:login}\"" - ], - "postStartCommand": [ - "bash", "-c", - "./startupscript/remount-on-restart.sh shiny /home/shiny \"${templateOption:cloud}\" \"${templateOption:login}\"" - ], - "features": { - "./.devcontainer/features/workbench-tools": { - "cloud": "${templateOption:cloud}", - "username": "shiny", - "userHomeDir": "/home/shiny" - }, - "./.devcontainer/features/llm-context": { - "username": "shiny", - "userHomeDir": "/home/shiny" - } - }, + "workspaceFolder": "/app", "remoteUser": "root", "customizations": { "workbench": { - "proxyTargetPort": 3838, - "opens": { - "extensions": [".R", ".Rmd", ".json", ".yaml", ".md", ".csv"] - } + "proxyTargetPort": 3838 } } } diff --git a/features/src/llm-context/templates/streamlit-dashboard/.devcontainer.json b/features/src/llm-context/templates/streamlit-dashboard/.devcontainer.json index e4bda8a3c..d3b939da0 100644 --- a/features/src/llm-context/templates/streamlit-dashboard/.devcontainer.json +++ b/features/src/llm-context/templates/streamlit-dashboard/.devcontainer.json @@ -3,33 +3,11 @@ "dockerComposeFile": "docker-compose.yaml", "service": "app", "shutdownAction": "none", - "workspaceFolder": 
"/workspace", - "postCreateCommand": [ - "bash", "-c", - "./startupscript/post-startup.sh appuser /home/appuser \"${templateOption:cloud}\" \"${templateOption:login}\"" - ], - "postStartCommand": [ - "bash", "-c", - "./startupscript/remount-on-restart.sh appuser /home/appuser \"${templateOption:cloud}\" \"${templateOption:login}\"" - ], - "features": { - "./.devcontainer/features/workbench-tools": { - "cloud": "${templateOption:cloud}", - "username": "appuser", - "userHomeDir": "/home/appuser" - }, - "./.devcontainer/features/llm-context": { - "username": "appuser", - "userHomeDir": "/home/appuser" - } - }, + "workspaceFolder": "/app", "remoteUser": "root", "customizations": { "workbench": { - "proxyTargetPort": 8501, - "opens": { - "extensions": [".py", ".json", ".yaml", ".md", ".csv"] - } + "proxyTargetPort": 8501 } } } diff --git a/features/src/llm-context/templates/streamlit-dashboard/docker-compose.yaml b/features/src/llm-context/templates/streamlit-dashboard/docker-compose.yaml index 5eb6b6cac..3aa2a9f61 100644 --- a/features/src/llm-context/templates/streamlit-dashboard/docker-compose.yaml +++ b/features/src/llm-context/templates/streamlit-dashboard/docker-compose.yaml @@ -6,8 +6,7 @@ services: dockerfile: Dockerfile restart: always volumes: - - .:/workspace:cached - - app-data:/home/appuser/data + - .:/app:cached ports: - "8501:8501" environment: @@ -16,15 +15,6 @@ services: - STREAMLIT_SERVER_HEADLESS=true networks: - app-network - cap_add: - - SYS_ADMIN - devices: - - /dev/fuse - security_opt: - - apparmor:unconfined - -volumes: - app-data: networks: app-network: diff --git a/features/src/llm-context/templates/streamlit-dashboard/startupscript/post-startup.sh b/features/src/llm-context/templates/streamlit-dashboard/startupscript/post-startup.sh new file mode 100755 index 000000000..9ec9e1b35 --- /dev/null +++ b/features/src/llm-context/templates/streamlit-dashboard/startupscript/post-startup.sh @@ -0,0 +1,250 @@ +#!/bin/bash + +set -o errexit +set -o 
nounset +set -o pipefail +set -o xtrace + +echo "=== POST-STARTUP.SH STARTING ===" +echo "Arguments: $@" + +if [[ $# -ne 4 ]]; then + echo "Usage: $0 user workDirectory " + exit 1 +fi + +readonly USER_NAME="${1}" +export USER_NAME +readonly WORK_DIRECTORY="${2}" +export WORK_DIRECTORY +readonly CLOUD="${3}" +export CLOUD +readonly LOG_IN="${4}" +export LOG_IN + +echo "=== VARIABLES SET: USER=${USER_NAME}, WORK_DIR=${WORK_DIRECTORY}, CLOUD=${CLOUD}, LOGIN=${LOG_IN} ===" + +# Gets absolute path of the script directory. +# Because the script sometimes cd to other directoy (e.g. /tmp), +# absolute path is more reliable. +SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" +readonly SCRIPT_DIR +export SCRIPT_DIR +readonly CLOUD_SCRIPT_DIR="${SCRIPT_DIR}/${CLOUD}" +export CLOUD_SCRIPT_DIR +####################################### +# Emit a message with a timestamp +####################################### +source "${SCRIPT_DIR}/emit.sh" + +source "${CLOUD_SCRIPT_DIR}/vm-metadata.sh" + +readonly RUN_AS_LOGIN_USER="sudo -u ${USER_NAME} bash -l -c" +export RUN_AS_LOGIN_USER + +# Startup script status is propagated out to VM guest attributes +readonly STATUS_ATTRIBUTE="startup_script/status" +export STATUS_ATTRIBUTE +readonly MESSAGE_ATTRIBUTE="startup_script/message" +export MESSAGE_ATTRIBUTE + +USER_PRIMARY_GROUP="$(id --group --name "${USER_NAME}")" +readonly USER_PRIMARY_GROUP +export USER_PRIMARY_GROUP +readonly USER_BASH_COMPLETION_DIR="${WORK_DIRECTORY}/.bash_completion.d" +export USER_BASH_COMPLETION_DIR +readonly USER_HOME_LOCAL_SHARE="${WORK_DIRECTORY}/.local/share" +export USER_HOME_LOCAL_SHARE +readonly USER_WORKBENCH_CONFIG_DIR="${WORK_DIRECTORY}/.workbench" +export USER_WORKBENCH_CONFIG_DIR +readonly USER_WORKBENCH_LEGACY_CONFIG_DIR="${WORK_DIRECTORY}/.terra" +export USER_WORKBENCH_LEGACY_CONFIG_DIR +readonly USER_BASHRC="${WORK_DIRECTORY}/.bashrc" +export USER_BASHRC +readonly USER_BASHENV="${WORK_DIRECTORY}/.bash_env" +export USER_BASHENV +readonly 
USER_BASH_PROFILE="${WORK_DIRECTORY}/.bash_profile" +export USER_BASH_PROFILE +readonly POST_STARTUP_OUTPUT_FILE="${USER_WORKBENCH_CONFIG_DIR}/post-startup-output.txt" +export POST_STARTUP_OUTPUT_FILE + +# Variables for Workbench-specific code installed on the VM +readonly WORKBENCH_INSTALL_PATH="/usr/bin/wb" +export WORKBENCH_INSTALL_PATH +readonly WORKBENCH_LEGACY_PATH="/usr/bin/terra" +export WORKBENCH_LEGACY_PATH + +# Move to the /tmp directory to let any artifacts left behind by this script can be removed. +cd /tmp || exit + +# Send stdout and stderr from this script to a file for debugging. +# Make the .workbench directory as the user so that they own it and have correct linux permissions. +${RUN_AS_LOGIN_USER} "mkdir -p '${USER_WORKBENCH_CONFIG_DIR}'" +${RUN_AS_LOGIN_USER} "ln -sf '${USER_WORKBENCH_CONFIG_DIR}' '${USER_WORKBENCH_LEGACY_CONFIG_DIR}'" +exec > >(tee -a "${POST_STARTUP_OUTPUT_FILE}") # Append output to the file and print to terminal +exec 2> >(tee -a "${POST_STARTUP_OUTPUT_FILE}" >&2) # Append errors to the file and print to terminal + +# The apt package index may not be clean when we run; resynchronize +echo "=== INSTALLING PACKAGES ===" +if type apk > /dev/null 2>&1; then + echo "=== USING APK PACKAGE MANAGER ===" + apk update + apk add --no-cache jq curl fuse tar wget +elif type apt-get > /dev/null 2>&1; then + echo "=== USING APT PACKAGE MANAGER ===" + apt-get update + apt install -y jq curl fuse tar wget +else + >&2 echo "ERROR: Unable to find a supported package manager" + exit 1 +fi +echo "=== PACKAGES INSTALLED SUCCESSFULLY ===" + + +# Create the target directories for installing into the HOME directory +${RUN_AS_LOGIN_USER} "mkdir -p '${USER_BASH_COMPLETION_DIR}'" +${RUN_AS_LOGIN_USER} "mkdir -p '${USER_HOME_LOCAL_SHARE}'" + +####################################### +# Set guest attributes on GCE. Used here to log completion status of the script. 
+# See https://cloud.google.com/compute/docs/metadata/manage-guest-attributes +# Arguments: +# $1: The guest attribute domain and key IE startup_script/status +# $2 The data to write to the guest attribute +####################################### +# If the script exits without error let the UI know it completed successfully +# Otherwise if an error occurred write the line and command that failed to guest attributes. +function exit_handler { + local exit_code="${1}" + local line_no="${2}" + local command="${3}" + # Success! Set the guest attributes and exit cleanly + if [[ "${exit_code}" -eq 0 ]]; then + exit 0 + fi + # Write error status and message to guest attributes + set_metadata "${STATUS_ATTRIBUTE}" "ERROR" + set_metadata "${MESSAGE_ATTRIBUTE}" "There was an error in the VM Startup Script on line ${line_no}, command \"${command}\". Please try recreating the VM. See ${POST_STARTUP_OUTPUT_FILE} for more information." + exit "${exit_code}" +} +readonly -f exit_handler +trap 'exit_handler $? $LINENO $BASH_COMMAND' EXIT + +####################################### +# function to retry command +####################################### +function retry() { + local -r max_attempts="$1" + shift + local -r command=("$@") + + local attempt + for ((attempt = 1; attempt < max_attempts; attempt++)); do + # Run the command and return if success + if "${command[@]}"; then + return + fi + + # Sleep a bit in case the problem is a transient network/server issue + if ((attempt < max_attempts)); then + echo "Retrying ${command[*]} in 5 seconds" # send to get_message + sleep 5 + fi + done + + # Execute without the if/then protection such that the exit code propagates + "${command[@]}" +} +readonly -f retry + +# Custom application behavior when opening a terminal window will vary. +# +# Some application that run in custom environments will by default run +# an interactive non-login shell, which sources the ~/.bashrc. 
+#
+# Others will open a login shell, which sources the ~/.bash_profile.
+#
+# For consistency across as many of these environments as possible, this startup
+# script writes to ~/.bashrc, and has the ~/.bash_profile source the ~/.bashrc
+
+cat << EOF >> "${USER_BASH_PROFILE}"
+
+if [[ -e ~/.bashrc ]]; then
+  source ~/.bashrc
+fi
+
+EOF
+chown "${USER_NAME}:${USER_PRIMARY_GROUP}" "${USER_BASH_PROFILE}"
+
+# Indicate the start of Workbench customizations of the ~/.bashrc
+cat << EOF >> "${USER_BASHRC}"
+### BEGIN: Workbench-specific customizations ###
+
+# Prepend "/usr/bin" (if not already in the path)
+if [[ "\${PATH}:" != "/usr/bin:"* ]]; then
+  export PATH=/usr/bin:\${PATH}
+fi
+
+if [[ -e ~/.bash_env ]]; then
+  source ~/.bash_env
+fi
+
+EOF
+
+##################################################
+# Set up java which is required for workbench CLI
+##################################################
+source "${SCRIPT_DIR}/install-java.sh"
+
+###################################
+# Install workbench CLI
+###################################
+retry 5 "${SCRIPT_DIR}/install-cli.sh"
+
+##################################################
+# Set up user bashrc with workbench customization
+##################################################
+source "${SCRIPT_DIR}/setup-bashrc.sh"
+
+#################
+# bash completion
+#################
+source "${SCRIPT_DIR}/bash-completion.sh"
+
+###############
+# git setup
+###############
+if [[ "${LOG_IN}" == "true" ]]; then
+  retry 5 "${SCRIPT_DIR}/git-setup.sh"
+fi
+
+#############################
+# Mount buckets
+#############################
+
+# Uncomment user_allow_other in fuse.conf so that a non-root user can mount files with the -o allow_other option.
+sed -i '/user_allow_other/s/^#//g' /etc/fuse.conf + +source "${CLOUD_SCRIPT_DIR}/resource-mount.sh" + +############################### +# cloud platform specific setup +############################### +if [[ -f "${CLOUD_SCRIPT_DIR}/post-startup-hook.sh" ]]; then + source "${CLOUD_SCRIPT_DIR}/post-startup-hook.sh" +fi + +############################### +# LLM Context Generation +############################### +# Generate context file for LLMs (Claude Code, Gemini, etc.) +# This runs AFTER auth and resource mounting are complete +if [[ -f "/opt/llm-context/generate-context.sh" ]]; then + echo "=== GENERATING LLM CONTEXT ===" + # Run as the login user so files are owned correctly + ${RUN_AS_LOGIN_USER} "/opt/llm-context/generate-context.sh '${WORK_DIRECTORY}'" || { + echo "Warning: LLM context generation failed (non-fatal)" + true # Don't fail the script if context generation fails + } + echo "=== LLM CONTEXT GENERATION COMPLETE ===" +fi diff --git a/features/src/llm-context/templates/streamlit-dashboard/startupscript/remount-on-restart.sh b/features/src/llm-context/templates/streamlit-dashboard/startupscript/remount-on-restart.sh new file mode 100755 index 000000000..4d5cb8676 --- /dev/null +++ b/features/src/llm-context/templates/streamlit-dashboard/startupscript/remount-on-restart.sh @@ -0,0 +1,62 @@ +#!/bin/bash + +# remount-on-restart.sh +# +# Remounts buckets for the logged in user when a devcontainer instance is restarted. 
+
+set -o errexit
+set -o nounset
+set -o pipefail
+set -o xtrace
+
+if [[ $# -ne 4 ]]; then
+  echo "Usage: $0 user workDirectory cloud logIn(true|false)" >&2
+  exit 1
+fi
+
+readonly WORKBENCH_INSTALL_PATH="${WORKBENCH_INSTALL_PATH:-/usr/bin/wb}"
+
+readonly USER_NAME="${1}"
+readonly WORK_DIRECTORY="${2}"
+readonly CLOUD="${3}"
+# LOG_IN: "true" re-authenticates the CLI below when no user is logged in
+readonly LOG_IN="${4}"
+
+##############################################
+# Get absolute paths of the script directories
+##############################################
+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
+readonly SCRIPT_DIR
+readonly CLOUD_SCRIPT_DIR="${SCRIPT_DIR}/${CLOUD}"
+
+######################################################
+# Change to /tmp to avoid leaving junk on file system.
+######################################################
+cd /tmp
+
+##################################################################
+# Send stdout and stderr from this script to a file for debugging.
+##################################################################
+readonly USER_WORKBENCH_CONFIG_DIR="${WORK_DIRECTORY}/.workbench"
+readonly POST_STARTUP_OUTPUT_FILE="${USER_WORKBENCH_CONFIG_DIR}/remount-on-restart-output.txt"
+exec >> "${POST_STARTUP_OUTPUT_FILE}"
+exec 2>&1
+
+##############################
+# Import utility functions
+##############################
+source "${SCRIPT_DIR}/emit.sh"
+
+#############################
+# CLI login (only when requested and the CLI reports no logged-in user)
+#############################
+readonly RUN_AS_LOGIN_USER="sudo -u ${USER_NAME} bash -l -c"
+if [[ "${LOG_IN}" == "true" ]] && ${RUN_AS_LOGIN_USER} "'${WORKBENCH_INSTALL_PATH}' auth status 2>&1" | grep -q "NO USER LOGGED IN"; then
+  ${RUN_AS_LOGIN_USER} "'${WORKBENCH_INSTALL_PATH}' auth login --mode=APP_DEFAULT_CREDENTIALS"
+fi
+
+#############################
+# Mount buckets
+#############################
+# shellcheck disable=SC2034
+source "${CLOUD_SCRIPT_DIR}/resource-mount.sh"
From 8057d2aa1accb349543635aa73176a4d64929adc Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Thu, 19 Feb 2026
16:28:51 -0500 Subject: [PATCH 28/86] Add DASHBOARD_BUILDER skill for web apps & dashboards New skill includes: - Critical proxy URL rules and common issues - Flask server configuration (0.0.0.0, threaded, debug=False) - Working templates with BigQuery integration - Comprehensive troubleshooting guide - Lessons learned from real debugging sessions Also: - Streamlined CLAUDE.md with prominent skill triggers - Temporarily removed APP_TEMPLATES from active skills (kept for future) - Updated skill selection guide for clarity --- features/src/llm-context/generate-context.sh | 549 ++++++++++---- .../llm-context/skills/DASHBOARD_BUILDER.md | 678 ++++++++++++++++++ 2 files changed, 1082 insertions(+), 145 deletions(-) create mode 100644 features/src/llm-context/skills/DASHBOARD_BUILDER.md diff --git a/features/src/llm-context/generate-context.sh b/features/src/llm-context/generate-context.sh index f677b122e..9bd29fccf 100755 --- a/features/src/llm-context/generate-context.sh +++ b/features/src/llm-context/generate-context.sh @@ -493,6 +493,350 @@ If no template matches: 1. Check if a template can be extended (usually yes) 2. If truly custom, read `~/.workbench/skills/CUSTOM_APP.md` TEMPLATES_SKILL_EOF + + # Create DASHBOARD_BUILDER.md skill (full version, embedded) + log_info "Creating DASHBOARD_BUILDER.md skill..." 
+ cat > "${SKILLS_DIR}/DASHBOARD_BUILDER.md" << 'DASHBOARD_SKILL_EOF' +# Web Apps & Dashboards Skill + +**Build interactive web apps, dashboards, and visualizations that run on a port in Workbench.** + +> **Triggers:** +> - "Create a dashboard", "visualize data", "build charts" +> - "Run a Flask/Streamlit/FastAPI app" +> - "Display data in the browser", "interactive UI" +> - Any web app that serves content on a port + +--- + +## 🌐 Workbench Proxy & Web Apps Best Practices + +### Proxy URL Format + +All web apps in Workbench are accessed via: +\`\`\` +https://workbench.verily.com/app/[APP_UUID]/proxy/[PORT]/[PATH] +\`\`\` + +### ⚠️ How to Get the App UUID (CRITICAL) + +**You MUST automatically get the app UUID - NEVER ask the user for it.** + +\`\`\`bash +# Run this command and use the output: +wb app list --format=json | jq -r '.[] | select(.status == "RUNNING") | .id' | head -1 +\`\`\` + +**⚡ LLM INSTRUCTION:** When constructing dashboard/proxy URLs: +1. First run the command above to get the running app UUID +2. Use that actual UUID in the URL you provide +3. Do NOT use placeholders like \`[APP_UUID]\` in your final response +4. 
Do NOT ask the user to find/replace the UUID themselves + +### ✅ Correct URL Examples +\`\`\` +https://workbench.verily.com/app/abc123-def456-789/proxy/8080/ +https://workbench.verily.com/app/abc123-def456-789/proxy/8501/index.html +https://workbench.verily.com/app/abc123-def456-789/proxy/8000/dashboard.html +\`\`\` + +### ❌ WRONG URL Formats (These WILL fail) +\`\`\` +https://abc123-def456.workbench-app.verily.com/ ← WRONG: "Bad Request" error +https://workbench-app.verily.com/abc123-def456/ ← WRONG: Invalid domain +http://localhost:8080/ ← WRONG: Not accessible externally +https://abc123-def456/workbench.verily.com/ ← WRONG: Reversed format +file:///home/jupyter/dashboard.html ← WRONG: JavaScript blocked +\`\`\` + +### ⚠️ Common Issue: JavaScript API Calls Failing + +**Problem:** JavaScript using absolute paths fails through Workbench proxy + +**Symptoms:** +- Dashboard loads but shows no data +- Charts remain empty with "-" placeholders +- Browser console shows 404 errors for API calls +- Flask/server logs show requests for \`/\` but NOT \`/api/*\` endpoints + +### ✅ Solution: Use Relative Paths (TESTED & CONFIRMED) + +**Always use relative paths (no leading \`/\`) for fetch/AJAX calls:** + +\`\`\`javascript +// ✅ CORRECT - relative paths work through proxy +fetch('api/metadata') +fetch('api/data?filter=value') + +// ❌ WRONG - absolute paths fail +fetch('/api/metadata') +fetch('/api/data?filter=value') +\`\`\` + +### Why Absolute Paths Fail + +\`\`\` +User visits: https://workbench.verily.com/app/UUID/proxy/8080/ + +Absolute path: fetch('/api/data') + → Browser resolves to: https://workbench.verily.com/api/data ❌ (404!) 
+ +Relative path: fetch('api/data') + → Browser resolves to: https://workbench.verily.com/app/UUID/proxy/8080/api/data ✅ +\`\`\` + +### Alternative: Embed Data in HTML (For Static Dashboards) + +If you don't need dynamic filtering, embed data directly in the template: + +**Python (Flask):** +\`\`\`python +@app.route('/') +def index(): + data = get_data_from_bigquery() + return render_template('dashboard.html', data_json=json.dumps(data)) +\`\`\` + +**HTML Template:** +\`\`\`html + +\`\`\` + +**When to use:** Static dashboards, large datasets that don't change, or when filters can be client-side only. + +### Testing Checklist + +Before deploying any web app: + +- [ ] **Relative paths** - All \`fetch()\` calls use \`'api/...'\` not \`'/api/...'\` +- [ ] **Test locally** - \`curl http://localhost:PORT/api/endpoint\` returns data +- [ ] **Server logs** - Verify API requests arrive: \`tail -f server.log\` +- [ ] **Browser DevTools** - Network tab shows 200 status for API calls +- [ ] **App UUID obtained** - Not using placeholder \`[APP_UUID]\` + +--- + +## Workflow + +### Step 1: Understand Requirements + +Ask the user: +1. **Data source?** BigQuery table, CSV in bucket, or local file? +2. **Visualizations?** Charts (bar, line, scatter), tables, filters? +3. **Interactivity?** Static display or dynamic filtering? + +### Step 2: Auto-Detect Environment + +**Always run these commands first:** + +\`\`\`bash +# Get app UUID (REQUIRED for final URL) +APP_UUID=\$(wb app list --format=json | jq -r '.[] | select(.status == "RUNNING") | .id' | head -1) +echo "App UUID: \$APP_UUID" + +# Verify Python +python3 --version + +# Check working directory +pwd +\`\`\` + +### Step 3: Install Dependencies + +\`\`\`bash +pip install flask flask-cors pandas plotly google-cloud-bigquery db-dtypes +\`\`\` + +> **Note:** \`db-dtypes\` is required for BigQuery to properly convert data types for pandas. 
+ +### Step 4: Create Dashboard Structure + +\`\`\` +dashboard/ +├── app.py # Flask server +├── templates/ +│ └── index.html # Dashboard HTML +└── static/ + └── style.css # Optional styling +\`\`\` + +--- + +## Working Template: BigQuery Dashboard + +**app.py:** +\`\`\`python +from flask import Flask, render_template, jsonify +from flask_cors import CORS +from google.cloud import bigquery + +app = Flask(__name__) +CORS(app) + +_data_cache = None + +def get_bigquery_data(): + global _data_cache + if _data_cache is not None: + return _data_cache + + client = bigquery.Client() + query = """ + SELECT * + FROM \\\`YOUR_PROJECT.YOUR_DATASET.YOUR_TABLE\\\` + LIMIT 1000 + """ + df = client.query(query).to_dataframe() + _data_cache = df.to_dict(orient='records') + return _data_cache + +@app.route('/') +def index(): + return render_template('index.html') + +@app.route('api/data') # NO leading slash! +def get_data(): + try: + data = get_bigquery_data() + return jsonify(data) + except Exception as e: + return jsonify({"error": str(e)}), 500 + +@app.route('api/metadata') +def get_metadata(): + try: + data = get_bigquery_data() + return jsonify({ + "columns": list(data[0].keys()) if data else [], + "row_count": len(data) + }) + except Exception as e: + return jsonify({"error": str(e)}), 500 + +if __name__ == '__main__': + # CRITICAL: host='0.0.0.0' required for Workbench proxy access + app.run(host='0.0.0.0', port=8080, debug=False, threaded=True) +\`\`\` + +**templates/index.html:** +\`\`\`html + + + + Data Dashboard + + + + +
+

📊 Data Dashboard

+
Loading...
+
Loading...
+
+ + + +\`\`\` + +--- + +## Step 5: Test & Launch + +\`\`\`bash +# Get app UUID +APP_UUID=\$(wb app list --format=json | jq -r '.[] | select(.status == "RUNNING") | .id' | head -1) + +# Start server +cd dashboard +nohup python3 app.py > server.log 2>&1 & + +# Test locally +curl -s http://localhost:8080/api/metadata | jq . + +echo "Dashboard at: https://workbench.verily.com/app/\${APP_UUID}/proxy/8080/" +\`\`\` + +--- + +## ⚠️ Critical Flask Configuration + +\`\`\`python +# ❌ WRONG - proxy cannot reach your app +app.run(host='localhost', port=8080) + +# ✅ CORRECT - accessible through Workbench proxy +app.run(host='0.0.0.0', port=8080, debug=False, threaded=True) +\`\`\` + +**Required settings:** +- \`host='0.0.0.0'\` - Allows external connections (not just localhost) +- \`threaded=True\` - Handles concurrent users +- \`debug=False\` - Security (don't expose debug info) + +**Restart after code changes:** +\`\`\`bash +pkill -f "python3 app.py" +python3 app.py & +\`\`\` + +**Browser not showing changes?** Hard refresh: \`Ctrl+Shift+R\` or \`Cmd+Shift+R\` + +--- + +## Troubleshooting Checklist + +| Issue | Check | Fix | +|-------|-------|-----| +| Data doesn't load | Path format | Change \`fetch('/api/...')\` to \`fetch('api/...')\` | +| 404 errors | Server running? 
| \`ps aux | grep python\` | +| CORS error | CORS setup | Ensure \`CORS(app)\` is added | +| BQ error | Auth | Check \`gcloud auth list\` | +| Blank page | Console errors | Check browser DevTools | +| Works locally, fails via URL | Host binding | Change \`localhost\` to \`0.0.0.0\` | +| Gateway timeout | Server/UUID | Check server running + correct UUID | +| Address in use | Port conflict | \`kill \$(lsof -t -i :8080)\` | +| Changes not showing | Cache/restart | Hard refresh + restart server | + +--- + +## Common Pitfalls + +- ❌ \`fetch('/api/data')\` — **Use** \`fetch('api/data')\` (no leading slash) +- ❌ \`host='localhost'\` — **Use** \`host='0.0.0.0'\` (allows proxy access) +- ❌ Placeholder \`[APP_UUID]\` — **Always get real UUID** with \`wb app list\` +- ❌ Forgetting to restart server after code changes +- ❌ Not checking server logs when debugging +DASHBOARD_SKILL_EOF } # Fetch workspace information @@ -1005,176 +1349,78 @@ wb app describe # App details --- -## ⚠️ Workbench URLs, Dashboards & Interactive Content (CRITICAL) - -**Use this section when a user wants to:** -- **Build a dashboard** or data visualization -- **Create interactive charts** (Plotly, D3.js, Bokeh, Chart.js) -- **Run HTML files** with JavaScript -- **Launch web apps** (Flask, Streamlit, Shiny, FastAPI) -- **Display any content** that needs to run in a browser +## ⚠️ Workbench Web Apps & Proxy URLs (CRITICAL) -### The Correct URL Format +> **🚨 STOP! If user wants a dashboard, chart, Flask app, HTML page, or ANY web UI:** +> **→ READ \`~/.workbench/skills/DASHBOARD_BUILDER.md\` FIRST!** +> +> That skill contains critical configuration, working templates, and troubleshooting for all interactive web content. 
-**All web content MUST be accessed via the Workbench proxy URL:** +### Quick Reference +**Proxy URL format (all web content):** \`\`\` https://workbench.verily.com/app/[APP_UUID]/proxy/[PORT]/[PATH] \`\`\` -### ⚠️ How to Get the App UUID (IMPORTANT) - -**You MUST automatically get the app UUID - NEVER ask the user for it.** - -**Option 1 (Preferred):** Run this command and use the output: +**Get App UUID automatically (NEVER ask user for it):** \`\`\`bash wb app list --format=json | jq -r '.[] | select(.status == "RUNNING") | .id' | head -1 \`\`\` -**Option 2:** Check if environment variable is set: -\`\`\`bash -echo \$WORKBENCH_APP_ID -\`\`\` - -**Option 3:** Parse from browser URL (if user provides it): -The URL \`https://workbench.verily.com/app/abc123-def456/lab\` contains UUID \`abc123-def456\` +### ⚠️ JavaScript Relative Paths (Critical for Dashboards) -**⚡ LLM INSTRUCTION:** When constructing dashboard/proxy URLs: -1. First run Option 1 command to get the running app UUID -2. Use that actual UUID in the URL you provide -3. Do NOT use placeholders like [APP_UUID] in your final response -4. Do NOT ask the user to find/replace the UUID themselves +**All fetch() calls in JavaScript MUST use relative paths:** +\`\`\`javascript +// ✅ CORRECT - works through Workbench proxy +fetch('api/data') -### ✅ Correct Examples +// ❌ WRONG - absolute path breaks through proxy (404 error!) 
+fetch('/api/data') \`\`\` -https://workbench.verily.com/app/abc123-def456/proxy/8080/ -https://workbench.verily.com/app/abc123-def456/proxy/8501/index.html -https://workbench.verily.com/app/abc123-def456/proxy/8000/dashboard.html -\`\`\` - -### ❌ WRONG Formats (These will fail) -\`\`\` -https://abc123-def456.workbench-app.verily.com/ ← WRONG: Bad Request error -http://localhost:8080/ ← WRONG: Not accessible externally -file:///home/jupyter/dashboard.html ← WRONG: JavaScript blocked by browser -\`\`\` - -### Why \`file://\` URLs Don't Work for Interactive Content -**You cannot open HTML files directly using \`file://\` paths.** The browser blocks JavaScript execution for security reasons. This affects: -- HTML dashboards with charts (Plotly, D3.js, Chart.js) -- Interactive visualizations -- Any HTML with \` - - -

Data Dashboard

-
- - - -''' - -with open('dashboard.html', 'w') as f: - f.write(html_content) - -print("✅ Dashboard created!") - -# Get the app UUID automatically -import subprocess -result = subprocess.run( - ["bash", "-c", "wb app list --format=json | jq -r '.[] | select(.status == \"RUNNING\") | .id' | head -1"], - capture_output=True, text=True -) -app_uuid = result.stdout.strip() - -print("\\nNow run in terminal:") -print(" python3 -m http.server 8000") -print("") -print("Then access at:") -print(f" https://workbench.verily.com/app/{app_uuid}/proxy/8000/dashboard.html") -\`\`\` - -### Common Ports by Use Case -| Content Type | Suggested Port | Command | -|--------------|---------------|---------| -| HTML dashboards | 8000 | \`python3 -m http.server 8000\` | -| Streamlit apps | 8501 | \`streamlit run app.py\` | -| Flask/FastAPI | 8080 | \`flask run --port 8080\` | -| Shiny apps | 3838 | (configured in app) | - -### Pro Tips -1. **Keep server running** - The HTTP server must stay running in a terminal -2. **Use background mode** - \`python3 -m http.server 8000 &\` to run in background -3. **Check if port is in use** - \`lsof -i :8000\` before starting -4. **Kill existing server** - \`kill \$(lsof -t -i :8000)\` if port is occupied - --- ## Creating Custom Apps -> **IMPORTANT: When a user asks to create an app, turn code into an app, or build something deployable, follow this decision process:** - -### Step 1: Check Against Templates First - -**Read \`~/.workbench/skills/APP_TEMPLATES.md\`** and ask: -- Does a pre-built template match their needs? -- Can a template be easily extended? 
+> **When a user asks to create an app, turn code into an app, or build something deployable:** -| User's Goal | Recommended Template | -|-------------|---------------------| -| REST API, backend service | \`flask-api\` | -| Data dashboard, visualization | \`streamlit-dashboard\` | -| R analysis, statistical work | \`rshiny-dashboard\` | -| File upload, processing | \`file-processor\` | +### Step 1: Determine the Type -### Step 2: If No Template Fits +| User Wants... | Read This Skill | +|---------------|-----------------| +| Dashboard, visualization, Flask app, web UI | \`DASHBOARD_BUILDER.md\` | +| Deployable custom app from scratch | \`CUSTOM_APP.md\` | -**Read \`~/.workbench/skills/CUSTOM_APP.md\`** for: -- Building from scratch -- Minimal working pattern -- Common pitfalls to avoid +### Step 2: Use the Appropriate Skill -### Step 3: Present Options to User +**For dashboards/web UIs** → \`~/.workbench/skills/DASHBOARD_BUILDER.md\` +- Working Flask templates with BigQuery +- Critical proxy URL configuration +- Tested troubleshooting guides -Always explain: -1. **Template option**: "There's a pre-built X template that does Y. We can customize it." -2. **From-scratch option**: "Or we can build something custom from the ground up." - -Let the user decide based on their specific needs. 
+**For deployable apps** → \`~/.workbench/skills/CUSTOM_APP.md\` +- Minimal devcontainer pattern +- Docker configuration +- Deployment checklist ### Quick Reference - **Templates**: https://github.com/aculotti-verily/wb-app-mcp-and-context/tree/templates-only/src/templates/ @@ -1188,10 +1434,23 @@ When users ask about specific topics, **read these skill files** for detailed gu | Topic | Skill File | When to Use | |-------|------------|-------------| -| Pre-built app templates | \`~/.workbench/skills/APP_TEMPLATES.md\` | User wants dashboard, API, file processor | -| Building apps from scratch | \`~/.workbench/skills/CUSTOM_APP.md\` | User needs full control or custom solution | - -**Always read BOTH skills when app creation comes up**, then recommend the best approach. +| **🚨 Dashboards, HTML, Flask, Web UIs** | \`~/.workbench/skills/DASHBOARD_BUILDER.md\` | **READ THIS FIRST** for any: dashboard, chart, visualization, Flask app, Streamlit, HTML page, web UI, interactive display, Plotly, or anything running on a port | +| Building custom apps | \`~/.workbench/skills/CUSTOM_APP.md\` | User wants to build a deployable app from scratch | + +### ⚡ Skill Trigger Guide + +**ALWAYS read \`DASHBOARD_BUILDER.md\` FIRST when user says ANY of these:** +- "create a dashboard" +- "visualize data" / "show me a chart" / "display data" +- "build a Flask app" / "run Flask" / "Flask server" +- "Streamlit" / "Plotly" / "interactive chart" +- "run on port" / "serve HTML" / "web page" +- "show in browser" / "open in new tab" +- Any request to display data interactively + +**Read CUSTOM_APP.md when:** +- "build a deployable app" / "create a custom app" +- "API service" / "backend" / "from scratch" --- diff --git a/features/src/llm-context/skills/DASHBOARD_BUILDER.md b/features/src/llm-context/skills/DASHBOARD_BUILDER.md new file mode 100644 index 000000000..9c5f19f13 --- /dev/null +++ b/features/src/llm-context/skills/DASHBOARD_BUILDER.md @@ -0,0 +1,678 @@ +# Web Apps & Dashboards Skill 
+ +**Build interactive web apps, dashboards, and visualizations that run on a port in Workbench.** + +> **Triggers:** +> - "Create a dashboard", "visualize data", "build charts" +> - "Run a Flask/Streamlit/FastAPI app" +> - "Display data in the browser", "interactive UI" +> - Any web app that serves content on a port + +--- + +## 🌐 Workbench Proxy & Web Apps Best Practices + +### Proxy URL Format + +All web apps in Workbench are accessed via: +``` +https://workbench.verily.com/app/[APP_UUID]/proxy/[PORT]/[PATH] +``` + +### ⚠️ How to Get the App UUID (CRITICAL) + +**You MUST automatically get the app UUID - NEVER ask the user for it.** + +```bash +# Run this command and use the output: +wb app list --format=json | jq -r '.[] | select(.status == "RUNNING") | .id' | head -1 +``` + +**⚡ LLM INSTRUCTION:** When constructing dashboard/proxy URLs: +1. First run the command above to get the running app UUID +2. Use that actual UUID in the URL you provide +3. Do NOT use placeholders like `[APP_UUID]` in your final response +4. 
Do NOT ask the user to find/replace the UUID themselves + +### ✅ Correct URL Examples +``` +https://workbench.verily.com/app/abc123-def456-789/proxy/8080/ +https://workbench.verily.com/app/abc123-def456-789/proxy/8501/index.html +https://workbench.verily.com/app/abc123-def456-789/proxy/8000/dashboard.html +``` + +### ❌ WRONG URL Formats (These WILL fail) +``` +https://abc123-def456.workbench-app.verily.com/ ← WRONG: "Bad Request" error +https://workbench-app.verily.com/abc123-def456/ ← WRONG: Invalid domain +http://localhost:8080/ ← WRONG: Not accessible externally +https://abc123-def456/workbench.verily.com/ ← WRONG: Reversed format +file:///home/jupyter/dashboard.html ← WRONG: JavaScript blocked +``` + +### ⚠️ Common Issue: JavaScript API Calls Failing + +**Problem:** JavaScript using absolute paths fails through Workbench proxy + +**Symptoms:** +- Dashboard loads but shows no data +- Charts remain empty with "-" placeholders +- Browser console shows 404 errors for API calls +- Flask/server logs show requests for `/` but NOT `/api/*` endpoints + +### ✅ Solution: Use Relative Paths (TESTED & CONFIRMED) + +**Always use relative paths (no leading `/`) for fetch/AJAX calls:** + +```javascript +// ✅ CORRECT - relative paths work through proxy +fetch('api/metadata') +fetch('api/data?filter=value') + +// ❌ WRONG - absolute paths fail +fetch('/api/metadata') +fetch('/api/data?filter=value') +``` + +### Why Absolute Paths Fail + +``` +User visits: https://workbench.verily.com/app/UUID/proxy/8080/ + +Absolute path: fetch('/api/data') + → Browser resolves to: https://workbench.verily.com/api/data ❌ (404!) 
+ +Relative path: fetch('api/data') + → Browser resolves to: https://workbench.verily.com/app/UUID/proxy/8080/api/data ✅ +``` + +### Alternative: Embed Data in HTML (For Static Dashboards) + +If you don't need dynamic filtering, embed data directly in the template: + +**Python (Flask):** +```python +@app.route('/') +def index(): + data = get_data_from_bigquery() + return render_template('dashboard.html', data_json=json.dumps(data)) +``` + +**HTML Template:** +```html + +``` + +**When to use:** Static dashboards, large datasets that don't change, or when filters can be client-side only. + +### Testing Checklist + +Before deploying any web app: + +- [ ] **Relative paths** - All `fetch()` calls use `'api/...'` not `'/api/...'` +- [ ] **Test locally** - `curl http://localhost:PORT/api/endpoint` returns data +- [ ] **Server logs** - Verify API requests arrive: `tail -f server.log` +- [ ] **Browser DevTools** - Network tab shows 200 status for API calls +- [ ] **App UUID obtained** - Not using placeholder `[APP_UUID]` + +--- + +## Workflow + +### Step 1: Understand Requirements + +Ask the user: +1. **Data source?** BigQuery table, CSV in bucket, or local file? +2. **Visualizations?** Charts (bar, line, scatter), tables, filters? +3. **Interactivity?** Static display or dynamic filtering? + +### Step 2: Auto-Detect Environment + +**Always run these commands first:** + +```bash +# Get app UUID (REQUIRED for final URL) +APP_UUID=$(wb app list --format=json | jq -r '.[] | select(.status == "RUNNING") | .id' | head -1) +echo "App UUID: $APP_UUID" + +# Verify Python +python3 --version + +# Check working directory +pwd +``` + +### Step 3: Install Dependencies + +```bash +pip install flask flask-cors pandas plotly google-cloud-bigquery db-dtypes +``` + +> **Note:** `db-dtypes` is required for BigQuery to properly convert data types for pandas. 
+ +### Step 4: Create Dashboard Structure + +``` +dashboard/ +├── app.py # Flask server +├── templates/ +│ └── index.html # Dashboard HTML +└── static/ + └── style.css # Optional styling +``` + +--- + +## Working Templates + +### Template 1: Simple BigQuery Dashboard + +**app.py:** +```python +from flask import Flask, render_template, jsonify +from flask_cors import CORS +from google.cloud import bigquery +import os + +app = Flask(__name__) +CORS(app) + +# Cache for data +_data_cache = None + +def get_bigquery_data(): + global _data_cache + if _data_cache is not None: + return _data_cache + + client = bigquery.Client() + query = """ + SELECT * + FROM `YOUR_PROJECT.YOUR_DATASET.YOUR_TABLE` + LIMIT 1000 + """ + df = client.query(query).to_dataframe() + _data_cache = df.to_dict(orient='records') + return _data_cache + +@app.route('/') +def index(): + return render_template('index.html') + +@app.route('/api/data') # Flask routes NEED the leading slash; only browser fetch() paths stay relative +def get_data(): + try: + data = get_bigquery_data() + return jsonify(data) + except Exception as e: + return jsonify({"error": str(e)}), 500 + +@app.route('/api/metadata') +def get_metadata(): + try: + data = get_bigquery_data() + if data: + return jsonify({ + "columns": list(data[0].keys()), + "row_count": len(data) + }) + return jsonify({"columns": [], "row_count": 0}) + except Exception as e: + return jsonify({"error": str(e)}), 500 + +if __name__ == '__main__': + # CRITICAL: host='0.0.0.0' required for Workbench proxy access + app.run(host='0.0.0.0', port=8080, debug=False, threaded=True) +``` + +**templates/index.html:** +```html + + + + Data Dashboard + + + + +
+

📊 Data Dashboard

+
+

Dataset Info

+
Loading metadata...
+
+
+

Data Visualization

+
Loading chart...
+
+
+

Data Table

+
Loading data...
+
+
+ + + + +``` + +--- + +### Template 2: Multi-Chart Dashboard with Filters + +**app.py additions:** +```python +@app.route('/api/data') +def get_data(): + # Get filter parameters (requires `from flask import request`) + column = request.args.get('filter_column') + value = request.args.get('filter_value') + + data = get_bigquery_data() + + if column and value: + data = [row for row in data if str(row.get(column, '')) == value] + + return jsonify(data) + +@app.route('/api/filters') +def get_filters(): + data = get_bigquery_data() + if not data: + return jsonify({}) + + # Get unique values for categorical columns + filters = {} + for col in data[0].keys(): + unique_values = list(set(str(row[col]) for row in data)) + if len(unique_values) < 50: # Only include if reasonable number + filters[col] = sorted(unique_values) + + return jsonify(filters) +``` + +**JavaScript filter implementation:** +```javascript +async function loadFilters() { + const response = await fetch('api/filters'); + const filters = await response.json(); + + const filterContainer = document.getElementById('filters'); + for (const [column, values] of Object.entries(filters)) { + const select = document.createElement('select'); + select.id = `filter-${column}`; + select.innerHTML = `` + + values.map(v => ``).join(''); + select.onchange = () => refreshData(); + + filterContainer.appendChild(document.createTextNode(column + ': ')); + filterContainer.appendChild(select); + } +} + +async function refreshData() { + const params = new URLSearchParams(); + document.querySelectorAll('select[id^="filter-"]').forEach(select => { + if (select.value) { + params.set('filter_column', select.id.replace('filter-', '')); + params.set('filter_value', select.value); + } + }); + + const response = await fetch(`api/data?${params}`); // Still relative!
+ const data = await response.json(); + updateCharts(data); +} +``` + +--- + +## Step 5: Test Locally + +**Before starting the server, test your setup:** + +```bash +# Start server in background +cd dashboard +python3 app.py & +sleep 2 + +# Test endpoints locally +echo "Testing root..." +curl -s http://localhost:8080/ | head -5 + +echo "Testing API..." +curl -s http://localhost:8080/api/metadata | jq . + +echo "Testing data..." +curl -s http://localhost:8080/api/data | jq '.[0]' +``` + +--- + +## Step 6: Start Server & Provide URL + +```bash +# Get the app UUID +APP_UUID=$(wb app list --format=json | jq -r '.[] | select(.status == "RUNNING") | .id' | head -1) + +# Start server +cd dashboard +nohup python3 app.py > server.log 2>&1 & + +echo "Dashboard running at:" +echo "https://workbench.verily.com/app/${APP_UUID}/proxy/8080/" +``` + +**Always provide the complete, working URL to the user - never placeholders!** + +--- + +## ⚠️ Critical Flask Server Configuration + +These settings are **REQUIRED** for Workbench dashboards to work: + +### 1. Server MUST bind to 0.0.0.0 (NOT localhost) + +```python +# ❌ WRONG - proxy cannot reach your app +app.run(host='localhost', port=8080) +app.run(host='127.0.0.1', port=8080) + +# ✅ CORRECT - accessible through Workbench proxy +app.run(host='0.0.0.0', port=8080, debug=False, threaded=True) +``` + +**Why:** The Workbench proxy routes external requests to your app. If bound to localhost, the proxy cannot reach it. + +### 2. Enable Threading for Concurrent Users + +```python +app.run(host='0.0.0.0', port=8080, debug=False, threaded=True) +``` + +**Why:** Multiple users may access simultaneously. `threaded=True` allows concurrent request handling. + +### 3. Disable Debug Mode + +```python +# ❌ WRONG - security risk, auto-reload issues +app.run(debug=True) + +# ✅ CORRECT +app.run(debug=False) +``` + +**Why:** Debug mode shouldn't be used in shared/production environments. + +### 4. 
Restarting Server After Code Changes + +Flask doesn't auto-reload when `debug=False`. After editing Python code: + +```bash +# Find and kill existing server +pkill -f "python3 app.py" +# Or: kill $(lsof -t -i :8080) + +# Restart +python3 app.py & +``` + +### 5. Browser Cache Issues + +If changes don't appear after restarting server: +- **Hard refresh:** `Ctrl+Shift+R` (Windows/Linux) or `Cmd+Shift+R` (Mac) +- Flask caches templates - server restart clears this + +--- + +## Troubleshooting + +### Data doesn't load in browser + +**1. Check paths in JavaScript:** +```javascript +// ❌ WRONG +fetch('/api/data') + +// ✅ CORRECT +fetch('api/data') +``` + +**2. Check server logs:** +```bash +tail -f server.log +# Or if running in foreground, check terminal output +``` + +**3. Test API directly:** +```bash +curl http://localhost:8080/api/data | jq '.[0]' +``` + +**4. Check browser DevTools:** +- Open Network tab +- Look for failed requests (red) +- Check the URL being requested + +### Server won't start + +```bash +# Check if port is in use +lsof -i :8080 + +# Kill existing process +kill $(lsof -t -i :8080) + +# Check Python errors +python3 app.py # Run in foreground to see errors +``` + +### BigQuery errors + +```bash +# Check authentication +gcloud auth list + +# Test BQ access +bq query --use_legacy_sql=false 'SELECT 1' + +# Check project +gcloud config get-value project +``` + +### Server not accessible through proxy (works locally, fails via URL) + +**Symptom:** `curl http://localhost:8080/` works, but Workbench URL fails + +**Cause:** Flask bound to `localhost` instead of `0.0.0.0` + +**Fix:** +```python +# Change this: +app.run(host='localhost', port=8080) +# To this: +app.run(host='0.0.0.0', port=8080) +``` + +### Changes not reflected after editing code + +**Cause 1:** Server not restarted +```bash +pkill -f "python3 app.py" +python3 app.py & +``` + +**Cause 2:** Browser cache +- Hard refresh: `Ctrl+Shift+R` or `Cmd+Shift+R` + +### Gateway timeout + +**Causes:** 
+1. Server not running: `ps aux | grep app.py` +2. Wrong UUID in URL: `wb app list --format=json` +3. Server bound to localhost (see above) + +--- + +## Development Workflow (Recommended) + +1. **Build and test locally first** + ```bash + curl http://localhost:8080/ + curl http://localhost:8080/api/metadata + ``` + +2. **Check server logs for errors** + ```bash + tail -f server.log + ``` + +3. **Only then test through Workbench proxy URL** + +4. **Use browser DevTools (F12) → Network tab** to debug client-side issues + +--- + +## Common Pitfalls Checklist + +Before declaring the dashboard complete: + +- [ ] **Relative paths** - All `fetch()` calls use `'api/...'` not `'/api/...'` +- [ ] **Host is 0.0.0.0** - Not `localhost` or `127.0.0.1` +- [ ] **threaded=True** - For concurrent users +- [ ] **debug=False** - For security +- [ ] **App UUID obtained** - Not using placeholder `[APP_UUID]` +- [ ] **Server running** - Process is active (`ps aux | grep python`) +- [ ] **Port correct** - URL uses same port as `app.run(port=...)` +- [ ] **CORS enabled** - `CORS(app)` added for cross-origin requests +- [ ] **Data cached** - Avoid repeated BigQuery calls +- [ ] **Error handling** - API returns errors as JSON, not crashes +- [ ] **Tested locally** - `curl` tests pass before giving URL +- [ ] **Server logs checked** - API requests appear in logs + +--- + +## Quick Reference + +| Issue | Check | Fix | +|-------|-------|-----| +| 404 on API | Path format | Remove leading `/` from fetch | +| CORS error | CORS setup | Add `CORS(app)` | +| Blank page | Server running? 
| `ps aux \| grep python` | +| Data error | BigQuery auth | `gcloud auth list` | +| Wrong port | URL vs code | Match port in URL to `app.run()` | +| Works locally, fails via URL | Host binding | Change `localhost` to `0.0.0.0` | +| Gateway timeout | Server/UUID | Check server running + correct UUID | +| Address in use | Port conflict | `kill $(lsof -t -i :8080)` | +| Changes not showing | Cache/restart | Hard refresh + restart server | + +--- + +## Example Prompts This Skill Handles + +- "Create a dashboard showing data from my BigQuery table" +- "Build an interactive chart for analyzing patient demographics" +- "Visualize the CSV files in my bucket" +- "Make a web dashboard with filters for exploring data" +- "Display query results in a browser with charts" From b76527db06091c45a676cc1329432ae66dc2560a Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Thu, 19 Feb 2026 16:58:30 -0500 Subject: [PATCH 29/86] Fix CUSTOM_APP.md skill with critical corrections Key fixes: - devcontainer.json MUST be in .devcontainer/ folder (not root) - Added proxyTargetPort requirement in customizations.workbench - Fixed dockerComposeFile path (../docker-compose.yaml) - Added volume mount for live code updates - Added reference to create-custom-app.sh quick start script - Added Common Mistakes Checklist - Simplified directory structure to match working pattern --- features/src/llm-context/generate-context.sh | 300 +++++++++---------- 1 file changed, 145 insertions(+), 155 deletions(-) diff --git a/features/src/llm-context/generate-context.sh b/features/src/llm-context/generate-context.sh index 9bd29fccf..76a9e0383 100755 --- a/features/src/llm-context/generate-context.sh +++ b/features/src/llm-context/generate-context.sh @@ -118,71 +118,132 @@ install_skills() { **Practical guide for creating simple, reliable Workbench apps.** -> **When to use this guide:** For simple apps (Flask APIs, static sites, custom tools).
-> For apps needing Workbench CLI, gcloud, or Jupyter, see the [full-featured approach](https://github.com/verily-src/workbench-app-devcontainers). +> **Official Reference:** https://github.com/verily-src/workbench-app-devcontainers +> +> **Quick Start Script:** Use \`./scripts/create-custom-app.sh\` for auto-generated app structure! + +--- + +## 🚀 Quick Start (Recommended) + +The official repo has a script that generates a complete app structure: + +\`\`\`bash +# Clone the official repo +git clone https://github.com/verily-src/workbench-app-devcontainers.git +cd workbench-app-devcontainers + +# Run the quick start script +./scripts/create-custom-app.sh my-app quay.io/jupyter/base-notebook 8888 jovyan /home/jovyan +\`\`\` + +This generates all required files in \`src/my-app/\` with correct structure. + +--- -## TL;DR - The Minimal Pattern That Works +## ⚠️ Critical Requirements + +### 1. File Structure (MUST follow this exactly) + +\`\`\` +your-repo/ +├── .devcontainer/ +│ └── devcontainer.json ← MUST be in .devcontainer/ folder! +├── docker-compose.yaml +├── Dockerfile +├── devcontainer-template.json +└── app/ + └── your_app.py +\`\`\` + +**⚠️ CRITICAL:** The \`devcontainer.json\` file MUST be inside a \`.devcontainer/\` folder, NOT at the repo root! + +### 2. Container Requirements Workbench custom apps need exactly **three things**: -1. Container named `application-server` -2. Connected to `app-network` (external Docker network) +1. Container named \`application-server\` +2. Connected to \`app-network\` (external Docker network) 3. HTTP server on a port -**That's it.** Everything else is optional (and often causes problems). - --- -## The Minimal Working Pattern (Copy This) +## The Working Pattern (Copy This) -### File 1: `.devcontainer.json` -```json +### File 1: \`.devcontainer/devcontainer.json\` + +**Location:** \`.devcontainer/devcontainer.json\` (NOT at root!) 
+ +\`\`\`json { "name": "Your App Name", - "dockerComposeFile": "docker-compose.yaml", + "dockerComposeFile": "../docker-compose.yaml", "service": "app", "shutdownAction": "none", - "workspaceFolder": "/workspace", - "remoteUser": "root" + "workspaceFolder": "/app", + "remoteUser": "root", + "customizations": { + "workbench": { + "proxyTargetPort": 8080 + } + } } -``` +\`\`\` + +**⚠️ CRITICAL settings:** +- \`"dockerComposeFile": "../docker-compose.yaml"\` - Must go UP one level since we're in \`.devcontainer/\` +- \`"proxyTargetPort": 8080\` - REQUIRED for Workbench to know which port to proxy +- \`"workspaceFolder": "/app"\` - Should match WORKDIR in Dockerfile -### File 2: `docker-compose.yaml` -```yaml +### File 2: \`docker-compose.yaml\` + +**Location:** Repository root + +\`\`\`yaml services: app: container_name: "application-server" build: - context: ../.. - dockerfile: src/YOUR-APP-NAME/Dockerfile + context: . + dockerfile: Dockerfile restart: always ports: - "8080:8080" + volumes: + - .:/app:cached networks: - app-network networks: app-network: external: true -``` +\`\`\` + +**⚠️ CRITICAL settings:** +- \`container_name: "application-server"\` - Workbench looks for this exact name +- \`networks: app-network\` with \`external: true\` - Required for Workbench connectivity +- \`volumes: - .:/app:cached\` - Mounts code for live updates + +### File 3: \`Dockerfile\` -### File 3: `Dockerfile` -```dockerfile +\`\`\`dockerfile FROM python:3.11-slim WORKDIR /app -COPY src/YOUR-APP-NAME/app/requirements.txt . +COPY requirements.txt . RUN pip install --no-cache-dir -r requirements.txt -COPY src/YOUR-APP-NAME/app/ . +COPY . . 
EXPOSE 8080 -CMD ["python", "your_app.py"] -``` +# CRITICAL: Must bind to 0.0.0.0 for Workbench proxy +CMD ["python", "app.py"] +\`\`\` + +### File 4: \`devcontainer-template.json\` -### File 4: `devcontainer-template.json` -```json +\`\`\`json { "id": "your-app-name", "description": "Your app description", @@ -191,196 +252,125 @@ CMD ["python", "your_app.py"] "options": {}, "platforms": ["Any"] } -``` +\`\`\` --- -## Directory Structure +## Common Mistakes Checklist -``` -src/YOUR-APP-NAME/ -├── .devcontainer.json -├── devcontainer-template.json -├── docker-compose.yaml -├── Dockerfile -├── README.md -└── app/ - ├── your_app.py - ├── requirements.txt - └── (other files) -``` +Before deploying, verify: ---- +- [ ] \`devcontainer.json\` is in \`.devcontainer/\` folder (NOT at root) +- [ ] \`dockerComposeFile\` path starts with \`../\` (goes up from .devcontainer/) +- [ ] \`proxyTargetPort\` is set in customizations.workbench +- [ ] \`container_name\` is exactly \`"application-server"\` +- [ ] Network is \`app-network\` with \`external: true\` +- [ ] Flask/server binds to \`0.0.0.0\` (not \`localhost\`) +- [ ] Volume mount included for code updates -## What NOT To Do (Lessons Learned) +--- -### DON'T use complex base images unless needed -❌ `workbench-jupyter` base image - Has its own startup config that conflicts with CMD overrides -✅ `python:3.11-slim` - Clean, simple, no surprises +## ⚠️ Workbench App URLs (CRITICAL) -### DON'T use devcontainer features -❌ Features like `ghcr.io/dhoeric/features/google-cloud-cli` - Uses deprecated `apt-key`, fails on newer Debian -❌ Features like `workbench-tools` - Expect specific system packages -✅ Install what you need directly in the Dockerfile +**When accessing your app, you MUST use this format:** -### DON'T use postCreateCommand/postStartCommand -❌ `./startupscript/post-startup.sh` - Expects specific user/home structure, may fail -✅ Self-contained Dockerfile with everything built in +\`\`\` 
+https://workbench.verily.com/app/[APP_UUID]/proxy/[PORT]/[PATH] +\`\`\` -### DON'T use supervisor for multiple processes (unless truly needed) -❌ Supervisor + Jupyter + Flask - Complex, many failure points -✅ Single process serving everything (Flask can serve static files) +### Get App UUID: +\`\`\`bash +wb app list --format=json | jq -r '.[] | select(.status == "RUNNING") | .id' | head -1 +\`\`\` -### DON'T fight with Jupyter config -❌ Overriding CMD on workbench-jupyter image - Causes `root_dir`/`file_to_run` conflicts -✅ Don't use Jupyter at all if you don't need it +### ❌ WRONG Formats (Will fail) +\`\`\` +https://abc123-def456.workbench-app.verily.com/ ← WRONG +http://localhost:8080/ ← WRONG +\`\`\` --- -## Flask App: Serve Static Files Directly - -If your app has a Flask backend + static HTML, just have Flask serve everything: +## Flask App Example -```python -import os +\`\`\`python from flask import Flask from flask_cors import CORS -SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) -app = Flask(__name__, static_folder=SCRIPT_DIR, static_url_path='/static') +app = Flask(__name__) CORS(app) @app.route('/') -def serve_index(): - return app.send_static_file('index.html') - -# ... your other routes ... +def index(): + return '

Hello Workbench!

' if __name__ == '__main__': + # CRITICAL: host='0.0.0.0' required for Workbench proxy app.run(host='0.0.0.0', port=8080, debug=False, threaded=True) -``` - -**No separate HTTP server needed. No supervisor. One process.** +\`\`\` --- ## Common Errors and Fixes -### Error: `apt-key: command not found` -**Cause:** Devcontainer feature uses deprecated apt-key on newer Debian -**Fix:** Remove the feature from .devcontainer.json, install directly in Dockerfile if needed - -### Error: `root_dir and file_to_run are incompatible` -**Cause:** Overriding CMD on workbench-jupyter base image conflicts with its config -**Fix:** Don't use workbench-jupyter. Use python:3.11-slim instead - -### Error: `supports_credentials in conjunction with origin '*'` -**Cause:** Flask-CORS config conflict -**Fix:** Just use `CORS(app)` with no options - -### Error: Container restart loop -**Cause:** Main process exits immediately -**Fix:** Make sure your CMD runs a long-lived process (Flask server, not a script that exits) - -### Error: `Application-server port is empty` -**Cause:** Container not exposing port correctly, or app crashing before binding -**Fix:** Check `docker logs application-server` to see the actual error +| Error | Cause | Fix | +|-------|-------|-----| +| App fails to create | \`devcontainer.json\` at root | Move to \`.devcontainer/devcontainer.json\` | +| Proxy can't reach app | Missing \`proxyTargetPort\` | Add to \`customizations.workbench\` | +| "Bad Request" error | Wrong URL format | Use \`workbench.verily.com/app/UUID/proxy/PORT/\` | +| Server not accessible | Bound to \`localhost\` | Change to \`host='0.0.0.0'\` | +| Container restart loop | Process exits immediately | Ensure server runs continuously | --- ## Deployment -### Deploy to Workbench In Workbench UI, create custom app with: -- **Repository:** `git@github.com:YOUR-ORG/YOUR-REPO.git` -- **Branch:** `your-branch` -- **Folder:** `src/YOUR-APP-NAME` - -### For faster deploys (optional): Push to GAR -```bash -# 
Build -cd src/YOUR-APP-NAME -docker compose build - -# Tag -export TAG="us-central1-docker.pkg.dev/PROJECT/REPO/NAME:$(date +'%Y%m%d')" -docker tag YOUR-APP-NAME-app:latest ${TAG} - -# Push -docker push ${TAG} - -# Update docker-compose.yaml to use image: instead of build: -``` +- **Repository:** \`https://github.com/YOUR-ORG/YOUR-REPO.git\` +- **Branch:** \`main\` +- **Folder:** \`.\` (root) or \`src/YOUR-APP-NAME\` if in monorepo --- ## Local Testing -```bash +\`\`\`bash # Create required network docker network create app-network # Build and run -cd src/YOUR-APP-NAME docker compose build docker compose up # Access at http://localhost:8080 -``` - ---- - -## Debugging on VM - -```bash -# SSH to VM, then: -docker logs application-server --tail 100 -docker exec -it application-server /bin/sh -docker ps -a -``` +\`\`\` --- ## Reference Implementations -All examples are in the public repo: https://github.com/verily-src/workbench-app-devcontainers +All examples: https://github.com/verily-src/workbench-app-devcontainers/tree/master/src -| App | Description | Complexity | -|-----|-------------|------------| -| `src/playground/` | Multi-service app with Caddy | Simple | -| `src/vscode/` | VS Code Server on port 8443 | Pre-built image | -| `src/r-analysis/` | RStudio on port 8787 | Pre-built image | -| `src/workbench-jupyter/` | JupyterLab with Workbench tools | Full-featured | +| App | Description | Port | +|-----|-------------|------| +| \`playground/\` | Simple multi-service example | 8080 | +| \`vscode/\` | VS Code Server | 8443 | +| \`r-analysis/\` | RStudio | 8787 | +| \`workbench-jupyter/\` | JupyterLab with tools | 8888 | --- -## When DO You Need Features? 
+## When to Use Features -Sometimes you genuinely need the full-featured approach: +Sometimes you need the full-featured approach: | Need | Solution | |------|----------| -| Workbench CLI (`wb`) | Use `workbench-tools` feature | -| LLM/MCP integration | Use `wb-mcp-server` feature | -| Pre-authenticated gcloud | Use `workbench-tools` feature | -| Jupyter notebooks | Use `workbench-jupyter` base image | - -**If you need these, accept the complexity.** But test thoroughly. - ---- - -## Key Insight - -The old guides suggested using `workbench-jupyter` base image + devcontainer features + startup scripts. This adds complexity that causes failures. - -The **playground pattern** proves you only need: -1. A container named `application-server` -2. On the `app-network` network -3. Serving HTTP on a port - -Everything else is optional convenience that often breaks. +| Workbench CLI (\`wb\`) | Use \`workbench-tools\` feature | +| LLM/MCP integration | Use \`wb-mcp-server\` feature | +| Pre-authenticated gcloud | Use \`workbench-tools\` feature | -**When in doubt, simplify.** +**If you need these, use the full \`workbench-app-devcontainers\` repo as your base.** SKILL_EOF # Create APP_TEMPLATES.md skill (full version, embedded) From 29b20833433209f3db9a8776c07d3534ecba5bc7 Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Thu, 19 Feb 2026 17:11:02 -0500 Subject: [PATCH 30/86] CRITICAL FIX: .devcontainer.json must be at repo ROOT --- features/src/llm-context/generate-context.sh | 31 ++++++++------------ 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/features/src/llm-context/generate-context.sh b/features/src/llm-context/generate-context.sh index 76a9e0383..54fb796f8 100755 --- a/features/src/llm-context/generate-context.sh +++ b/features/src/llm-context/generate-context.sh @@ -147,8 +147,7 @@ This generates all required files in \`src/my-app/\` with correct structure. 
\`\`\` your-repo/ -├── .devcontainer/ -│ └── devcontainer.json ← MUST be in .devcontainer/ folder! +├── .devcontainer.json ← MUST be at repo ROOT (not in a folder!) ├── docker-compose.yaml ├── Dockerfile ├── devcontainer-template.json @@ -156,7 +155,7 @@ your-repo/ └── your_app.py \`\`\` -**⚠️ CRITICAL:** The \`devcontainer.json\` file MUST be inside a \`.devcontainer/\` folder, NOT at the repo root! +**⚠️ CRITICAL:** Workbench expects \`.devcontainer.json\` at the **repo ROOT**, NOT inside a \`.devcontainer/\` folder! ### 2. Container Requirements @@ -169,30 +168,25 @@ Workbench custom apps need exactly **three things**: ## The Working Pattern (Copy This) -### File 1: \`.devcontainer/devcontainer.json\` +### File 1: \`.devcontainer.json\` -**Location:** \`.devcontainer/devcontainer.json\` (NOT at root!) +**Location:** Repo ROOT (same level as docker-compose.yaml) \`\`\`json { "name": "Your App Name", - "dockerComposeFile": "../docker-compose.yaml", + "dockerComposeFile": "docker-compose.yaml", "service": "app", "shutdownAction": "none", "workspaceFolder": "/app", - "remoteUser": "root", - "customizations": { - "workbench": { - "proxyTargetPort": 8080 - } - } + "remoteUser": "root" } \`\`\` **⚠️ CRITICAL settings:** -- \`"dockerComposeFile": "../docker-compose.yaml"\` - Must go UP one level since we're in \`.devcontainer/\` -- \`"proxyTargetPort": 8080\` - REQUIRED for Workbench to know which port to proxy +- \`"dockerComposeFile": "docker-compose.yaml"\` - Same directory (both at root) - \`"workspaceFolder": "/app"\` - Should match WORKDIR in Dockerfile +- File MUST be named \`.devcontainer.json\` at repo root ### File 2: \`docker-compose.yaml\` @@ -260,9 +254,8 @@ CMD ["python", "app.py"] Before deploying, verify: -- [ ] \`devcontainer.json\` is in \`.devcontainer/\` folder (NOT at root) -- [ ] \`dockerComposeFile\` path starts with \`../\` (goes up from .devcontainer/) -- [ ] \`proxyTargetPort\` is set in customizations.workbench +- [ ] \`.devcontainer.json\` is 
at repo ROOT (NOT in a folder!) +- [ ] \`dockerComposeFile\` is \`"docker-compose.yaml"\` (same directory) - [ ] \`container_name\` is exactly \`"application-server"\` - [ ] Network is \`app-network\` with \`external: true\` - [ ] Flask/server binds to \`0.0.0.0\` (not \`localhost\`) @@ -315,8 +308,8 @@ if __name__ == '__main__': | Error | Cause | Fix | |-------|-------|-----| -| App fails to create | \`devcontainer.json\` at root | Move to \`.devcontainer/devcontainer.json\` | -| Proxy can't reach app | Missing \`proxyTargetPort\` | Add to \`customizations.workbench\` | +| App fails to create / No container | \`devcontainer.json\` in wrong location | Move to repo ROOT as \`.devcontainer.json\` | +| App fails to create | \`devcontainer.json\` in \`.devcontainer/\` folder | Workbench needs it at ROOT, not in folder | | "Bad Request" error | Wrong URL format | Use \`workbench.verily.com/app/UUID/proxy/PORT/\` | | Server not accessible | Bound to \`localhost\` | Change to \`host='0.0.0.0'\` | | Container restart loop | Process exits immediately | Ensure server runs continuously | From 3bc0deb8a710e755939d3791338fe36ac709cb01 Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Thu, 19 Feb 2026 17:12:37 -0500 Subject: [PATCH 31/86] Update all CUSTOM_APP.md skills with correct .devcontainer.json placement CRITICAL: Workbench expects .devcontainer.json at repo ROOT, not in folder. Updated standalone skill file to match embedded version. --- features/src/llm-context/skills/CUSTOM_APP.md | 281 ++++++++---------- 1 file changed, 122 insertions(+), 159 deletions(-) diff --git a/features/src/llm-context/skills/CUSTOM_APP.md b/features/src/llm-context/skills/CUSTOM_APP.md index 94648e764..4e6518393 100644 --- a/features/src/llm-context/skills/CUSTOM_APP.md +++ b/features/src/llm-context/skills/CUSTOM_APP.md @@ -2,45 +2,99 @@ **Practical guide for creating simple, reliable Workbench apps.** -> **When to use this guide:** For simple apps (Flask APIs, static sites, custom tools). 
-> For apps needing Workbench CLI, gcloud, or Jupyter, see the [full-featured approach](https://github.com/verily-src/workbench-app-devcontainers). +> **Official Reference:** https://github.com/verily-src/workbench-app-devcontainers +> +> **Quick Start Script:** Use `./scripts/create-custom-app.sh` for auto-generated app structure! -## TL;DR - The Minimal Pattern That Works +--- + +## 🚀 Quick Start (Recommended) + +The official repo has a script that generates a complete app structure: + +```bash +# Clone the official repo +git clone https://github.com/verily-src/workbench-app-devcontainers.git +cd workbench-app-devcontainers + +# Run the quick start script +./scripts/create-custom-app.sh my-app quay.io/jupyter/base-notebook 8888 jovyan /home/jovyan +``` + +This generates all required files in `src/my-app/` with correct structure. + +**Arguments:** +- `app-name`: Name of your app +- `docker-image`: Base image (e.g., `python:3.11-slim`, `jupyter/base-notebook`) +- `port`: Port your app exposes (e.g., `8080`, `8888`) +- `username`: User inside container (default: `root`) +- `home-dir`: Home directory (default: `/root`) + +--- + +## ⚠️ Critical Requirements + +### 1. File Structure (MUST follow this exactly) + +``` +your-repo/ +├── .devcontainer.json ← MUST be at repo ROOT (not in a folder!) +├── docker-compose.yaml +├── Dockerfile +├── devcontainer-template.json +└── app/ + └── your_app.py +``` + +**⚠️ CRITICAL:** Workbench expects `.devcontainer.json` at the **repo ROOT**, NOT inside a `.devcontainer/` folder! + +### 2. Container Requirements Workbench custom apps need exactly **three things**: 1. Container named `application-server` 2. Connected to `app-network` (external Docker network) 3. HTTP server on a port -**That's it.** Everything else is optional (and often causes problems). 
- --- -## The Minimal Working Pattern (Copy This) +## The Working Pattern (Copy This) ### File 1: `.devcontainer.json` + +**Location:** Repo ROOT (same level as docker-compose.yaml) + ```json { "name": "Your App Name", "dockerComposeFile": "docker-compose.yaml", "service": "app", "shutdownAction": "none", - "workspaceFolder": "/workspace", + "workspaceFolder": "/app", "remoteUser": "root" } ``` +**⚠️ CRITICAL settings:** +- `"dockerComposeFile": "docker-compose.yaml"` - Same directory (both at root) +- `"workspaceFolder": "/app"` - Should match WORKDIR in Dockerfile +- File MUST be named `.devcontainer.json` at repo root + ### File 2: `docker-compose.yaml` + +**Location:** Repository root + ```yaml services: app: container_name: "application-server" build: - context: ../.. - dockerfile: src/YOUR-APP-NAME/Dockerfile + context: . + dockerfile: Dockerfile restart: always ports: - "8080:8080" + volumes: + - .:/app:cached networks: - app-network @@ -49,23 +103,31 @@ networks: external: true ``` +**⚠️ CRITICAL settings:** +- `container_name: "application-server"` - Workbench looks for this exact name +- `networks: app-network` with `external: true` - Required for Workbench connectivity +- `volumes: - .:/app:cached` - Mounts code for live updates + ### File 3: `Dockerfile` + ```dockerfile FROM python:3.11-slim WORKDIR /app -COPY src/YOUR-APP-NAME/app/requirements.txt . +COPY requirements.txt . RUN pip install --no-cache-dir -r requirements.txt -COPY src/YOUR-APP-NAME/app/ . +COPY . . 
EXPOSE 8080 -CMD ["python", "your_app.py"] +# CRITICAL: Must bind to 0.0.0.0 for Workbench proxy +CMD ["python", "app.py"] ``` ### File 4: `devcontainer-template.json` + ```json { "id": "your-app-name", @@ -79,149 +141,78 @@ CMD ["python", "your_app.py"] --- -## Directory Structure +## Common Mistakes Checklist -``` -src/YOUR-APP-NAME/ -├── .devcontainer.json -├── devcontainer-template.json -├── docker-compose.yaml -├── Dockerfile -├── README.md -└── app/ - ├── your_app.py - ├── requirements.txt - └── (other files) -``` +Before deploying, verify: ---- +- [ ] `.devcontainer.json` is at repo ROOT (NOT in a folder!) +- [ ] `dockerComposeFile` is `"docker-compose.yaml"` (same directory) +- [ ] `container_name` is exactly `"application-server"` +- [ ] Network is `app-network` with `external: true` +- [ ] Flask/server binds to `0.0.0.0` (not `localhost`) +- [ ] Volume mount included for code updates -## What NOT To Do (Lessons Learned) +--- -### DON'T use complex base images unless needed -❌ `workbench-jupyter` base image - Has its own startup config that conflicts with CMD overrides -✅ `python:3.11-slim` - Clean, simple, no surprises +## ⚠️ Workbench App URLs (CRITICAL) -### DON'T use devcontainer features -❌ Features like `ghcr.io/dhoeric/features/google-cloud-cli` - Uses deprecated `apt-key`, fails on newer Debian -❌ Features like `workbench-tools` - Expect specific system packages -✅ Install what you need directly in the Dockerfile +**When accessing your app, you MUST use this format:** -### DON'T use postCreateCommand/postStartCommand -❌ `./startupscript/post-startup.sh` - Expects specific user/home structure, may fail -✅ Self-contained Dockerfile with everything built in +``` +https://workbench.verily.com/app/[APP_UUID]/proxy/[PORT]/[PATH] +``` -### DON'T use supervisor for multiple processes (unless truly needed) -❌ Supervisor + Jupyter + Flask - Complex, many failure points -✅ Single process serving everything (Flask can serve static files) +### Get App UUID: 
+```bash +wb app list --format=json | jq -r '.[] | select(.status == "RUNNING") | .id' | head -1 +``` -### DON'T fight with Jupyter config -❌ Overriding CMD on workbench-jupyter image - Causes `root_dir`/`file_to_run` conflicts -✅ Don't use Jupyter at all if you don't need it +### ❌ WRONG Formats (Will fail) +``` +https://abc123-def456.workbench-app.verily.com/ ← WRONG +http://localhost:8080/ ← WRONG +``` --- -## Flask App: Serve Static Files Directly - -If your app has a Flask backend + static HTML, just have Flask serve everything: +## Flask App Example ```python -import os from flask import Flask from flask_cors import CORS -SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) -app = Flask(__name__, static_folder=SCRIPT_DIR, static_url_path='/static') +app = Flask(__name__) CORS(app) @app.route('/') -def serve_index(): - return app.send_static_file('index.html') - -# ... your other routes ... +def index(): + return '

<h1>Hello Workbench!</h1>

' if __name__ == '__main__': + # CRITICAL: host='0.0.0.0' required for Workbench proxy app.run(host='0.0.0.0', port=8080, debug=False, threaded=True) ``` -**No separate HTTP server needed. No supervisor. One process.** - --- ## Common Errors and Fixes -### Error: `apt-key: command not found` -**Cause:** Devcontainer feature uses deprecated apt-key on newer Debian -**Fix:** Remove the feature from .devcontainer.json, install directly in Dockerfile if needed - -### Error: `root_dir and file_to_run are incompatible` -**Cause:** Overriding CMD on workbench-jupyter base image conflicts with its config -**Fix:** Don't use workbench-jupyter. Use python:3.11-slim instead - -### Error: `supports_credentials in conjunction with origin '*'` -**Cause:** Flask-CORS config conflict -**Fix:** Just use `CORS(app)` with no options - -### Error: Container restart loop -**Cause:** Main process exits immediately -**Fix:** Make sure your CMD runs a long-lived process (Flask server, not a script that exits) - -### Error: `Application-server port is empty` -**Cause:** Container not exposing port correctly, or app crashing before binding -**Fix:** Check `docker logs application-server` to see the actual error +| Error | Cause | Fix | +|-------|-------|-----| +| App fails to create / No container | `devcontainer.json` in wrong location | Move to repo ROOT as `.devcontainer.json` | +| App fails to create | `devcontainer.json` in `.devcontainer/` folder | Workbench needs it at ROOT, not in folder | +| "Bad Request" error | Wrong URL format | Use `workbench.verily.com/app/UUID/proxy/PORT/` | +| Server not accessible | Bound to `localhost` | Change to `host='0.0.0.0'` | +| Container restart loop | Process exits immediately | Ensure server runs continuously | --- ## Deployment -### Deploy to Workbench In Workbench UI, create custom app with: -- **Repository:** `git@github.com:YOUR-ORG/YOUR-REPO.git` -- **Branch:** `your-branch` -- **Folder:** `src/YOUR-APP-NAME` - -### For faster deploys 
(optional): Push to GAR -```bash -# Build -cd src/YOUR-APP-NAME -docker compose build - -# Tag -export TAG="us-central1-docker.pkg.dev/PROJECT/REPO/NAME:$(date +'%Y%m%d')" -docker tag YOUR-APP-NAME-app:latest ${TAG} - -# Push -docker push ${TAG} - -# Update docker-compose.yaml to use image: instead of build: -``` - ---- - -## ⚠️ Workbench App URLs (CRITICAL) - -**When accessing your app or generating URLs for users, you MUST use this format:** - -``` -https://workbench.verily.com/app/[APP_UUID]/proxy/[PORT]/[PATH] -``` - -### Correct Examples -``` -https://workbench.verily.com/app/abc123-def456/proxy/8080/ -https://workbench.verily.com/app/abc123-def456/proxy/8501/dashboard -``` - -### ❌ WRONG Formats (Will fail with "Bad Request") -``` -https://abc123-def456.workbench-app.verily.com/ ← WRONG -http://localhost:8080/ ← WRONG -file:///home/jupyter/dashboard.html ← WRONG (JS blocked) -``` - -**Always use the proxy URL format. Never use localhost or custom domain patterns.** - -> **📊 Building dashboards or HTML visualizations?** See the "Workbench URLs, Dashboards & Interactive Content" section in `~/CLAUDE.md` for how to serve HTML files with JavaScript (requires HTTP server). 
+- **Repository:** `https://github.com/YOUR-ORG/YOUR-REPO.git` +- **Branch:** `main` +- **Folder:** `.` (root) or `src/YOUR-APP-NAME` if in monorepo --- @@ -232,7 +223,6 @@ file:///home/jupyter/dashboard.html ← WRONG (JS blocked) docker network create app-network # Build and run -cd src/YOUR-APP-NAME docker compose build docker compose up @@ -241,54 +231,27 @@ docker compose up --- -## Debugging on VM - -```bash -# SSH to VM, then: -docker logs application-server --tail 100 -docker exec -it application-server /bin/sh -docker ps -a -``` - ---- - ## Reference Implementations -All examples are in the public repo: https://github.com/verily-src/workbench-app-devcontainers +All examples: https://github.com/verily-src/workbench-app-devcontainers/tree/master/src -| App | Description | Complexity | -|-----|-------------|------------| -| `src/playground/` | Multi-service app with Caddy | Simple | -| `src/vscode/` | VS Code Server on port 8443 | Pre-built image | -| `src/r-analysis/` | RStudio on port 8787 | Pre-built image | -| `src/workbench-jupyter/` | JupyterLab with Workbench tools | Full-featured | +| App | Description | Port | +|-----|-------------|------| +| `playground/` | Simple multi-service example | 8080 | +| `vscode/` | VS Code Server | 8443 | +| `r-analysis/` | RStudio | 8787 | +| `workbench-jupyter/` | JupyterLab with tools | 8888 | --- -## When DO You Need Features? +## When to Use Features -Sometimes you genuinely need the full-featured approach: +Sometimes you need the full-featured approach: | Need | Solution | |------|----------| | Workbench CLI (`wb`) | Use `workbench-tools` feature | | LLM/MCP integration | Use `wb-mcp-server` feature | | Pre-authenticated gcloud | Use `workbench-tools` feature | -| Jupyter notebooks | Use `workbench-jupyter` base image | - -**If you need these, accept the complexity.** But test thoroughly. 
- ---- - -## Key Insight - -The old guides suggested using `workbench-jupyter` base image + devcontainer features + startup scripts. This adds complexity that causes failures. - -The **playground pattern** proves you only need: -1. A container named `application-server` -2. On the `app-network` network -3. Serving HTTP on a port - -Everything else is optional convenience that often breaks. -**When in doubt, simplify.** +**If you need these, use the full `workbench-app-devcontainers` repo as your base.** From 77695f99514e0a84a59e3e5db3907cf32cef73cb Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Thu, 19 Feb 2026 17:19:41 -0500 Subject: [PATCH 32/86] Comprehensive CUSTOM_APP.md update: Two patterns for stability Added clear distinction between: - Pattern 1: Simple Standalone (for dashboards, self-contained apps) - Pattern 2: Full-Featured Monorepo (for wb CLI, bucket mounting) Key insights from working examples: - r-shiny-demo-app, simple-dashboard-app: Pattern 1 - workbench-app-devcontainers apps: Pattern 2 Critical requirements documented: - .devcontainer.json at ROOT for Pattern 1 - cap_add, devices, security_opt sections - container_name: application-server - networks: app-network with external: true Added decision flowchart and comprehensive examples. --- features/src/llm-context/skills/CUSTOM_APP.md | 343 +++++++++++------- 1 file changed, 219 insertions(+), 124 deletions(-) diff --git a/features/src/llm-context/skills/CUSTOM_APP.md b/features/src/llm-context/skills/CUSTOM_APP.md index 4e6518393..1586836ac 100644 --- a/features/src/llm-context/skills/CUSTOM_APP.md +++ b/features/src/llm-context/skills/CUSTOM_APP.md @@ -1,88 +1,90 @@ # Creating Custom Workbench Apps -**Practical guide for creating simple, reliable Workbench apps.** - > **Official Reference:** https://github.com/verily-src/workbench-app-devcontainers -> -> **Quick Start Script:** Use `./scripts/create-custom-app.sh` for auto-generated app structure! 
--- -## 🚀 Quick Start (Recommended) - -The official repo has a script that generates a complete app structure: - -```bash -# Clone the official repo -git clone https://github.com/verily-src/workbench-app-devcontainers.git -cd workbench-app-devcontainers +## ⚠️ Choose Your Pattern First -# Run the quick start script -./scripts/create-custom-app.sh my-app quay.io/jupyter/base-notebook 8888 jovyan /home/jovyan -``` +There are **TWO valid patterns** for Workbench custom apps: -This generates all required files in `src/my-app/` with correct structure. +| Pattern | Use When | Complexity | +|---------|----------|------------| +| **Simple (Standalone)** | Self-contained apps, no `wb` CLI needed | Minimal | +| **Full-Featured (Monorepo)** | Need `wb` CLI, bucket mounting, features | Requires monorepo structure | -**Arguments:** -- `app-name`: Name of your app -- `docker-image`: Base image (e.g., `python:3.11-slim`, `jupyter/base-notebook`) -- `port`: Port your app exposes (e.g., `8080`, `8888`) -- `username`: User inside container (default: `root`) -- `home-dir`: Home directory (default: `/root`) +**Most dashboards and simple apps should use Pattern 1.** --- -## ⚠️ Critical Requirements +## Pattern 1: Simple Standalone App (Recommended for Dashboards) + +Use this for Flask, Streamlit, or any self-contained app. -### 1. File Structure (MUST follow this exactly) +### Working Examples +- https://github.com/aculotti-verily/simple-dashboard-app +- https://github.com/aculotti-verily/r-shiny-demo-app +### File Structure ``` your-repo/ -├── .devcontainer.json ← MUST be at repo ROOT (not in a folder!) +├── .devcontainer.json ← At repo ROOT! ├── docker-compose.yaml -├── Dockerfile ├── devcontainer-template.json -└── app/ - └── your_app.py +├── requirements.txt ← (or package.json, etc.) +└── app.py ← Your application code ``` -**⚠️ CRITICAL:** Workbench expects `.devcontainer.json` at the **repo ROOT**, NOT inside a `.devcontainer/` folder! - -### 2. 
Container Requirements - -Workbench custom apps need exactly **three things**: -1. Container named `application-server` -2. Connected to `app-network` (external Docker network) -3. HTTP server on a port - ---- - -## The Working Pattern (Copy This) - ### File 1: `.devcontainer.json` -**Location:** Repo ROOT (same level as docker-compose.yaml) - ```json { - "name": "Your App Name", + "name": "My App", "dockerComposeFile": "docker-compose.yaml", "service": "app", "shutdownAction": "none", - "workspaceFolder": "/app", + "workspaceFolder": "/workspace", "remoteUser": "root" } ``` -**⚠️ CRITICAL settings:** -- `"dockerComposeFile": "docker-compose.yaml"` - Same directory (both at root) -- `"workspaceFolder": "/app"` - Should match WORKDIR in Dockerfile -- File MUST be named `.devcontainer.json` at repo root +**Key points:** +- NO `postCreateCommand` or `postStartCommand` +- NO `features` section +- File MUST be at repo ROOT (not in a folder) ### File 2: `docker-compose.yaml` -**Location:** Repository root +**Option A: Use image directly + install deps in command (simplest)** +```yaml +services: + app: + container_name: "application-server" + image: "python:3.11-slim" + restart: always + working_dir: /workspace + command: > + bash -c "pip install -r requirements.txt && + python app.py" + volumes: + - .:/workspace:cached + ports: + - 8080:8080 + networks: + - app-network + cap_add: + - SYS_ADMIN + devices: + - /dev/fuse + security_opt: + - apparmor:unconfined + +networks: + app-network: + external: true +``` +**Option B: Build from Dockerfile (if you need custom setup)** ```yaml services: app: @@ -91,72 +93,122 @@ services: context: . 
dockerfile: Dockerfile restart: always - ports: - - "8080:8080" volumes: - - .:/app:cached + - .:/workspace:cached + ports: + - 8080:8080 networks: - app-network + cap_add: + - SYS_ADMIN + devices: + - /dev/fuse + security_opt: + - apparmor:unconfined networks: app-network: external: true ``` -**⚠️ CRITICAL settings:** -- `container_name: "application-server"` - Workbench looks for this exact name -- `networks: app-network` with `external: true` - Required for Workbench connectivity -- `volumes: - .:/app:cached` - Mounts code for live updates - -### File 3: `Dockerfile` - -```dockerfile -FROM python:3.11-slim - -WORKDIR /app - -COPY requirements.txt . -RUN pip install --no-cache-dir -r requirements.txt - -COPY . . - -EXPOSE 8080 - -# CRITICAL: Must bind to 0.0.0.0 for Workbench proxy -CMD ["python", "app.py"] -``` - -### File 4: `devcontainer-template.json` - +### File 3: `devcontainer-template.json` ```json { - "id": "your-app-name", - "description": "Your app description", + "id": "my-app", "version": "1.0.0", - "name": "Your App Name", + "name": "My App", + "description": "Description of my app", "options": {}, "platforms": ["Any"] } ``` +### ⚠️ Critical Requirements + +- [ ] `.devcontainer.json` at repo ROOT (not in `.devcontainer/` folder!) +- [ ] `container_name: "application-server"` (exact name) +- [ ] `networks: app-network` with `external: true` +- [ ] Server binds to `0.0.0.0` (not `localhost`) +- [ ] Include `cap_add`, `devices`, and `security_opt` sections + --- -## Common Mistakes Checklist +## Pattern 2: Full-Featured App (Monorepo) + +Use this when you need: +- Workbench CLI (`wb`) +- Automatic bucket mounting +- Pre-authenticated `gcloud`/`aws` +- Devcontainer features + +### How to Use +1. **Fork** https://github.com/verily-src/workbench-app-devcontainers +2. Run the quick start script: + ```bash + ./scripts/create-custom-app.sh my-app python:3.11-slim 8080 root /root + ``` +3. Customize the generated app in `src/my-app/` +4. Push to your fork +5. 
Create custom app in Workbench pointing to `src/my-app` + +### Structure (in monorepo) +``` +your-fork/ +├── .devcontainer/ +│ └── features/ ← Symlinks to features/src/ +├── features/ +│ └── src/ +│ └── workbench-tools/ +├── startupscript/ +│ ├── post-startup.sh +│ └── remount-on-restart.sh +└── src/ + └── my-app/ + ├── .devcontainer.json + ├── docker-compose.yaml + └── devcontainer-template.json +``` -Before deploying, verify: +### App's `.devcontainer.json` (Pattern 2) +```json +{ + "name": "my-app", + "dockerComposeFile": "docker-compose.yaml", + "service": "app", + "shutdownAction": "none", + "workspaceFolder": "/workspace", + "postCreateCommand": [ + "./startupscript/post-startup.sh", + "root", + "/root", + "${templateOption:cloud}", + "${templateOption:login}" + ], + "postStartCommand": [ + "./startupscript/remount-on-restart.sh", + "root", + "/root", + "${templateOption:cloud}", + "${templateOption:login}" + ], + "features": { + "./.devcontainer/features/workbench-tools": { + "cloud": "${templateOption:cloud}", + "username": "root", + "userHomeDir": "/root" + } + }, + "remoteUser": "root" +} +``` -- [ ] `.devcontainer.json` is at repo ROOT (NOT in a folder!) 
-- [ ] `dockerComposeFile` is `"docker-compose.yaml"` (same directory) -- [ ] `container_name` is exactly `"application-server"` -- [ ] Network is `app-network` with `external: true` -- [ ] Flask/server binds to `0.0.0.0` (not `localhost`) -- [ ] Volume mount included for code updates +**When using Pattern 2, the Folder field in Workbench UI should be `src/my-app`** --- ## ⚠️ Workbench App URLs (CRITICAL) -**When accessing your app, you MUST use this format:** +**When accessing your app, MUST use this format:** ``` https://workbench.verily.com/app/[APP_UUID]/proxy/[PORT]/[PATH] @@ -167,7 +219,7 @@ https://workbench.verily.com/app/[APP_UUID]/proxy/[PORT]/[PATH] wb app list --format=json | jq -r '.[] | select(.status == "RUNNING") | .id' | head -1 ``` -### ❌ WRONG Formats (Will fail) +### ❌ WRONG Formats ``` https://abc123-def456.workbench-app.verily.com/ ← WRONG http://localhost:8080/ ← WRONG @@ -175,8 +227,22 @@ http://localhost:8080/ ← WRONG --- -## Flask App Example +## Common Errors and Fixes + +| Error | Cause | Fix | +|-------|-------|-----| +| App fails to create / No container | `.devcontainer.json` in wrong location | Move to repo ROOT | +| App fails to create | Missing `startupscript/` in monorepo | Use Pattern 1, or fork official repo | +| Container restart loop | Process exits immediately | Ensure server runs continuously | +| Server not accessible | Bound to `localhost` | Change to `host='0.0.0.0'` | +| "Bad Request" error | Wrong URL format | Use proxy URL format | +| Features not found | Using Pattern 2 without monorepo structure | Use Pattern 1 for standalone apps | + +--- + +## Flask App Example (Pattern 1) +**app.py:** ```python from flask import Flask from flask_cors import CORS @@ -189,30 +255,56 @@ def index(): return '

<h1>Hello Workbench!</h1>

' if __name__ == '__main__': - # CRITICAL: host='0.0.0.0' required for Workbench proxy app.run(host='0.0.0.0', port=8080, debug=False, threaded=True) ``` +**requirements.txt:** +``` +flask>=3.0.0 +flask-cors>=4.0.0 +``` + --- -## Common Errors and Fixes +## Streamlit Example (Pattern 1) -| Error | Cause | Fix | -|-------|-------|-----| -| App fails to create / No container | `devcontainer.json` in wrong location | Move to repo ROOT as `.devcontainer.json` | -| App fails to create | `devcontainer.json` in `.devcontainer/` folder | Workbench needs it at ROOT, not in folder | -| "Bad Request" error | Wrong URL format | Use `workbench.verily.com/app/UUID/proxy/PORT/` | -| Server not accessible | Bound to `localhost` | Change to `host='0.0.0.0'` | -| Container restart loop | Process exits immediately | Ensure server runs continuously | +**docker-compose.yaml:** +```yaml +services: + app: + container_name: "application-server" + image: "python:3.11-slim" + restart: always + working_dir: /workspace + command: > + bash -c "pip install -r requirements.txt && + streamlit run app.py --server.port=8501 --server.address=0.0.0.0" + volumes: + - .:/workspace:cached + ports: + - 8501:8501 + networks: + - app-network + cap_add: + - SYS_ADMIN + devices: + - /dev/fuse + security_opt: + - apparmor:unconfined + +networks: + app-network: + external: true +``` --- ## Deployment -In Workbench UI, create custom app with: +In Workbench UI: - **Repository:** `https://github.com/YOUR-ORG/YOUR-REPO.git` - **Branch:** `main` -- **Folder:** `.` (root) or `src/YOUR-APP-NAME` if in monorepo +- **Folder:** `.` (Pattern 1) or `src/my-app` (Pattern 2) --- @@ -223,35 +315,38 @@ In Workbench UI, create custom app with: docker network create app-network # Build and run -docker compose build -docker compose up +docker compose up --build -# Access at http://localhost:8080 +# Access at http://localhost:PORT ``` --- ## Reference Implementations -All examples: 
https://github.com/verily-src/workbench-app-devcontainers/tree/master/src - -| App | Description | Port | -|-----|-------------|------| -| `playground/` | Simple multi-service example | 8080 | -| `vscode/` | VS Code Server | 8443 | -| `r-analysis/` | RStudio | 8787 | -| `workbench-jupyter/` | JupyterLab with tools | 8888 | +| App | Pattern | Port | Description | +|-----|---------|------|-------------| +| [simple-dashboard-app](https://github.com/aculotti-verily/simple-dashboard-app) | 1 | 8501 | Streamlit dashboard | +| [r-shiny-demo-app](https://github.com/aculotti-verily/r-shiny-demo-app) | 1 | 8080 | RShiny with Caddy | +| [playground](https://github.com/verily-src/workbench-app-devcontainers/tree/master/src/playground) | 1 | 8080 | Multi-service example | +| [workbench-jupyter](https://github.com/verily-src/workbench-app-devcontainers/tree/master/src/workbench-jupyter-docker) | 2 | 8888 | Full JupyterLab | +| [r-analysis](https://github.com/verily-src/workbench-app-devcontainers/tree/master/src/r-analysis) | 2 | 8787 | RStudio with features | --- -## When to Use Features - -Sometimes you need the full-featured approach: - -| Need | Solution | -|------|----------| -| Workbench CLI (`wb`) | Use `workbench-tools` feature | -| LLM/MCP integration | Use `wb-mcp-server` feature | -| Pre-authenticated gcloud | Use `workbench-tools` feature | +## Decision Flowchart -**If you need these, use the full `workbench-app-devcontainers` repo as your base.** +``` +Do you need wb CLI, bucket mounting, or gcloud auth? 
+ │ + ├── NO → Use Pattern 1 (Simple Standalone) + │ - Create single repo + │ - .devcontainer.json at ROOT + │ - No features, no startup scripts + │ + └── YES → Use Pattern 2 (Full-Featured Monorepo) + - Fork verily-src/workbench-app-devcontainers + - Run ./scripts/create-custom-app.sh + - App goes in src/my-app/ + - Folder field = "src/my-app" +``` From d5cc8c88a3d48e0a43497f0a64677e93db5af36d Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Thu, 19 Feb 2026 17:44:52 -0500 Subject: [PATCH 33/86] Update CUSTOM_APP.md based on working examples analysis Patterns from working apps: - clinical-abstraction-demo: Minimal (no volumes, no cap_add) - simple-dashboard-app: Image + command approach - r-shiny-demo-app: Multi-container with Caddy proxy - shiny-aws-ce: Full-featured with startup scripts Added reference implementations table with links. --- features/src/llm-context/skills/CUSTOM_APP.md | 299 +++++++----------- 1 file changed, 111 insertions(+), 188 deletions(-) diff --git a/features/src/llm-context/skills/CUSTOM_APP.md b/features/src/llm-context/skills/CUSTOM_APP.md index 1586836ac..9b1ac34a0 100644 --- a/features/src/llm-context/skills/CUSTOM_APP.md +++ b/features/src/llm-context/skills/CUSTOM_APP.md @@ -1,38 +1,31 @@ # Creating Custom Workbench Apps > **Official Reference:** https://github.com/verily-src/workbench-app-devcontainers +> **Quick Start:** Use `./scripts/create-custom-app.sh` for auto-generated app structure --- -## ⚠️ Choose Your Pattern First +## ⚠️ Choose Your Pattern -There are **TWO valid patterns** for Workbench custom apps: - -| Pattern | Use When | Complexity | -|---------|----------|------------| -| **Simple (Standalone)** | Self-contained apps, no `wb` CLI needed | Minimal | -| **Full-Featured (Monorepo)** | Need `wb` CLI, bucket mounting, features | Requires monorepo structure | - -**Most dashboards and simple apps should use Pattern 1.** +| Pattern | Use When | Example | +|---------|----------|---------| +| **Minimal (Standalone)** | 
Simple apps, no cloud resources | `clinical-abstraction-demo` | +| **Full-Featured (Monorepo)** | Need `wb` CLI, bucket mounting | Fork official repo | --- -## Pattern 1: Simple Standalone App (Recommended for Dashboards) +## Pattern 1: Minimal Standalone App -Use this for Flask, Streamlit, or any self-contained app. - -### Working Examples -- https://github.com/aculotti-verily/simple-dashboard-app -- https://github.com/aculotti-verily/r-shiny-demo-app +Based on working examples: `clinical-abstraction-demo`, `simple-dashboard-app` ### File Structure ``` your-repo/ -├── .devcontainer.json ← At repo ROOT! +├── .devcontainer.json ← At repo ROOT ├── docker-compose.yaml +├── Dockerfile ├── devcontainer-template.json -├── requirements.txt ← (or package.json, etc.) -└── app.py ← Your application code +└── app.py (or app/) ``` ### File 1: `.devcontainer.json` @@ -48,51 +41,38 @@ your-repo/ } ``` -**Key points:** -- NO `postCreateCommand` or `postStartCommand` -- NO `features` section -- File MUST be at repo ROOT (not in a folder) - ### File 2: `docker-compose.yaml` -**Option A: Use image directly + install deps in command (simplest)** +**Minimal (from clinical-abstraction-demo):** ```yaml services: app: container_name: "application-server" - image: "python:3.11-slim" + build: + context: . + dockerfile: Dockerfile restart: always - working_dir: /workspace - command: > - bash -c "pip install -r requirements.txt && - python app.py" - volumes: - - .:/workspace:cached ports: - - 8080:8080 + - "8080:8080" networks: - app-network - cap_add: - - SYS_ADMIN - devices: - - /dev/fuse - security_opt: - - apparmor:unconfined networks: app-network: external: true ``` -**Option B: Build from Dockerfile (if you need custom setup)** +**Alternative: Use image directly (from simple-dashboard-app):** ```yaml services: app: container_name: "application-server" - build: - context: . 
- dockerfile: Dockerfile + image: "python:3.11-slim" restart: always + working_dir: /workspace + command: > + bash -c "pip install -r requirements.txt && + python app.py" volumes: - .:/workspace:cached ports: @@ -111,138 +91,109 @@ networks: external: true ``` -### File 3: `devcontainer-template.json` +### File 3: `Dockerfile` + +```dockerfile +FROM python:3.11-slim + +WORKDIR /app + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY . . + +EXPOSE 8080 + +CMD ["python", "app.py"] +``` + +### File 4: `devcontainer-template.json` + ```json { "id": "my-app", "version": "1.0.0", "name": "My App", - "description": "Description of my app", + "description": "Description", "options": {}, "platforms": ["Any"] } ``` -### ⚠️ Critical Requirements +--- -- [ ] `.devcontainer.json` at repo ROOT (not in `.devcontainer/` folder!) -- [ ] `container_name: "application-server"` (exact name) -- [ ] `networks: app-network` with `external: true` -- [ ] Server binds to `0.0.0.0` (not `localhost`) -- [ ] Include `cap_add`, `devices`, and `security_opt` sections +## Pattern 2: Multi-Container with Caddy Proxy ---- +Based on `r-shiny-demo-app` - useful when your app needs a reverse proxy. -## Pattern 2: Full-Featured App (Monorepo) +```yaml +services: + application-server: + image: caddy:2.11-alpine + container_name: application-server + ports: + - "8080:8080" + volumes: + - ./Caddyfile:/etc/caddy/Caddyfile + networks: + - app-network + - internal-network -Use this when you need: -- Workbench CLI (`wb`) -- Automatic bucket mounting -- Pre-authenticated `gcloud`/`aws` -- Devcontainer features + my-app: + build: + context: . + dockerfile: Dockerfile + container_name: my-app + ports: + - "3000:3000" + networks: + - internal-network -### How to Use -1. **Fork** https://github.com/verily-src/workbench-app-devcontainers -2. Run the quick start script: - ```bash - ./scripts/create-custom-app.sh my-app python:3.11-slim 8080 root /root - ``` -3. 
Customize the generated app in `src/my-app/` -4. Push to your fork -5. Create custom app in Workbench pointing to `src/my-app` - -### Structure (in monorepo) -``` -your-fork/ -├── .devcontainer/ -│ └── features/ ← Symlinks to features/src/ -├── features/ -│ └── src/ -│ └── workbench-tools/ -├── startupscript/ -│ ├── post-startup.sh -│ └── remount-on-restart.sh -└── src/ - └── my-app/ - ├── .devcontainer.json - ├── docker-compose.yaml - └── devcontainer-template.json +networks: + app-network: + external: true + internal-network: + driver: bridge ``` -### App's `.devcontainer.json` (Pattern 2) -```json -{ - "name": "my-app", - "dockerComposeFile": "docker-compose.yaml", - "service": "app", - "shutdownAction": "none", - "workspaceFolder": "/workspace", - "postCreateCommand": [ - "./startupscript/post-startup.sh", - "root", - "/root", - "${templateOption:cloud}", - "${templateOption:login}" - ], - "postStartCommand": [ - "./startupscript/remount-on-restart.sh", - "root", - "/root", - "${templateOption:cloud}", - "${templateOption:login}" - ], - "features": { - "./.devcontainer/features/workbench-tools": { - "cloud": "${templateOption:cloud}", - "username": "root", - "userHomeDir": "/root" - } - }, - "remoteUser": "root" -} -``` +--- -**When using Pattern 2, the Folder field in Workbench UI should be `src/my-app`** +## Pattern 3: Full-Featured (Monorepo) + +For apps needing `wb` CLI, bucket mounting, gcloud auth. + +1. **Fork** https://github.com/verily-src/workbench-app-devcontainers +2. Run: `./scripts/create-custom-app.sh my-app python:3.11-slim 8080` +3. App created at `src/my-app/` +4. 
In Workbench, set **Folder** to `src/my-app` --- -## ⚠️ Workbench App URLs (CRITICAL) +## ⚠️ Critical Requirements -**When accessing your app, MUST use this format:** +- [ ] `.devcontainer.json` at repo ROOT +- [ ] `container_name: "application-server"` +- [ ] `networks: app-network` with `external: true` +- [ ] Server binds to `0.0.0.0` (not `localhost`) -``` -https://workbench.verily.com/app/[APP_UUID]/proxy/[PORT]/[PATH] -``` +--- + +## ⚠️ Workbench App URLs + +**Format:** `https://workbench.verily.com/app/[APP_UUID]/proxy/[PORT]/` -### Get App UUID: ```bash +# Get App UUID wb app list --format=json | jq -r '.[] | select(.status == "RUNNING") | .id' | head -1 ``` -### ❌ WRONG Formats -``` -https://abc123-def456.workbench-app.verily.com/ ← WRONG -http://localhost:8080/ ← WRONG -``` - ---- - -## Common Errors and Fixes - -| Error | Cause | Fix | -|-------|-------|-----| -| App fails to create / No container | `.devcontainer.json` in wrong location | Move to repo ROOT | -| App fails to create | Missing `startupscript/` in monorepo | Use Pattern 1, or fork official repo | -| Container restart loop | Process exits immediately | Ensure server runs continuously | -| Server not accessible | Bound to `localhost` | Change to `host='0.0.0.0'` | -| "Bad Request" error | Wrong URL format | Use proxy URL format | -| Features not found | Using Pattern 2 without monorepo structure | Use Pattern 1 for standalone apps | +**❌ Wrong:** `https://abc123.workbench-app.verily.com/` --- -## Flask App Example (Pattern 1) +## Flask App Example -**app.py:** ```python from flask import Flask from flask_cors import CORS @@ -258,39 +209,23 @@ if __name__ == '__main__': app.run(host='0.0.0.0', port=8080, debug=False, threaded=True) ``` -**requirements.txt:** -``` -flask>=3.0.0 -flask-cors>=4.0.0 -``` - --- -## Streamlit Example (Pattern 1) +## Streamlit Example -**docker-compose.yaml:** ```yaml +# docker-compose.yaml services: app: container_name: "application-server" image: "python:3.11-slim" 
- restart: always - working_dir: /workspace command: > - bash -c "pip install -r requirements.txt && + bash -c "pip install streamlit && streamlit run app.py --server.port=8501 --server.address=0.0.0.0" - volumes: - - .:/workspace:cached ports: - 8501:8501 networks: - app-network - cap_add: - - SYS_ADMIN - devices: - - /dev/fuse - security_opt: - - apparmor:unconfined networks: app-network: @@ -304,19 +239,15 @@ networks: In Workbench UI: - **Repository:** `https://github.com/YOUR-ORG/YOUR-REPO.git` - **Branch:** `main` -- **Folder:** `.` (Pattern 1) or `src/my-app` (Pattern 2) +- **Folder:** `.` (standalone) or `src/my-app` (monorepo) --- ## Local Testing ```bash -# Create required network docker network create app-network - -# Build and run docker compose up --build - # Access at http://localhost:PORT ``` @@ -324,29 +255,21 @@ docker compose up --build ## Reference Implementations -| App | Pattern | Port | Description | -|-----|---------|------|-------------| -| [simple-dashboard-app](https://github.com/aculotti-verily/simple-dashboard-app) | 1 | 8501 | Streamlit dashboard | -| [r-shiny-demo-app](https://github.com/aculotti-verily/r-shiny-demo-app) | 1 | 8080 | RShiny with Caddy | -| [playground](https://github.com/verily-src/workbench-app-devcontainers/tree/master/src/playground) | 1 | 8080 | Multi-service example | -| [workbench-jupyter](https://github.com/verily-src/workbench-app-devcontainers/tree/master/src/workbench-jupyter-docker) | 2 | 8888 | Full JupyterLab | -| [r-analysis](https://github.com/verily-src/workbench-app-devcontainers/tree/master/src/r-analysis) | 2 | 8787 | RStudio with features | +| App | Pattern | Source | +|-----|---------|--------| +| clinical-abstraction-demo | Minimal | [PeterSu92/workbench-app-devcontainers](https://github.com/PeterSu92/workbench-app-devcontainers/tree/yp_ac_clin/src/clinical-abstraction-demo) | +| simple-dashboard-app | Image + command | 
[aculotti-verily/simple-dashboard-app](https://github.com/aculotti-verily/simple-dashboard-app) | +| r-shiny-demo-app | Caddy proxy | [aculotti-verily/r-shiny-demo-app](https://github.com/aculotti-verily/r-shiny-demo-app) | +| playground | Minimal | [verily-src/workbench-app-devcontainers](https://github.com/verily-src/workbench-app-devcontainers/tree/master/src/playground) | +| shiny-aws-ce | Full-featured | [verily-src/workbench-app-devcontainers](https://github.com/verily-src/workbench-app-devcontainers/tree/nbense/BENCH-6958/src/shiny-aws-ce) | --- -## Decision Flowchart +## Common Errors -``` -Do you need wb CLI, bucket mounting, or gcloud auth? - │ - ├── NO → Use Pattern 1 (Simple Standalone) - │ - Create single repo - │ - .devcontainer.json at ROOT - │ - No features, no startup scripts - │ - └── YES → Use Pattern 2 (Full-Featured Monorepo) - - Fork verily-src/workbench-app-devcontainers - - Run ./scripts/create-custom-app.sh - - App goes in src/my-app/ - - Folder field = "src/my-app" -``` +| Error | Possible Cause | +|-------|---------------| +| App fails to create | `.devcontainer.json` in wrong location | +| No container created | Check Workbench logs, GitHub access | +| Container restart loop | App crashes on startup (check `docker logs`) | +| "Bad Request" | Wrong URL format | From 46a73f36e726dc5859570f08829a6ca39243b6be Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Thu, 19 Feb 2026 20:40:57 -0500 Subject: [PATCH 34/86] Fix: Ensure workspaceBaseURL is never empty Bug: If wb status succeeded but JSON parsing failed or server info was missing, workspaceBaseURL stayed empty, causing 'unsupported protocol scheme' errors in API calls. Fix: Set default URLs first, then try to update from wb status. Added final safety check to ensure URLs are never empty. Added warning logs for debugging when defaults are used. 
--- features/src/wb-mcp-server/main.go | 40 ++++++++++++++++++------------ 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/features/src/wb-mcp-server/main.go b/features/src/wb-mcp-server/main.go index 3b0c0fc53..be88fae48 100644 --- a/features/src/wb-mcp-server/main.go +++ b/features/src/wb-mcp-server/main.go @@ -1521,31 +1521,39 @@ WORKFLOW: } func initializeConfig() error { + // Default to production Verily URLs + workspaceBaseURL = "https://workbench.verily.com/api/wsm" + dataExplorerURL = "https://workbench.verily.com/api/de" + cmd := exec.Command("wb", "status", "--format=json") output, err := cmd.CombinedOutput() if err != nil { - // Fallback to production Verily URLs - workspaceBaseURL = "https://workbench.verily.com/api/wsm" - dataExplorerURL = "https://workbench.verily.com/api/de" + fmt.Fprintf(os.Stderr, "Warning: wb status failed, using default URLs: %v\n", err) } else { var status map[string]interface{} - if err := json.Unmarshal(output, &status); err == nil { - if server, ok := status["server"].(map[string]interface{}); ok { - // Get workspaceManagerUri from wb status output - if wsURL, ok := server["workspaceManagerUri"].(string); ok { - workspaceBaseURL = wsURL - // Derive dataExplorerUri from workspaceManagerUri - // Pattern: replace /api/wsm with /api/de - dataExplorerURL = strings.Replace(wsURL, "/api/wsm", "/api/de", 1) - } else { - // Fallback to production Verily URLs - workspaceBaseURL = "https://workbench.verily.com/api/wsm" - dataExplorerURL = "https://workbench.verily.com/api/de" - } + if err := json.Unmarshal(output, &status); err != nil { + fmt.Fprintf(os.Stderr, "Warning: failed to parse wb status JSON, using default URLs: %v\n", err) + } else if server, ok := status["server"].(map[string]interface{}); ok { + // Get workspaceManagerUri from wb status output + if wsURL, ok := server["workspaceManagerUri"].(string); ok && wsURL != "" { + workspaceBaseURL = wsURL + // Derive dataExplorerUri from workspaceManagerUri + // 
Pattern: replace /api/wsm with /api/de + dataExplorerURL = strings.Replace(wsURL, "/api/wsm", "/api/de", 1) } + } else { + fmt.Fprintf(os.Stderr, "Warning: server info not found in wb status, using default URLs\n") } } + // Final safety check - ensure URLs are never empty + if workspaceBaseURL == "" { + workspaceBaseURL = "https://workbench.verily.com/api/wsm" + } + if dataExplorerURL == "" { + dataExplorerURL = "https://workbench.verily.com/api/de" + } + fmt.Fprintf(os.Stderr, "Initialized - Workspace: %s, DataExplorer: %s\n", workspaceBaseURL, dataExplorerURL) return nil } From 10e22fe3dc688cfb9b2431276238847123491592 Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Fri, 20 Feb 2026 12:15:31 -0500 Subject: [PATCH 35/86] Remove private repo references, use official repo only - Updated CUSTOM_APP.md to reference only verily-src/workbench-app-devcontainers - Removed references to aculotti-verily, PeterSu92 private repos - Updated generate-context.sh to use official repo for examples - Reference implementations now point to official master branch apps --- features/src/llm-context/generate-context.sh | 27 +++++++++---------- features/src/llm-context/skills/CUSTOM_APP.md | 26 +++++++++--------- 2 files changed, 27 insertions(+), 26 deletions(-) diff --git a/features/src/llm-context/generate-context.sh b/features/src/llm-context/generate-context.sh index 54fb796f8..9ed5981a8 100755 --- a/features/src/llm-context/generate-context.sh +++ b/features/src/llm-context/generate-context.sh @@ -402,28 +402,27 @@ SKILL_EOF --- -## Template Location +## App Examples Location -All templates are at: +Official app examples are at: ``` -https://github.com/aculotti-verily/wb-app-mcp-and-context/tree/templates-only/src/templates/ +https://github.com/verily-src/workbench-app-devcontainers/tree/master/src ``` --- -## How to Use a Template +## How to Create a Custom App -### Option 1: Deploy Directly -``` -Repository: https://github.com/aculotti-verily/wb-app-mcp-and-context.git 
-Branch: templates-only -Folder: src/templates/ +### Option 1: Use Quick Start Script +```bash +# Fork the official repo, then: +./scripts/create-custom-app.sh my-app python:3.11-slim 8080 ``` ### Option 2: Copy and Customize -1. Copy the template folder to user's repo -2. Modify application code in `app/` -3. Update `devcontainer-template.json` with new name/description +1. Fork https://github.com/verily-src/workbench-app-devcontainers +2. Copy an existing app folder from `src/` (e.g., `example/`) +3. Modify the configuration and code 4. Push to GitHub and deploy --- @@ -1406,8 +1405,8 @@ file:///home/jupyter/dashboard.html ← JavaScript blocked - Deployment checklist ### Quick Reference -- **Templates**: https://github.com/aculotti-verily/wb-app-mcp-and-context/tree/templates-only/src/templates/ -- **Full-featured apps**: https://github.com/verily-src/workbench-app-devcontainers +- **Official app examples**: https://github.com/verily-src/workbench-app-devcontainers/tree/master/src +- **Quick start script**: https://github.com/verily-src/workbench-app-devcontainers/blob/master/scripts/create-custom-app.sh --- diff --git a/features/src/llm-context/skills/CUSTOM_APP.md b/features/src/llm-context/skills/CUSTOM_APP.md index 9b1ac34a0..c7afb5367 100644 --- a/features/src/llm-context/skills/CUSTOM_APP.md +++ b/features/src/llm-context/skills/CUSTOM_APP.md @@ -9,14 +9,14 @@ | Pattern | Use When | Example | |---------|----------|---------| -| **Minimal (Standalone)** | Simple apps, no cloud resources | `clinical-abstraction-demo` | +| **Minimal (Standalone)** | Simple apps, no cloud resources | `example` app in official repo | | **Full-Featured (Monorepo)** | Need `wb` CLI, bucket mounting | Fork official repo | --- ## Pattern 1: Minimal Standalone App -Based on working examples: `clinical-abstraction-demo`, `simple-dashboard-app` +Based on the `example` app in the [official repo](https://github.com/verily-src/workbench-app-devcontainers/tree/master/src/example). 
### File Structure ``` @@ -43,7 +43,7 @@ your-repo/ ### File 2: `docker-compose.yaml` -**Minimal (from clinical-abstraction-demo):** +**Minimal pattern:** ```yaml services: app: @@ -62,7 +62,7 @@ networks: external: true ``` -**Alternative: Use image directly (from simple-dashboard-app):** +**Alternative: Use image directly (no Dockerfile):** ```yaml services: app: @@ -125,7 +125,7 @@ CMD ["python", "app.py"] ## Pattern 2: Multi-Container with Caddy Proxy -Based on `r-shiny-demo-app` - useful when your app needs a reverse proxy. +Useful when your app needs a reverse proxy. See the [r-analysis](https://github.com/verily-src/workbench-app-devcontainers/tree/master/src/r-analysis) app for an RStudio example with startup scripts. ```yaml services: @@ -255,13 +255,15 @@ docker compose up --build ## Reference Implementations -| App | Pattern | Source | -|-----|---------|--------| -| clinical-abstraction-demo | Minimal | [PeterSu92/workbench-app-devcontainers](https://github.com/PeterSu92/workbench-app-devcontainers/tree/yp_ac_clin/src/clinical-abstraction-demo) | -| simple-dashboard-app | Image + command | [aculotti-verily/simple-dashboard-app](https://github.com/aculotti-verily/simple-dashboard-app) | -| r-shiny-demo-app | Caddy proxy | [aculotti-verily/r-shiny-demo-app](https://github.com/aculotti-verily/r-shiny-demo-app) | -| playground | Minimal | [verily-src/workbench-app-devcontainers](https://github.com/verily-src/workbench-app-devcontainers/tree/master/src/playground) | -| shiny-aws-ce | Full-featured | [verily-src/workbench-app-devcontainers](https://github.com/verily-src/workbench-app-devcontainers/tree/nbense/BENCH-6958/src/shiny-aws-ce) | +All examples are from the official repo: [verily-src/workbench-app-devcontainers](https://github.com/verily-src/workbench-app-devcontainers) + +| App | Pattern | Description | +|-----|---------|-------------| +| [example](https://github.com/verily-src/workbench-app-devcontainers/tree/master/src/example) | Minimal | 
Reference implementation using ttyd terminal | +| [workbench-jupyter](https://github.com/verily-src/workbench-app-devcontainers/tree/master/src/workbench-jupyter) | Full-featured | JupyterLab with Workbench integration | +| [r-analysis](https://github.com/verily-src/workbench-app-devcontainers/tree/master/src/r-analysis) | Full-featured | RStudio with startup scripts | +| [workbench-vscode](https://github.com/verily-src/workbench-app-devcontainers/tree/master/src/workbench-vscode) | Full-featured | VS Code Server in browser | +| [playground](https://github.com/verily-src/workbench-app-devcontainers/tree/master/src/playground) | Minimal | Simple base environment | --- From 6b16473b04c982ec355d124aebc290d8165ead6d Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Fri, 20 Feb 2026 12:19:49 -0500 Subject: [PATCH 36/86] Add VM troubleshooting section to CUSTOM_APP.md Added guidance for debugging app startup failures via SSH: - journalctl logs for devcontainer.service - /home/core/ startup scripts directory - /etc/systemd/system service definitions - Container status and logs - Common error messages and fixes --- features/src/llm-context/skills/CUSTOM_APP.md | 65 +++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/features/src/llm-context/skills/CUSTOM_APP.md b/features/src/llm-context/skills/CUSTOM_APP.md index c7afb5367..f1e7171a4 100644 --- a/features/src/llm-context/skills/CUSTOM_APP.md +++ b/features/src/llm-context/skills/CUSTOM_APP.md @@ -275,3 +275,68 @@ All examples are from the official repo: [verily-src/workbench-app-devcontainers | No container created | Check Workbench logs, GitHub access | | Container restart loop | App crashes on startup (check `docker logs`) | | "Bad Request" | Wrong URL format | + +--- + +## 🔧 Troubleshooting (SSH into VM) + +When an app fails to start, SSH into the VM and run these commands: + +### 1. 
Check Startup Scripts & Logs +```bash +# View devcontainer service logs (MOST IMPORTANT) +sudo journalctl -u devcontainer.service --no-pager | tail -100 + +# Check failure count +cat /tmp/devcontainer-failure-count 2>/dev/null + +# Check error message set by Workbench +curl -s -H "Metadata-Flavor: Google" \ + http://metadata.google.internal/computeMetadata/v1/instance/guest-attributes/startup_script/message +``` + +### 2. Check Startup Script Directory +```bash +# Workbench startup scripts live here +ls -la /home/core/ + +# Key scripts to check: +# - git-clone-devcontainer.sh (clones your repo) +# - docker-auth.sh (sets up Docker registry auth) +# - parse-devcontainer.sh (parses .devcontainer.json) +# - devcontainer.sh (builds and runs container) +``` + +### 3. Check Systemd Services +```bash +# View the devcontainer service definition +systemctl cat devcontainer.service + +# Check service status +systemctl status devcontainer.service +systemctl status proxy-readiness.service + +# List all relevant services +systemctl list-units --type=service | grep -i "devcontainer\|docker" +``` + +### 4. Check Container Status +```bash +# List all containers (including stopped) +docker ps -a + +# Check container logs +docker logs application-server 2>&1 | tail -50 + +# Check if repo was cloned +ls -la /home/core/devcontainer/ +``` + +### 5. 
Common Issues Found in Logs + +| Log Message | Cause | Fix | +|-------------|-------|-----| +| `docker-auth.sh: path parameter is required` | Workbench startup bug | Wait for fix or manual startup | +| `Failed to clone devcontainer GitHub repo` | GitHub access issue | Check repo permissions | +| `Container exited with code 1` | App crash | Check `docker logs application-server` | +| `proxy-agent or application-server is not started` | Container never started | Check earlier logs | From 579153801d0d1c3e1a039ec50c72a146b3d0444b Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Mon, 2 Mar 2026 15:42:28 -0500 Subject: [PATCH 37/86] Make MCP tools more prominent in CLAUDE.md context - Add 'MCP Tools First!' section at top of document - Update Data Exploration Cheatsheet to show MCP tools first - Redesign LLM Quick Patterns table with MCP/CLI column - Add 'Common Operations - USE MCP, NOT CLI' warning table - Update MCP Tools Available table with correct tool names - Address feedback that Claude was defaulting to CLI instead of MCP --- features/src/llm-context/generate-context.sh | 103 ++++++++++++++----- 1 file changed, 75 insertions(+), 28 deletions(-) diff --git a/features/src/llm-context/generate-context.sh b/features/src/llm-context/generate-context.sh index 9ed5981a8..64bed0df1 100755 --- a/features/src/llm-context/generate-context.sh +++ b/features/src/llm-context/generate-context.sh @@ -402,27 +402,28 @@ SKILL_EOF --- -## App Examples Location +## Template Location -Official app examples are at: +All templates are at: ``` -https://github.com/verily-src/workbench-app-devcontainers/tree/master/src +https://github.com/aculotti-verily/wb-app-mcp-and-context/tree/templates-only/src/templates/ ``` --- -## How to Create a Custom App +## How to Use a Template -### Option 1: Use Quick Start Script -```bash -# Fork the official repo, then: -./scripts/create-custom-app.sh my-app python:3.11-slim 8080 +### Option 1: Deploy Directly +``` +Repository: 
https://github.com/aculotti-verily/wb-app-mcp-and-context.git +Branch: templates-only +Folder: src/templates/ ``` ### Option 2: Copy and Customize -1. Fork https://github.com/verily-src/workbench-app-devcontainers -2. Copy an existing app folder from `src/` (e.g., `example/`) -3. Modify the configuration and code +1. Copy the template folder to user's repo +2. Modify application code in `app/` +3. Update `devcontainer-template.json` with new name/description 4. Push to GitHub and deploy --- @@ -956,6 +957,23 @@ You are working inside **Verily Workbench**, a secure cloud-based research envir --- +## ⚡ MCP Tools First! + +> **Before running ANY CLI command, check if an MCP tool exists for the operation.** +> MCP tools return structured JSON and are faster than parsing CLI output. + +| Common Task | ✅ Use This MCP Tool | +|-------------|---------------------| +| List data collections | \`workspace_list_data_collections\` | +| List resources | \`workspace_list_resources\` | +| Resources by folder | \`resource_list_tree\` | +| Query BigQuery | \`bq_execute\` | +| List bucket files | \`list_files\` | + +**Skip to:** [Data Exploration Cheatsheet](#-data-exploration-cheatsheet) | [MCP Tools](#mcp-tools-available) + +--- + ## What is Verily Workbench? Verily Workbench is a platform that enables researchers to: @@ -1109,7 +1127,18 @@ gs://your-bucket/ This is the **most important section** for quickly discovering and accessing data. +> **⚡ MCP FIRST:** Always check if an MCP tool exists before using CLI commands. MCP tools return structured data and are faster. 
+ ### Step 1: Find Your Resources + +**🎯 Use MCP tools (preferred):** +| What You Need | MCP Tool | +|---------------|----------| +| Data collections + their resources | \`workspace_list_data_collections\` | +| All resources (flat list) | \`workspace_list_resources\` | +| Resources organized by folder | \`resource_list_tree\` | + +**CLI fallback:** \`\`\`bash wb resource list --format=json | jq '.[] | {name: .id, type: .resourceType}' \`\`\` @@ -1145,13 +1174,19 @@ gsutil cat -r 0-1024 gs:///file.csv # Preview first 1KB ### 🤖 LLM Quick Patterns -| Question | Command | -|----------|---------| -| "What data is available?" | \`wb resource list\` | -| "What tables in dataset?" | \`bq ls :\` | -| "What columns in table?" | \`bq show --schema :.
\` | -| "How big is this table?" | \`bq show --format=prettyjson ... \\| jq '{rows: .numRows}'\` | -| "Show sample data" | \`bq head -n 5 :.
\` | +| User Question | Best Tool | Command/Tool | +|---------------|-----------|--------------| +| "What data collections do I have?" | **MCP** | \`workspace_list_data_collections\` | +| "What resources are in my workspace?" | **MCP** | \`workspace_list_resources\` | +| "Show resources by folder" | **MCP** | \`resource_list_tree\` | +| "Query this BigQuery table" | **MCP** | \`bq_execute\` | +| "What tables are in this dataset?" | CLI | \`bq ls :\` | +| "What columns in this table?" | CLI | \`bq show --schema :.
\` | +| "How big is this table?" | CLI | \`bq show --format=prettyjson ... \\| jq '{rows: .numRows}'\` | +| "Show me sample data" | CLI | \`bq head -n 5 :.
\` | +| "List files in bucket" | **MCP** | \`list_files\` | + +> **⚠️ Pattern to avoid:** Don't default to \`wb resource list\` for data collection questions. Use \`workspace_list_data_collections\` instead! --- @@ -1245,21 +1280,33 @@ This app has **two interfaces** to Workbench functionality: | **MCP Tools** | LLM operations | Structured responses, no shell needed, faster | Limited tool set | | **CLI (\`wb\`)** | Complex operations, fallback | Full feature coverage, human-friendly | Requires shell execution, text parsing | +### ⚠️ Common Operations — USE MCP, NOT CLI + +These operations have dedicated MCP tools. **Do NOT use CLI for these:** + +| Operation | ✅ Use MCP Tool | ❌ Don't Use CLI | +|-----------|-----------------|------------------| +| List data collections | \`workspace_list_data_collections\` | ~~\`wb resource list\`~~ | +| List all resources | \`workspace_list_resources\` | ~~\`wb resource list\`~~ | +| Resources by folder | \`resource_list_tree\` | ~~\`wb resource list-tree\`~~ | +| Run BigQuery query | \`bq_execute\` | ~~\`bq query\`~~ | +| List bucket files | \`list_files\` | ~~\`gsutil ls\`~~ | + ### 🤖 LLM Decision Guide -1. **Prefer MCP tools** when the operation is supported — they return structured data and don't require shell execution -2. **Fall back to CLI** when MCP doesn't have the tool, or for complex/chained operations -3. **Use cloud CLIs directly** (\`gsutil\`, \`bq\`, \`gcloud\`) for low-level cloud operations +1. **ALWAYS check MCP tools first** — especially for list/query operations +2. **Fall back to CLI only** when MCP doesn't have the tool +3. 
**Use cloud CLIs** (\`gsutil\`, \`bq\`) only for operations MCP doesn't support ### Example: Same Operation, Two Ways **List resources:** -- MCP: Use \`workspace_list_resources\` tool → returns JSON array -- CLI: Run \`wb resource list --format=json\` → parse stdout +- ✅ MCP: Use \`workspace_list_resources\` tool → returns JSON array +- ⚠️ CLI: Run \`wb resource list --format=json\` → requires shell, parsing **Query BigQuery:** -- MCP: Use \`bq_execute\` tool with query parameter → returns results -- CLI: Run \`bq query --use_legacy_sql=false 'SELECT ...'\` → parse output +- ✅ MCP: Use \`bq_execute\` tool with query parameter → returns results +- ⚠️ CLI: Run \`bq query --use_legacy_sql=false 'SELECT ...'\` → requires parsing --- @@ -1269,8 +1316,8 @@ The Workbench MCP server exposes these tools for programmatic LLM access: | MCP Tool | CLI Equivalent | Description | |----------|----------------|-------------| +| \`workspace_list_data_collections\` | N/A | **List data collections and their resources** | | \`workspace_list_resources\` | \`wb resource list\` | List all resources in the workspace | -| \`workspace_list_data_collections\` | N/A | List data collections and their resources | | \`resource_list_tree\` | \`wb resource list-tree\` | List resources organized by folder | | \`bq_execute\` | \`bq query\` | Run SQL queries against BigQuery | | \`workflow_job_run\` | \`wb workflow run\` | Submit a WDL/Nextflow workflow | @@ -1405,8 +1452,8 @@ file:///home/jupyter/dashboard.html ← JavaScript blocked - Deployment checklist ### Quick Reference -- **Official app examples**: https://github.com/verily-src/workbench-app-devcontainers/tree/master/src -- **Quick start script**: https://github.com/verily-src/workbench-app-devcontainers/blob/master/scripts/create-custom-app.sh +- **Templates**: https://github.com/aculotti-verily/wb-app-mcp-and-context/tree/templates-only/src/templates/ +- **Full-featured apps**: https://github.com/verily-src/workbench-app-devcontainers --- 
From fa1d5e82f640df1062f188fd93c866afcfa1894d Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Tue, 17 Mar 2026 12:44:35 -0400 Subject: [PATCH 38/86] Add WORKFLOW_TROUBLESHOOT skill for debugging failed WDL workflows - Add embedded WORKFLOW_TROUBLESHOOT.md skill in generate-context.sh - Covers: job identification, log pulling, resource checks, error diagnosis - Includes quick reference commands and error-to-fix lookup table - Add skill to Available Skills table and trigger guide in CLAUDE.md template --- features/src/llm-context/generate-context.sh | 321 ++++++++++++++++++ .../skills/WORKFLOW_TROUBLESHOOT.md | 310 +++++++++++++++++ 2 files changed, 631 insertions(+) create mode 100644 features/src/llm-context/skills/WORKFLOW_TROUBLESHOOT.md diff --git a/features/src/llm-context/generate-context.sh b/features/src/llm-context/generate-context.sh index 64bed0df1..a611ca908 100755 --- a/features/src/llm-context/generate-context.sh +++ b/features/src/llm-context/generate-context.sh @@ -820,6 +820,321 @@ python3 app.py & - ❌ Forgetting to restart server after code changes - ❌ Not checking server logs when debugging DASHBOARD_SKILL_EOF + + # Create WORKFLOW_TROUBLESHOOT.md skill (full version, embedded) + log_info "Creating WORKFLOW_TROUBLESHOOT.md skill..." + cat > "${SKILLS_DIR}/WORKFLOW_TROUBLESHOOT.md" << 'WORKFLOW_SKILL_EOF' +# WDL Workflow Troubleshooting Skill + +**Trigger:** User asks to troubleshoot, debug, or fix a failed workflow. + +--- + +## Quick Diagnosis (Start Here) + +\`\`\`bash +# 1. Find failed jobs +wb workflow job list --format=json | jq -r '.[] | select(.status=="FAILED") | "\(.id)\t\(.workflowName)\t\(.startTime)"' + +# 2. Get error message (replace JOB_ID) +wb workflow job describe --job= --format=json | jq -r '.failureMessage // "No message"' + +# 3. Find failed task +wb workflow job task list --job= --format=json | jq -r '.[] | select(.status=="FAILED") | .name' + +# 4. 
Get task error + logs +wb workflow job task describe --job= --task= --format=json | jq '{stderr, stdout, exitCode, failureMessage}' +\`\`\` + +**After running these 4 commands, you'll know:** which job failed, why, which task, and where logs are. + +--- + +## Step-by-Step Guide + +### Step 1: Identify Failed Job + +\`\`\`bash +# List all failed jobs +wb workflow job list --format=json | jq '.[] | select(.status == "FAILED") | {id, workflowName, status, startTime, endTime}' +\`\`\` + +**For batch jobs:** +\`\`\`bash +# List failed sub-jobs within a batch +wb workflow job batch list --job= --format=json | jq '.[] | select(.status == "FAILED") | {id, status}' +\`\`\` + +**Ask user:** Confirm which job ID to investigate. + +--- + +### Step 2: Get Job Details & Inputs + +\`\`\`bash +# Full job metadata +wb workflow job describe --job= --format=json +\`\`\` + +**Key fields to extract:** +\`\`\`bash +# Error message +wb workflow job describe --job= --format=json | jq -r '.failureMessage' + +# Inputs used +wb workflow job describe --job= --format=json | jq '.inputs' + +# Outputs (if any) +wb workflow job describe --job= --format=json | jq '.outputs' +\`\`\` + +--- + +### Step 3: Find Failed Task & Get Logs + +\`\`\`bash +# List all tasks with status +wb workflow job task list --job= --format=json | jq '.[] | {name, status, exitCode}' + +# Get failed task details +wb workflow job task describe --job= --task= --format=json +\`\`\` + +**Extract log URLs:** +\`\`\`bash +# Get stderr and stdout URLs +TASK_INFO=\$(wb workflow job task describe --job= --task= --format=json) +STDERR_URL=\$(echo \$TASK_INFO | jq -r '.stderr') +STDOUT_URL=\$(echo \$TASK_INFO | jq -r '.stdout') + +echo "stderr: \$STDERR_URL" +echo "stdout: \$STDOUT_URL" +\`\`\` + +--- + +### Step 4: Pull and Analyze Task Logs + +#### Read Log Contents + +\`\`\`bash +# Read stderr (usually contains errors) +gsutil cat "\$STDERR_URL" 2>/dev/null | tail -100 + +# Read stdout +gsutil cat "\$STDOUT_URL" 2>/dev/null | tail 
-100 + +# Search for common error patterns +gsutil cat "\$STDERR_URL" 2>/dev/null | grep -i -E "error|exception|failed|denied|killed|oom|memory|disk|timeout" | head -30 +\`\`\` + +#### Common Log File Patterns + +Cromwell execution logs are typically at: +\`\`\` +gs://///execution/ +├── stdout # Task standard output +├── stderr # Task standard error +├── script # The actual command that ran +├── rc # Return code (exit code) +└── script.submit # Submission script +\`\`\` + +**One-liner to read all execution files:** +\`\`\`bash +# Find execution directory from task describe, then: +EXEC_DIR=\$(echo \$TASK_INFO | jq -r '.executionDirectory // empty') +if [ -n "\$EXEC_DIR" ]; then + echo "=== script ===" && gsutil cat "\$EXEC_DIR/script" 2>/dev/null + echo "=== rc ===" && gsutil cat "\$EXEC_DIR/rc" 2>/dev/null + echo "=== stderr (last 50 lines) ===" && gsutil cat "\$EXEC_DIR/stderr" 2>/dev/null | tail -50 +fi +\`\`\` + +--- + +### Step 5: Check Resource Allocation & Usage + +#### What Was Requested (from WDL runtime) + +\`\`\`bash +# Get workflow definition to see runtime requirements +wb workflow describe --workflow= --format=json | jq '.sourceUrl' + +# Read WDL file +gsutil cat gs:////workflow.wdl | grep -A10 "runtime {" +\`\`\` + +#### Check Actual Resource Usage (GCP Batch) + +\`\`\`bash +# For GCP Cromwell jobs, get batch job details +gcloud batch jobs list --filter="status.state=FAILED" --format="table(name,status.state,createTime)" + +# Describe specific batch job +gcloud batch jobs describe --format=json | jq '{ + status: .status.state, + statusEvents: .status.statusEvents, + taskGroups: .taskGroups[0].taskSpec.computeResource +}' +\`\`\` + +#### Memory-Specific Checks + +\`\`\`bash +# Check if OOM (Out of Memory) killed the task +gsutil cat "\$STDERR_URL" 2>/dev/null | grep -i -E "oom|out of memory|killed|cannot allocate|memory" + +# Check what memory was requested in batch job +gcloud batch jobs describe --format=json | jq 
'.taskGroups[0].taskSpec.computeResource.memoryMib' + +# Check dmesg/syslog for OOM events (if available in logs) +gsutil cat "\$STDERR_URL" 2>/dev/null | grep -i "killed process" +\`\`\` + +--- + +### Step 6: Diagnose by Error Type + +#### Memory Issues (OOM) + +**Symptoms:** +- Exit code 137 (SIGKILL) or 143 +- "Killed" in stderr +- "Cannot allocate memory" +- Task succeeded locally but fails at scale + +**Diagnosis:** +\`\`\`bash +# Check requested memory +gcloud batch jobs describe --format=json | jq '.taskGroups[0].taskSpec.computeResource' + +# Look for memory errors in logs +gsutil cat "\$STDERR_URL" 2>/dev/null | grep -i -E "memory|oom|killed|malloc" +\`\`\` + +**Fix:** Increase \`memory\` in WDL runtime block: +\`\`\`wdl +runtime { + memory: "32G" # Increase from previous value +} +\`\`\` + +#### Disk Issues + +**Symptoms:** +- "No space left on device" +- "Disk quota exceeded" + +**Diagnosis:** +\`\`\`bash +gsutil cat "\$STDERR_URL" 2>/dev/null | grep -i -E "space|disk|quota" +\`\`\` + +**Fix:** Increase disk in WDL runtime: +\`\`\`wdl +runtime { + disks: "local-disk 200 SSD" # Increase size +} +\`\`\` + +#### Input File Issues + +**Symptoms:** +- "FileNotFoundException" +- "Localization failed" +- File not found errors + +**Diagnosis:** +\`\`\`bash +# Check if input files exist +wb workflow job describe --job= --format=json | jq -r '.inputs | to_entries[] | .value' | while read path; do + if [[ \$path == gs://* ]]; then + echo -n "\$path: " && gsutil ls "\$path" 2>&1 | head -1 + fi +done +\`\`\` + +#### Permission Issues + +**Symptoms:** +- "Permission denied" +- "Access denied" +- 403 errors + +**Diagnosis:** +\`\`\`bash +# Check service account permissions +gcloud batch jobs describe --format=json | jq '.taskGroups[0].taskSpec.serviceAccount' + +# Test bucket access +gsutil ls gs:/// 2>&1 | head -5 +\`\`\` + +--- + +### Step 7: Propose Solution + +Based on diagnosis, recommend one of: + +| Issue | Solution Template | +|-------|-------------------| +| 
**OOM** | "Increase memory from X to Y in the runtime block" | +| **Disk full** | "Increase disk size from X to Y GB" | +| **Missing input** | "Input file doesn't exist. Verify path: \`gsutil ls \`" | +| **Permission** | "Service account lacks access. Grant \`roles/storage.objectViewer\` on bucket" | +| **Timeout** | "Task exceeded time limit. Increase \`maxRetries\` or optimize task" | +| **Docker** | "Image pull failed. Verify image exists and is accessible" | + +**Re-run after fixing:** +\`\`\`bash +wb workflow job run --workflow= --inputs= +\`\`\` + +--- + +## Quick Reference + +### Essential Commands + +\`\`\`bash +# Failed jobs +wb workflow job list --format=json | jq '.[] | select(.status=="FAILED") | {id, workflowName}' + +# Job error +wb workflow job describe --job= --format=json | jq '.failureMessage' + +# Failed tasks +wb workflow job task list --job= --format=json | jq '.[] | select(.status=="FAILED") | .name' + +# Task logs +wb workflow job task describe --job= --task= --format=json | jq '.stderr' | xargs -I{} gsutil cat {} | tail -50 + +# Memory check +gcloud batch jobs describe --format=json | jq '.taskGroups[0].taskSpec.computeResource' +\`\`\` + +### Error → Cause → Fix + +| Exit Code | Meaning | Common Fix | +|-----------|---------|------------| +| 1 | General error | Check stderr for details | +| 2 | Misuse of command | Check script syntax | +| 126 | Permission problem | Check file permissions | +| 127 | Command not found | Check PATH, container image | +| 137 | SIGKILL (OOM) | **Increase memory** | +| 139 | Segfault | Check input data, memory | +| 143 | SIGTERM | Task timeout or preemption | + +--- + +## Workbench-Specific Notes + +- **Log retention:** Cromwell logs persist in workspace execution bucket +- **Batch jobs:** Each sub-job has independent logs; troubleshoot specific failed sub-job +- **VPC-SC:** Run \`gcloud batch\` commands from within workspace app +- **Preemption:** If using spot VMs, set \`preemptible: 0\` for reliability 
+WORKFLOW_SKILL_EOF } # Fetch workspace information @@ -1465,6 +1780,7 @@ When users ask about specific topics, **read these skill files** for detailed gu |-------|------------|-------------| | **🚨 Dashboards, HTML, Flask, Web UIs** | \`~/.workbench/skills/DASHBOARD_BUILDER.md\` | **READ THIS FIRST** for any: dashboard, chart, visualization, Flask app, Streamlit, HTML page, web UI, interactive display, Plotly, or anything running on a port | | Building custom apps | \`~/.workbench/skills/CUSTOM_APP.md\` | User wants to build a deployable app from scratch | +| **Troubleshoot failed workflows** | \`~/.workbench/skills/WORKFLOW_TROUBLESHOOT.md\` | Debug WDL/Nextflow failures, pull logs, check memory/disk, identify root cause | ### ⚡ Skill Trigger Guide @@ -1481,6 +1797,11 @@ When users ask about specific topics, **read these skill files** for detailed gu - "build a deployable app" / "create a custom app" - "API service" / "backend" / "from scratch" +**Read WORKFLOW_TROUBLESHOOT.md when:** +- "my workflow failed" / "workflow error" / "debug workflow" +- "job failed" / "task failed" / "out of memory" +- "check logs" / "why did it fail" / "troubleshoot" + --- ## Quick Reference (Machine-Readable) diff --git a/features/src/llm-context/skills/WORKFLOW_TROUBLESHOOT.md b/features/src/llm-context/skills/WORKFLOW_TROUBLESHOOT.md new file mode 100644 index 000000000..9263289ee --- /dev/null +++ b/features/src/llm-context/skills/WORKFLOW_TROUBLESHOOT.md @@ -0,0 +1,310 @@ +# WDL Workflow Troubleshooting Skill + +**Trigger:** User asks to troubleshoot, debug, or fix a failed workflow. + +--- + +## Quick Diagnosis (Start Here) + +```bash +# 1. Find failed jobs +wb workflow job list --format=json | jq -r '.[] | select(.status=="FAILED") | "\(.id)\t\(.workflowName)\t\(.startTime)"' + +# 2. Get error message (replace JOB_ID) +wb workflow job describe --job= --format=json | jq -r '.failureMessage // "No message"' + +# 3. 
Find failed task +wb workflow job task list --job= --format=json | jq -r '.[] | select(.status=="FAILED") | .name' + +# 4. Get task error + logs +wb workflow job task describe --job= --task= --format=json | jq '{stderr, stdout, exitCode, failureMessage}' +``` + +**After running these 4 commands, you'll know:** which job failed, why, which task, and where logs are. + +--- + +## Step-by-Step Guide + +### Step 1: Identify Failed Job + +```bash +# List all failed jobs +wb workflow job list --format=json | jq '.[] | select(.status == "FAILED") | {id, workflowName, status, startTime, endTime}' +``` + +**For batch jobs:** +```bash +# List failed sub-jobs within a batch +wb workflow job batch list --job= --format=json | jq '.[] | select(.status == "FAILED") | {id, status}' +``` + +**Ask user:** Confirm which job ID to investigate. + +--- + +### Step 2: Get Job Details & Inputs + +```bash +# Full job metadata +wb workflow job describe --job= --format=json +``` + +**Key fields to extract:** +```bash +# Error message +wb workflow job describe --job= --format=json | jq -r '.failureMessage' + +# Inputs used +wb workflow job describe --job= --format=json | jq '.inputs' + +# Outputs (if any) +wb workflow job describe --job= --format=json | jq '.outputs' +``` + +--- + +### Step 3: Find Failed Task & Get Logs + +```bash +# List all tasks with status +wb workflow job task list --job= --format=json | jq '.[] | {name, status, exitCode}' + +# Get failed task details +wb workflow job task describe --job= --task= --format=json +``` + +**Extract log URLs:** +```bash +# Get stderr and stdout URLs +TASK_INFO=$(wb workflow job task describe --job= --task= --format=json) +STDERR_URL=$(echo $TASK_INFO | jq -r '.stderr') +STDOUT_URL=$(echo $TASK_INFO | jq -r '.stdout') + +echo "stderr: $STDERR_URL" +echo "stdout: $STDOUT_URL" +``` + +--- + +### Step 4: Pull and Analyze Task Logs + +#### Read Log Contents + +```bash +# Read stderr (usually contains errors) +gsutil cat "$STDERR_URL" 2>/dev/null | 
tail -100 + +# Read stdout +gsutil cat "$STDOUT_URL" 2>/dev/null | tail -100 + +# Search for common error patterns +gsutil cat "$STDERR_URL" 2>/dev/null | grep -i -E "error|exception|failed|denied|killed|oom|memory|disk|timeout" | head -30 +``` + +#### Common Log File Patterns + +Cromwell execution logs are typically at: +``` +gs://///execution/ +├── stdout # Task standard output +├── stderr # Task standard error +├── script # The actual command that ran +├── rc # Return code (exit code) +└── script.submit # Submission script +``` + +**One-liner to read all execution files:** +```bash +# Find execution directory from task describe, then: +EXEC_DIR=$(echo $TASK_INFO | jq -r '.executionDirectory // empty') +if [ -n "$EXEC_DIR" ]; then + echo "=== script ===" && gsutil cat "$EXEC_DIR/script" 2>/dev/null + echo "=== rc ===" && gsutil cat "$EXEC_DIR/rc" 2>/dev/null + echo "=== stderr (last 50 lines) ===" && gsutil cat "$EXEC_DIR/stderr" 2>/dev/null | tail -50 +fi +``` + +--- + +### Step 5: Check Resource Allocation & Usage + +#### What Was Requested (from WDL runtime) + +```bash +# Get workflow definition to see runtime requirements +wb workflow describe --workflow= --format=json | jq '.sourceUrl' + +# Read WDL file +gsutil cat gs:////workflow.wdl | grep -A10 "runtime {" +``` + +#### Check Actual Resource Usage (GCP Batch) + +```bash +# For GCP Cromwell jobs, get batch job details +gcloud batch jobs list --filter="status.state=FAILED" --format="table(name,status.state,createTime)" + +# Describe specific batch job +gcloud batch jobs describe --format=json | jq '{ + status: .status.state, + statusEvents: .status.statusEvents, + taskGroups: .taskGroups[0].taskSpec.computeResource +}' +``` + +#### Memory-Specific Checks + +```bash +# Check if OOM (Out of Memory) killed the task +gsutil cat "$STDERR_URL" 2>/dev/null | grep -i -E "oom|out of memory|killed|cannot allocate|memory" + +# Check what memory was requested in batch job +gcloud batch jobs describe --format=json | jq 
'.taskGroups[0].taskSpec.computeResource.memoryMib' + +# Check dmesg/syslog for OOM events (if available in logs) +gsutil cat "$STDERR_URL" 2>/dev/null | grep -i "killed process" +``` + +--- + +### Step 6: Diagnose by Error Type + +#### Memory Issues (OOM) + +**Symptoms:** +- Exit code 137 (SIGKILL) or 143 +- "Killed" in stderr +- "Cannot allocate memory" +- Task succeeded locally but fails at scale + +**Diagnosis:** +```bash +# Check requested memory +gcloud batch jobs describe --format=json | jq '.taskGroups[0].taskSpec.computeResource' + +# Look for memory errors in logs +gsutil cat "$STDERR_URL" 2>/dev/null | grep -i -E "memory|oom|killed|malloc" +``` + +**Fix:** Increase `memory` in WDL runtime block: +```wdl +runtime { + memory: "32G" # Increase from previous value +} +``` + +#### Disk Issues + +**Symptoms:** +- "No space left on device" +- "Disk quota exceeded" + +**Diagnosis:** +```bash +gsutil cat "$STDERR_URL" 2>/dev/null | grep -i -E "space|disk|quota" +``` + +**Fix:** Increase disk in WDL runtime: +```wdl +runtime { + disks: "local-disk 200 SSD" # Increase size +} +``` + +#### Input File Issues + +**Symptoms:** +- "FileNotFoundException" +- "Localization failed" +- File not found errors + +**Diagnosis:** +```bash +# Check if input files exist +wb workflow job describe --job= --format=json | jq -r '.inputs | to_entries[] | .value' | while read path; do + if [[ $path == gs://* ]]; then + echo -n "$path: " && gsutil ls "$path" 2>&1 | head -1 + fi +done +``` + +#### Permission Issues + +**Symptoms:** +- "Permission denied" +- "Access denied" +- 403 errors + +**Diagnosis:** +```bash +# Check service account permissions +gcloud batch jobs describe --format=json | jq '.taskGroups[0].taskSpec.serviceAccount' + +# Test bucket access +gsutil ls gs:/// 2>&1 | head -5 +``` + +--- + +### Step 7: Propose Solution + +Based on diagnosis, recommend one of: + +| Issue | Solution Template | +|-------|-------------------| +| **OOM** | "Increase memory from X to Y in the 
runtime block" | +| **Disk full** | "Increase disk size from X to Y GB" | +| **Missing input** | "Input file doesn't exist. Verify path: `gsutil ls `" | +| **Permission** | "Service account lacks access. Grant `roles/storage.objectViewer` on bucket" | +| **Timeout** | "Task exceeded time limit. Increase `maxRetries` or optimize task" | +| **Docker** | "Image pull failed. Verify image exists and is accessible" | + +**Re-run after fixing:** +```bash +wb workflow job run --workflow= --inputs= +``` + +--- + +## Quick Reference + +### Essential Commands + +```bash +# Failed jobs +wb workflow job list --format=json | jq '.[] | select(.status=="FAILED") | {id, workflowName}' + +# Job error +wb workflow job describe --job= --format=json | jq '.failureMessage' + +# Failed tasks +wb workflow job task list --job= --format=json | jq '.[] | select(.status=="FAILED") | .name' + +# Task logs +wb workflow job task describe --job= --task= --format=json | jq '.stderr' | xargs -I{} gsutil cat {} | tail -50 + +# Memory check +gcloud batch jobs describe --format=json | jq '.taskGroups[0].taskSpec.computeResource' +``` + +### Error → Cause → Fix + +| Exit Code | Meaning | Common Fix | +|-----------|---------|------------| +| 1 | General error | Check stderr for details | +| 2 | Misuse of command | Check script syntax | +| 126 | Permission problem | Check file permissions | +| 127 | Command not found | Check PATH, container image | +| 137 | SIGKILL (OOM) | **Increase memory** | +| 139 | Segfault | Check input data, memory | +| 143 | SIGTERM | Task timeout or preemption | + +--- + +## Workbench-Specific Notes + +- **Log retention:** Cromwell logs persist in workspace execution bucket +- **Batch jobs:** Each sub-job has independent logs; troubleshoot specific failed sub-job +- **VPC-SC:** Run `gcloud batch` commands from within workspace app +- **Preemption:** If using spot VMs, set `preemptible: 0` for reliability From ac70887e69d67316e0db9627664a3d4a47af7988 Mon Sep 17 00:00:00 2001 
From: aculotti-verily Date: Tue, 17 Mar 2026 13:53:08 -0400 Subject: [PATCH 39/86] Make WORKFLOW_TROUBLESHOOT proactive after job confirmation - Ask user to confirm job ID when multiple failed jobs exist - Once confirmed, run all diagnostics automatically (no asking) - Report findings with evidence, propose fix, then ask for guidance --- features/src/llm-context/generate-context.sh | 14 +++++++++++++- .../llm-context/skills/WORKFLOW_TROUBLESHOOT.md | 14 +++++++++++++- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/features/src/llm-context/generate-context.sh b/features/src/llm-context/generate-context.sh index a611ca908..11ffe906a 100755 --- a/features/src/llm-context/generate-context.sh +++ b/features/src/llm-context/generate-context.sh @@ -828,6 +828,18 @@ DASHBOARD_SKILL_EOF **Trigger:** User asks to troubleshoot, debug, or fix a failed workflow. +## ⚡ LLM Behavior: Be Proactive! + +**Once the user confirms which job to investigate, DO NOT ask which diagnostic steps to run.** Instead: +1. **Run all diagnostic commands automatically** (Steps 2-4 at minimum) +2. **Analyze the results** and identify the root cause +3. **Report your diagnosis** with evidence (error messages, exit codes, log snippets) +4. **Propose a fix** with specific changes +5. **THEN ask** if they want you to apply the fix or investigate further + +❌ Don't say: "Would you like me to check the logs?" +✅ Do say: "I checked the logs and found an OOM error. The task requested 8GB but needed more. I recommend increasing memory to 16GB in the runtime block." + --- ## Quick Diagnosis (Start Here) @@ -865,7 +877,7 @@ wb workflow job list --format=json | jq '.[] | select(.status == "FAILED") | {id wb workflow job batch list --job= --format=json | jq '.[] | select(.status == "FAILED") | {id, status}' \`\`\` -**Ask user:** Confirm which job ID to investigate. +**Ask user:** Confirm which job ID to investigate (if multiple failed jobs). 
--- diff --git a/features/src/llm-context/skills/WORKFLOW_TROUBLESHOOT.md b/features/src/llm-context/skills/WORKFLOW_TROUBLESHOOT.md index 9263289ee..213e2d7c5 100644 --- a/features/src/llm-context/skills/WORKFLOW_TROUBLESHOOT.md +++ b/features/src/llm-context/skills/WORKFLOW_TROUBLESHOOT.md @@ -2,6 +2,18 @@ **Trigger:** User asks to troubleshoot, debug, or fix a failed workflow. +## ⚡ LLM Behavior: Be Proactive! + +**Once the user confirms which job to investigate, DO NOT ask which diagnostic steps to run.** Instead: +1. **Run all diagnostic commands automatically** (Steps 2-4 at minimum) +2. **Analyze the results** and identify the root cause +3. **Report your diagnosis** with evidence (error messages, exit codes, log snippets) +4. **Propose a fix** with specific changes +5. **THEN ask** if they want you to apply the fix or investigate further + +❌ Don't say: "Would you like me to check the logs?" +✅ Do say: "I checked the logs and found an OOM error. The task requested 8GB but needed more. I recommend increasing memory to 16GB in the runtime block." + --- ## Quick Diagnosis (Start Here) @@ -39,7 +51,7 @@ wb workflow job list --format=json | jq '.[] | select(.status == "FAILED") | {id wb workflow job batch list --job= --format=json | jq '.[] | select(.status == "FAILED") | {id, status}' ``` -**Ask user:** Confirm which job ID to investigate. +**Ask user:** Confirm which job ID to investigate (if multiple failed jobs). 
--- From 577ef62521912d5622da868c12d83bdd204b564c Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Tue, 17 Mar 2026 15:18:12 -0400 Subject: [PATCH 40/86] Remove references to private templates repo --- features/src/llm-context/generate-context.sh | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/features/src/llm-context/generate-context.sh b/features/src/llm-context/generate-context.sh index 11ffe906a..39fac3132 100755 --- a/features/src/llm-context/generate-context.sh +++ b/features/src/llm-context/generate-context.sh @@ -402,25 +402,9 @@ SKILL_EOF --- -## Template Location - -All templates are at: -``` -https://github.com/aculotti-verily/wb-app-mcp-and-context/tree/templates-only/src/templates/ -``` - ---- - ## How to Use a Template -### Option 1: Deploy Directly -``` -Repository: https://github.com/aculotti-verily/wb-app-mcp-and-context.git -Branch: templates-only -Folder: src/templates/ -``` - -### Option 2: Copy and Customize +### Copy and Customize 1. Copy the template folder to user's repo 2. Modify application code in `app/` 3. 
Update `devcontainer-template.json` with new name/description @@ -1779,7 +1763,6 @@ file:///home/jupyter/dashboard.html ← JavaScript blocked - Deployment checklist ### Quick Reference -- **Templates**: https://github.com/aculotti-verily/wb-app-mcp-and-context/tree/templates-only/src/templates/ - **Full-featured apps**: https://github.com/verily-src/workbench-app-devcontainers --- From ece493af9ccb3cff14365f9c47bc6c8f04aa173e Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Tue, 17 Mar 2026 15:33:06 -0400 Subject: [PATCH 41/86] Add scientific skills for pharma/biotech research Hierarchical skill structure: - SKILL_INDEX.md - Master navigation for all skills - scientific/BIOINFORMATICS.md - scanpy, anndata, pydeseq2, biopython, scvelo - scientific/DRUG_DISCOVERY.md - rdkit, deepchem, chembl, drugbank, opentargets - scientific/GENOMICS_DATABASES.md - ensembl, uniprot, clinvar, pdb - scientific/DATA_ANALYSIS.md - sklearn, statsmodels, plotly, seaborn - scientific/CLINICAL.md - clinicaltrials.gov, pubmed, lifelines Based on claude-scientific-skills repo, consolidated into 5 domain indexes for efficient context loading. Updated CLAUDE.md template with skill triggers. 
--- features/src/llm-context/generate-context.sh | 378 +++++++++++++++++- .../src/llm-context/skills/SKILL_INDEX.md | 81 ++++ .../skills/scientific/BIOINFORMATICS.md | 212 ++++++++++ .../llm-context/skills/scientific/CLINICAL.md | 251 ++++++++++++ .../skills/scientific/DATA_ANALYSIS.md | 312 +++++++++++++++ .../skills/scientific/DRUG_DISCOVERY.md | 244 +++++++++++ .../skills/scientific/GENOMICS_DATABASES.md | 251 ++++++++++++ 7 files changed, 1725 insertions(+), 4 deletions(-) create mode 100644 features/src/llm-context/skills/SKILL_INDEX.md create mode 100644 features/src/llm-context/skills/scientific/BIOINFORMATICS.md create mode 100644 features/src/llm-context/skills/scientific/CLINICAL.md create mode 100644 features/src/llm-context/skills/scientific/DATA_ANALYSIS.md create mode 100644 features/src/llm-context/skills/scientific/DRUG_DISCOVERY.md create mode 100644 features/src/llm-context/skills/scientific/GENOMICS_DATABASES.md diff --git a/features/src/llm-context/generate-context.sh b/features/src/llm-context/generate-context.sh index 39fac3132..3a435a4e8 100755 --- a/features/src/llm-context/generate-context.sh +++ b/features/src/llm-context/generate-context.sh @@ -1131,6 +1131,349 @@ gcloud batch jobs describe --format=json | jq '.taskGroups[0].taskSp - **VPC-SC:** Run \`gcloud batch\` commands from within workspace app - **Preemption:** If using spot VMs, set \`preemptible: 0\` for reliability WORKFLOW_SKILL_EOF + + # Create scientific skills directory and index + log_info "Creating scientific skills..." + mkdir -p "${SKILLS_DIR}/scientific" + + # Create SKILL_INDEX.md + cat > "${SKILLS_DIR}/SKILL_INDEX.md" << 'SKILL_INDEX_EOF' +# Skill Index + +**Read this file first to navigate available skills.** + +--- + +## ⚡ Quick Navigation + +| User Says... 
| Read This Skill | +|--------------|-----------------| +| "workflow failed" / "debug workflow" | `WORKFLOW_TROUBLESHOOT.md` | +| "create dashboard" / "visualize" / "Flask" | `DASHBOARD_BUILDER.md` | +| "create app" / "deploy app" | `CUSTOM_APP.md` | +| "single-cell" / "RNA-seq" / "scanpy" | `scientific/BIOINFORMATICS.md` | +| "molecule" / "drug" / "RDKit" / "ChEMBL" | `scientific/DRUG_DISCOVERY.md` | +| "gene" / "protein" / "variant" / "UniProt" | `scientific/GENOMICS_DATABASES.md` | +| "statistics" / "ML" / "plot" / "sklearn" | `scientific/DATA_ANALYSIS.md` | +| "clinical trial" / "PubMed" / "literature" | `scientific/CLINICAL.md` | + +--- + +## Workbench Skills + +| Skill | File | Description | +|-------|------|-------------| +| **Workflow Troubleshooting** | `WORKFLOW_TROUBLESHOOT.md` | Debug failed WDL/Nextflow workflows | +| **Dashboard Builder** | `DASHBOARD_BUILDER.md` | Create web apps, Flask, Streamlit | +| **Custom App** | `CUSTOM_APP.md` | Build deployable Workbench apps | + +--- + +## Scientific Skills + +### 🧬 Bioinformatics (`scientific/BIOINFORMATICS.md`) +Single-cell analysis, differential expression, sequence analysis, RNA velocity. +**Packages:** scanpy, anndata, biopython, pydeseq2, scvelo + +### 💊 Drug Discovery (`scientific/DRUG_DISCOVERY.md`) +Cheminformatics, molecular ML, bioactivity databases, target identification. +**Packages/APIs:** rdkit, deepchem, chembl, drugbank, opentargets + +### 🔬 Genomics Databases (`scientific/GENOMICS_DATABASES.md`) +Gene annotations, protein data, variant interpretation, 3D structures. +**APIs:** ensembl, uniprot, clinvar, pdb + +### 📊 Data Analysis (`scientific/DATA_ANALYSIS.md`) +Machine learning, statistics, visualization. +**Packages:** scikit-learn, statsmodels, plotly, seaborn + +### 🏥 Clinical (`scientific/CLINICAL.md`) +Clinical trials, literature search, survival analysis. 
+**APIs:** clinicaltrials.gov, pubmed +SKILL_INDEX_EOF + + # Create BIOINFORMATICS.md + cat > "${SKILLS_DIR}/scientific/BIOINFORMATICS.md" << 'BIOINFO_EOF' +# Bioinformatics Skills + +**Trigger:** Single-cell, RNA-seq, sequences, differential expression, trajectory. + +## Quick Reference +| Task | Package | Import | +|------|---------|--------| +| Single-cell workflow | scanpy | `import scanpy as sc` | +| Differential expression | pydeseq2 | `from pydeseq2.dds import DeseqDataSet` | +| Sequence analysis | biopython | `from Bio import SeqIO` | +| RNA velocity | scvelo | `import scvelo as scv` | + +## Scanpy Workflow +```python +import scanpy as sc +adata = sc.read_h5ad('data.h5ad') +sc.pp.calculate_qc_metrics(adata, inplace=True) +sc.pp.normalize_total(adata, target_sum=1e4) +sc.pp.log1p(adata) +sc.pp.highly_variable_genes(adata, n_top_genes=2000) +sc.tl.pca(adata) +sc.pp.neighbors(adata) +sc.tl.umap(adata) +sc.tl.leiden(adata) +sc.tl.rank_genes_groups(adata, 'leiden') +sc.pl.umap(adata, color='leiden') +``` + +## PyDESeq2 (Differential Expression) +```python +from pydeseq2.dds import DeseqDataSet +from pydeseq2.ds import DeseqStats +dds = DeseqDataSet(counts=counts.T, metadata=metadata, design_factors='condition') +dds.deseq2() +stat_res = DeseqStats(dds, contrast=['condition', 'treated', 'control']) +results = stat_res.results_df +sig = results[(results['padj'] < 0.05) & (abs(results['log2FoldChange']) > 1)] +``` + +## Biopython +```python +from Bio import SeqIO, Entrez +Entrez.email = "email@example.com" +# Parse FASTA +for record in SeqIO.parse('seq.fasta', 'fasta'): + print(record.id, len(record.seq)) +# NCBI fetch +handle = Entrez.efetch(db="nucleotide", id="NM_001301717", rettype="fasta") +``` + +Install: `pip install scanpy anndata pydeseq2 biopython scvelo` +BIOINFO_EOF + + # Create DRUG_DISCOVERY.md + cat > "${SKILLS_DIR}/scientific/DRUG_DISCOVERY.md" << 'DRUGDISC_EOF' +# Drug Discovery Skills + +**Trigger:** Molecules, SMILES, drugs, fingerprints, ADMET,
targets, bioactivity. + +## Quick Reference +| Task | Tool | Access | +|------|------|--------| +| Molecular properties | rdkit | `from rdkit import Chem` | +| ADMET prediction | deepchem | `import deepchem as dc` | +| Bioactivity (IC50, Ki) | ChEMBL | REST API | +| Drug info | DrugBank | REST API | +| Target-disease | Open Targets | GraphQL | + +## RDKit +```python +from rdkit import Chem +from rdkit.Chem import Descriptors, AllChem, DataStructs + +mol = Chem.MolFromSmiles('CC(=O)OC1=CC=CC=C1C(=O)O') # Aspirin +mw = Descriptors.MolWt(mol) +logp = Descriptors.MolLogP(mol) +hbd = Descriptors.NumHDonors(mol) +hba = Descriptors.NumHAcceptors(mol) + +# Fingerprint similarity +fp1 = AllChem.GetMorganFingerprintAsBitVect(mol1, radius=2) +fp2 = AllChem.GetMorganFingerprintAsBitVect(mol2, radius=2) +similarity = DataStructs.TanimotoSimilarity(fp1, fp2) +``` + +## ChEMBL API +```python +from chembl_webresource_client.new_client import new_client +molecule = new_client.molecule +activity = new_client.activity +# Search compound +aspirin = molecule.filter(pref_name__iexact='aspirin')[0] +# Get activities for target +acts = activity.filter(target_chembl_id='CHEMBL230', pchembl_value__gte=6) +``` + +## Open Targets API +```python +import requests +query = '''query { target(ensemblId: "ENSG00000157764") { + approvedSymbol + associatedDiseases { rows { disease { name } score } } +}}''' +r = requests.post("https://api.platform.opentargets.org/api/v4/graphql", json={'query': query}) +``` + +Install: `pip install rdkit deepchem chembl_webresource_client` +DRUGDISC_EOF + + # Create GENOMICS_DATABASES.md + cat > "${SKILLS_DIR}/scientific/GENOMICS_DATABASES.md" << 'GENOMICS_EOF' +# Genomics Databases Skills + +**Trigger:** Genes, proteins, variants, structures, Ensembl, UniProt, ClinVar, PDB. 
+ +## Quick Reference +| Need | Database | API | +|------|----------|-----| +| Gene annotations | Ensembl | REST | +| Protein data | UniProt | REST | +| Variant pathogenicity | ClinVar | E-utilities | +| 3D structures | PDB | REST | + +## Ensembl +```python +import requests +SERVER = "https://rest.ensembl.org" +# Gene lookup +gene = requests.get(f"{SERVER}/lookup/symbol/homo_sapiens/BRCA1", + headers={"Content-Type": "application/json"}).json() +# Sequence +seq = requests.get(f"{SERVER}/sequence/id/{gene['id']}").json() +``` + +## UniProt +```python +import requests +# Search protein +r = requests.get("https://rest.uniprot.org/uniprotkb/search", + params={"query": "gene:TP53 AND organism_id:9606", "format": "json"}) +# Get by ID +protein = requests.get("https://rest.uniprot.org/uniprotkb/P04637.json").json() +``` + +## ClinVar +```python +from Bio import Entrez +Entrez.email = "email@example.com" +handle = Entrez.esearch(db="clinvar", term="BRCA1[gene] AND pathogenic[clinsig]") +record = Entrez.read(handle) +``` + +## PDB +```python +import requests +# Get structure +structure = requests.get("https://data.rcsb.org/rest/v1/core/entry/1TUP").json() +# Download PDB file +pdb = requests.get("https://files.rcsb.org/download/1TUP.pdb").text +``` + +Install: `pip install biopython requests` +GENOMICS_EOF + + # Create DATA_ANALYSIS.md + cat > "${SKILLS_DIR}/scientific/DATA_ANALYSIS.md" << 'DATAANALYSIS_EOF' +# Data Analysis Skills + +**Trigger:** ML, statistics, visualization, sklearn, regression, clustering, plots. 
+ +## Quick Reference +| Task | Package | Import | +|------|---------|--------| +| ML models | scikit-learn | `from sklearn.ensemble import RandomForestClassifier` | +| Statistics | statsmodels | `import statsmodels.api as sm` | +| Interactive plots | plotly | `import plotly.express as px` | +| Statistical plots | seaborn | `import seaborn as sns` | + +## Scikit-learn +```python +from sklearn.model_selection import train_test_split, cross_val_score +from sklearn.ensemble import RandomForestClassifier +from sklearn.metrics import classification_report + +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) +model = RandomForestClassifier(n_estimators=100) +model.fit(X_train, y_train) +print(classification_report(y_test, model.predict(X_test))) +cv_scores = cross_val_score(model, X, y, cv=5) +``` + +## Statsmodels +```python +import statsmodels.api as sm +X_const = sm.add_constant(X) +model = sm.OLS(y, X_const).fit() +print(model.summary()) # Full regression output with p-values +``` + +## Plotly +```python +import plotly.express as px +fig = px.scatter(df, x='x', y='y', color='category', hover_data=['name']) +fig.show() +fig = px.histogram(df, x='value', color='group') +fig = px.box(df, x='category', y='value') +``` + +## Seaborn +```python +import seaborn as sns +import matplotlib.pyplot as plt +sns.boxplot(data=df, x='category', y='value', hue='group') +sns.heatmap(df.corr(), annot=True, cmap='coolwarm') +sns.pairplot(df, hue='category') +plt.savefig('plot.png', dpi=300) +``` + +Install: `pip install scikit-learn statsmodels plotly seaborn` +DATAANALYSIS_EOF + + # Create CLINICAL.md + cat > "${SKILLS_DIR}/scientific/CLINICAL.md" << 'CLINICAL_EOF' +# Clinical Skills + +**Trigger:** Clinical trials, PubMed, literature, survival analysis. 
+ +## Quick Reference +| Task | Source | Access | +|------|--------|--------| +| Clinical trials | ClinicalTrials.gov | REST API | +| Literature | PubMed | E-utilities | +| Survival analysis | lifelines | Python | + +## ClinicalTrials.gov API +```python +import requests +BASE = "https://clinicaltrials.gov/api/v2" +# Search trials +r = requests.get(f"{BASE}/studies", params={ + "query.cond": "breast cancer", + "query.intr": "pembrolizumab", + "filter.overallStatus": "RECRUITING" +}) +for study in r.json()['studies']: + info = study['protocolSection']['identificationModule'] + print(f"{info['nctId']}: {info['briefTitle']}") +``` + +## PubMed +```python +from Bio import Entrez +Entrez.email = "email@example.com" +handle = Entrez.esearch(db="pubmed", term="CRISPR cancer[Title/Abstract]", retmax=20) +pmids = Entrez.read(handle)['IdList'] +handle = Entrez.efetch(db="pubmed", id=pmids, rettype="abstract") +print(handle.read()) +``` + +## Survival Analysis (lifelines) +```python +from lifelines import KaplanMeierFitter, CoxPHFitter +from lifelines.statistics import logrank_test + +kmf = KaplanMeierFitter() +kmf.fit(durations, events, label='Survival') +kmf.plot_survival_function() + +# Compare groups +results = logrank_test(dur1, dur2, ev1, ev2) +print(f"p-value: {results.p_value:.4f}") + +# Cox regression +cph = CoxPHFitter() +cph.fit(df, duration_col='time', event_col='event') +cph.print_summary() +``` + +Install: `pip install biopython requests lifelines` +CLINICAL_EOF } # Fetch workspace information @@ -1769,13 +2112,25 @@ file:///home/jupyter/dashboard.html ← JavaScript blocked ## Available Skills -When users ask about specific topics, **read these skill files** for detailed guidance: +> **📚 Read \`~/.workbench/skills/SKILL_INDEX.md\` first** to navigate all available skills. 
+ +### Workbench Skills | Topic | Skill File | When to Use | |-------|------------|-------------| -| **🚨 Dashboards, HTML, Flask, Web UIs** | \`~/.workbench/skills/DASHBOARD_BUILDER.md\` | **READ THIS FIRST** for any: dashboard, chart, visualization, Flask app, Streamlit, HTML page, web UI, interactive display, Plotly, or anything running on a port | -| Building custom apps | \`~/.workbench/skills/CUSTOM_APP.md\` | User wants to build a deployable app from scratch | -| **Troubleshoot failed workflows** | \`~/.workbench/skills/WORKFLOW_TROUBLESHOOT.md\` | Debug WDL/Nextflow failures, pull logs, check memory/disk, identify root cause | +| **🚨 Dashboards, Web UIs** | \`DASHBOARD_BUILDER.md\` | Dashboard, Flask, Streamlit, web UI, plots on a port | +| Building custom apps | \`CUSTOM_APP.md\` | Deployable Workbench apps | +| **Workflow debugging** | \`WORKFLOW_TROUBLESHOOT.md\` | Failed WDL/Nextflow, logs, memory/disk issues | + +### Scientific Skills + +| Domain | Skill File | Covers | +|--------|------------|--------| +| 🧬 Bioinformatics | \`scientific/BIOINFORMATICS.md\` | scanpy, anndata, pydeseq2, biopython, scvelo | +| 💊 Drug Discovery | \`scientific/DRUG_DISCOVERY.md\` | rdkit, deepchem, chembl, drugbank, opentargets | +| 🔬 Genomics DBs | \`scientific/GENOMICS_DATABASES.md\` | ensembl, uniprot, clinvar, pdb | +| 📊 Data Analysis | \`scientific/DATA_ANALYSIS.md\` | sklearn, statsmodels, plotly, seaborn | +| 🏥 Clinical | \`scientific/CLINICAL.md\` | clinicaltrials.gov, pubmed, lifelines | ### ⚡ Skill Trigger Guide @@ -1797,6 +2152,21 @@ When users ask about specific topics, **read these skill files** for detailed gu - "job failed" / "task failed" / "out of memory" - "check logs" / "why did it fail" / "troubleshoot" +**Read scientific/BIOINFORMATICS.md when:** +- "single-cell" / "RNA-seq" / "scanpy" / "differential expression" + +**Read scientific/DRUG_DISCOVERY.md when:** +- "molecule" / "SMILES" / "drug" / "RDKit" / "ChEMBL" / "target" + +**Read 
scientific/GENOMICS_DATABASES.md when:** +- "gene" / "protein" / "variant" / "UniProt" / "Ensembl" / "PDB" + +**Read scientific/DATA_ANALYSIS.md when:** +- "machine learning" / "sklearn" / "statistics" / "plot" + +**Read scientific/CLINICAL.md when:** +- "clinical trial" / "PubMed" / "survival analysis" + --- ## Quick Reference (Machine-Readable) diff --git a/features/src/llm-context/skills/SKILL_INDEX.md b/features/src/llm-context/skills/SKILL_INDEX.md new file mode 100644 index 000000000..7e658d35b --- /dev/null +++ b/features/src/llm-context/skills/SKILL_INDEX.md @@ -0,0 +1,81 @@ +# Skill Index + +**Read this file first to navigate available skills.** + +--- + +## ⚡ Quick Navigation + +| User Says... | Read This Skill | +|--------------|-----------------| +| "workflow failed" / "debug workflow" | `WORKFLOW_TROUBLESHOOT.md` | +| "create dashboard" / "visualize" / "Flask" | `DASHBOARD_BUILDER.md` | +| "create app" / "deploy app" | `CUSTOM_APP.md` | +| "single-cell" / "RNA-seq" / "scanpy" | `scientific/BIOINFORMATICS.md` | +| "molecule" / "drug" / "RDKit" / "ChEMBL" | `scientific/DRUG_DISCOVERY.md` | +| "gene" / "protein" / "variant" / "UniProt" | `scientific/GENOMICS_DATABASES.md` | +| "statistics" / "ML" / "plot" / "sklearn" | `scientific/DATA_ANALYSIS.md` | +| "clinical trial" / "PubMed" / "literature" | `scientific/CLINICAL.md` | + +--- + +## Workbench Skills + +Core skills for working within Verily Workbench: + +| Skill | File | Description | +|-------|------|-------------| +| **Workflow Troubleshooting** | `WORKFLOW_TROUBLESHOOT.md` | Debug failed WDL/Nextflow workflows | +| **Dashboard Builder** | `DASHBOARD_BUILDER.md` | Create web apps, Flask, Streamlit | +| **Custom App** | `CUSTOM_APP.md` | Build deployable Workbench apps | + +--- + +## Scientific Skills + +Domain-specific skills for pharma/biotech research: + +### 🧬 Bioinformatics (`scientific/BIOINFORMATICS.md`) +Single-cell analysis, differential expression, sequence analysis, RNA velocity. 
+ +**Packages:** scanpy, anndata, biopython, pydeseq2, scvelo + +### 💊 Drug Discovery (`scientific/DRUG_DISCOVERY.md`) +Cheminformatics, molecular ML, bioactivity databases, target identification. + +**Packages/APIs:** rdkit, deepchem, chembl, drugbank, opentargets + +### 🔬 Genomics Databases (`scientific/GENOMICS_DATABASES.md`) +Gene annotations, protein data, variant interpretation, 3D structures. + +**APIs:** ensembl, uniprot, clinvar, pdb + +### 📊 Data Analysis (`scientific/DATA_ANALYSIS.md`) +Machine learning, statistics, visualization. + +**Packages:** scikit-learn, statsmodels, plotly, seaborn + +### 🏥 Clinical (`scientific/CLINICAL.md`) +Clinical trials, literature search, survival analysis. + +**APIs:** clinicaltrials.gov, pubmed + +--- + +## How to Use Skills + +1. **Claude reads this index first** when you ask a scientific question +2. **Claude then reads the relevant domain index** (e.g., `BIOINFORMATICS.md`) +3. **Domain indexes link to detailed skill files** when needed + +This hierarchy prevents context overload while ensuring Claude finds the right guidance. + +--- + +## Adding New Skills + +To add skills from [claude-scientific-skills](https://github.com/K-Dense-AI/claude-scientific-skills): + +1. Copy the `SKILL.md` file to `scientific/<SKILL_NAME>.md` +2. Add an entry to the relevant domain index +3. Update this index if adding a new category diff --git a/features/src/llm-context/skills/scientific/BIOINFORMATICS.md b/features/src/llm-context/skills/scientific/BIOINFORMATICS.md new file mode 100644 index 000000000..e8ce0da95 --- /dev/null +++ b/features/src/llm-context/skills/scientific/BIOINFORMATICS.md @@ -0,0 +1,212 @@ +# Bioinformatics Skills + +**Trigger:** User asks about single-cell analysis, RNA-seq, sequences, differential expression, or trajectory analysis.
+ +--- + +## Quick Reference + +| Task | Package | Quick Command | +|------|---------|---------------| +| Single-cell workflow | `scanpy` | `import scanpy as sc; adata = sc.read_h5ad('data.h5ad')` | +| Differential expression | `pydeseq2` | `from pydeseq2.dds import DeseqDataSet` | +| Sequence analysis | `biopython` | `from Bio import SeqIO` | +| RNA velocity | `scvelo` | `import scvelo as scv` | + +--- + +## Scanpy (Single-Cell Analysis) + +**Use for:** QC, normalization, PCA/UMAP, clustering, marker genes, cell type annotation. + +### Standard Workflow + +```python +import scanpy as sc + +# Load data +adata = sc.read_h5ad('data.h5ad') # or sc.read_10x_mtx('filtered_feature_bc_matrix/') + +adata.var['mt'] = adata.var_names.str.startswith('MT-') # QC: flag mitochondrial genes (human 'MT-' prefix) +sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True) +adata = adata[adata.obs['total_counts'] > 500] +adata = adata[adata.obs['pct_counts_mt'] < 20] + +# Normalize & log transform +sc.pp.normalize_total(adata, target_sum=1e4) +sc.pp.log1p(adata) + +# Find variable genes +sc.pp.highly_variable_genes(adata, n_top_genes=2000) +adata = adata[:, adata.var.highly_variable] + +# PCA, neighbors, UMAP, clustering +sc.tl.pca(adata) +sc.pp.neighbors(adata, n_pcs=30) +sc.tl.umap(adata) +sc.tl.leiden(adata, resolution=0.5) + +# Marker genes +sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon') +sc.pl.rank_genes_groups(adata, n_genes=10) + +# Visualization +sc.pl.umap(adata, color=['leiden', 'gene_of_interest']) +``` + +### Common File Formats +- `.h5ad` - AnnData format (standard) +- 10X Genomics: `filtered_feature_bc_matrix/` +- CSV: `sc.read_csv('counts.csv')` + +--- + +## AnnData (Data Structure) + +**Use for:** Creating, manipulating, and saving single-cell datasets.
+ +```python +import anndata as ad +import pandas as pd +import numpy as np + +# Create from scratch +adata = ad.AnnData( + X=count_matrix, # cells x genes + obs=cell_metadata_df, # cell annotations + var=gene_metadata_df # gene annotations +) + +# Key attributes +adata.X # Expression matrix +adata.obs # Cell metadata (DataFrame) +adata.var # Gene metadata (DataFrame) +adata.obsm['X_umap'] # Embeddings +adata.uns # Unstructured data + +# Subset +adata_subset = adata[adata.obs['cell_type'] == 'T cell', :] +adata_subset = adata[:, adata.var['highly_variable']] + +# Save/load +adata.write('output.h5ad') +adata = ad.read_h5ad('output.h5ad') + +# Concatenate datasets +adata_combined = ad.concat([adata1, adata2], join='outer') +``` + +--- + +## PyDESeq2 (Differential Expression) + +**Use for:** Bulk RNA-seq differential expression analysis. + +```python +import pandas as pd +from pydeseq2.dds import DeseqDataSet +from pydeseq2.ds import DeseqStats + +# Load count matrix (genes x samples) and metadata +counts = pd.read_csv('counts.csv', index_col=0) +metadata = pd.read_csv('metadata.csv', index_col=0) + +# Ensure sample order matches +counts = counts[metadata.index] + +# Create DESeq dataset +dds = DeseqDataSet( + counts=counts.T, # samples x genes + metadata=metadata, + design_factors='condition' # column in metadata +) + +# Run DESeq +dds.deseq2() + +# Get results +stat_res = DeseqStats(dds, contrast=['condition', 'treated', 'control']) +stat_res.summary() +results_df = stat_res.results_df + +# Filter significant genes +sig_genes = results_df[(results_df['padj'] < 0.05) & (abs(results_df['log2FoldChange']) > 1)] +``` + +--- + +## Biopython (Sequence Analysis) + +**Use for:** FASTA/GenBank parsing, BLAST, sequence manipulation, NCBI access. 
+ +```python +from Bio import SeqIO, Entrez +from Bio.Seq import Seq + +# Parse FASTA +for record in SeqIO.parse('sequences.fasta', 'fasta'): + print(f"{record.id}: {len(record.seq)} bp") + +# Sequence manipulation +seq = Seq("ATGCGATCGATCG") +print(seq.complement()) +print(seq.reverse_complement()) +print(seq.translate()) + +# NCBI Entrez (always set email) +Entrez.email = "your.email@example.com" +handle = Entrez.efetch(db="nucleotide", id="NM_001301717", rettype="fasta", retmode="text") +record = SeqIO.read(handle, "fasta") + +# BLAST +from Bio.Blast import NCBIWWW, NCBIXML +result_handle = NCBIWWW.qblast("blastn", "nt", seq) +blast_records = NCBIXML.parse(result_handle) +``` + +--- + +## scVelo (RNA Velocity) + +**Use for:** Inferring cell state transitions and trajectory directions. + +```python +import scvelo as scv + +# Load data with spliced/unspliced counts +adata = scv.read('data.h5ad') # or from loom file + +# Preprocessing +scv.pp.filter_and_normalize(adata, min_shared_counts=20) +scv.pp.moments(adata, n_pcs=30, n_neighbors=30) + +# Velocity estimation +scv.tl.velocity(adata) +scv.tl.velocity_graph(adata) + +# Visualization +scv.pl.velocity_embedding_stream(adata, basis='umap') +scv.pl.velocity_embedding(adata, basis='umap', arrow_length=3) + +# Latent time +scv.tl.latent_time(adata) +scv.pl.scatter(adata, color='latent_time', cmap='viridis') + +# Driver genes +scv.tl.rank_velocity_genes(adata, groupby='clusters') +``` + +--- + +## Installation + +```bash +pip install scanpy anndata pydeseq2 biopython scvelo +``` + +--- + +## See Also + +- For interactive visualization → `DATA_ANALYSIS.md` (plotly, seaborn) +- For gene/protein databases → `GENOMICS_DATABASES.md` diff --git a/features/src/llm-context/skills/scientific/CLINICAL.md b/features/src/llm-context/skills/scientific/CLINICAL.md new file mode 100644 index 000000000..10efd9cbd --- /dev/null +++ b/features/src/llm-context/skills/scientific/CLINICAL.md @@ -0,0 +1,251 @@ +# Clinical Skills + 
+**Trigger:** User asks about clinical trials, PubMed, literature search, survival analysis, or patient data. + +--- + +## Quick Reference + +| Task | Source | Access | +|------|--------|--------| +| Clinical trial data | ClinicalTrials.gov | REST API (v2) | +| Literature search | PubMed | E-utilities API | +| Survival analysis | lifelines | Python package | + +--- + +## ClinicalTrials.gov + +**Use for:** Finding trials by condition/drug, trial status, study design, recruiting locations. + +### API v2 Queries + +```python +import requests + +BASE_URL = "https://clinicaltrials.gov/api/v2" + +# Search studies +response = requests.get(f"{BASE_URL}/studies", params={ + "query.cond": "breast cancer", + "query.intr": "pembrolizumab", + "filter.overallStatus": "RECRUITING", + "pageSize": 10 +}) +data = response.json() + +for study in data['studies']: + info = study['protocolSection']['identificationModule'] + status = study['protocolSection']['statusModule'] + print(f"{info['nctId']}: {info['briefTitle']}") + print(f" Status: {status['overallStatus']}") +``` + +### Get Study by NCT ID + +```python +nct_id = "NCT04379596" +response = requests.get(f"{BASE_URL}/studies/{nct_id}") +study = response.json() + +# Key sections +identification = study['protocolSection']['identificationModule'] +status = study['protocolSection']['statusModule'] +design = study['protocolSection']['designModule'] +eligibility = study['protocolSection']['eligibilityModule'] +outcomes = study['protocolSection'].get('outcomesModule', {}) + +print(f"Title: {identification['briefTitle']}") +print(f"Phase: {design.get('phases', ['N/A'])}") +print(f"Enrollment: {design.get('enrollmentInfo', {}).get('count', 'N/A')}") +``` + +### Search Parameters + +| Parameter | Description | Example | +|-----------|-------------|---------| +| `query.cond` | Condition/disease | "lung cancer" | +| `query.intr` | Intervention/drug | "nivolumab" | +| `query.term` | General search | "checkpoint inhibitor" | +| 
`filter.overallStatus` | Trial status | "RECRUITING", "COMPLETED" | +| `filter.geo` | Location | "distance(39.0,-77.1,50mi)" | +| `filter.advanced` | Phase, age, etc. | "AREA[Phase]PHASE3" | + +--- + +## PubMed (Literature Search) + +**Use for:** Finding papers, abstracts, citation data. + +### E-utilities API + +```python +from Bio import Entrez + +Entrez.email = "your.email@example.com" + +# Search PubMed +handle = Entrez.esearch( + db="pubmed", + term="CRISPR cancer therapy[Title/Abstract] AND 2023[pdat]", + retmax=20 +) +record = Entrez.read(handle) +pmids = record['IdList'] +print(f"Found {record['Count']} articles") + +# Fetch abstracts +handle = Entrez.efetch(db="pubmed", id=pmids, rettype="abstract", retmode="text") +abstracts = handle.read() +print(abstracts) + +# Fetch structured data (MEDLINE text format, which Medline.parse expects) +handle = Entrez.efetch(db="pubmed", id=pmids[:5], rettype="medline", retmode="text") +from Bio import Medline +records = Medline.parse(handle) +for record in records: + print(f"Title: {record.get('TI', 'N/A')}") + print(f"Authors: {', '.join(record.get('AU', []))}") + print(f"Journal: {record.get('JT', 'N/A')}") + print() +``` + +### Search Syntax + +| Syntax | Description | Example | +|--------|-------------|---------| +| `[Title]` | Search title only | "cancer[Title]" | +| `[Title/Abstract]` | Title or abstract | "EGFR[Title/Abstract]" | +| `[Author]` | Author name | "Smith J[Author]" | +| `[Journal]` | Journal name | "Nature[Journal]" | +| `[pdat]` | Publication date | "2023[pdat]" | +| `AND`, `OR`, `NOT` | Boolean operators | "cancer AND therapy" | +| `[MeSH Terms]` | MeSH vocabulary | "Neoplasms[MeSH Terms]" | + +### REST API Alternative + +```python +import requests + +# E-utilities via REST +base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils" + +# Search +search_url = f"{base_url}/esearch.fcgi" +response = requests.get(search_url, params={ + "db": "pubmed", + "term": "immunotherapy melanoma", + "retmode": "json", + "retmax": 10 +}) +pmids =
response.json()['esearchresult']['idlist'] + +# Fetch summaries +summary_url = f"{base_url}/esummary.fcgi" +response = requests.get(summary_url, params={ + "db": "pubmed", + "id": ",".join(pmids), + "retmode": "json" +}) +summaries = response.json()['result'] +``` + +--- + +## Survival Analysis (Lifelines) + +**Use for:** Kaplan-Meier curves, Cox regression, time-to-event analysis. + +### Kaplan-Meier Estimator + +```python +from lifelines import KaplanMeierFitter +import matplotlib.pyplot as plt + +# Data format: duration (time), event (1=occurred, 0=censored) +durations = [5, 6, 6, 2.5, 4, 4, 1, 2, 3, 4, 5, 6] +events = [1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1] + +kmf = KaplanMeierFitter() +kmf.fit(durations, events, label='Overall Survival') + +# Plot survival curve +kmf.plot_survival_function() +plt.xlabel('Time (months)') +plt.ylabel('Survival Probability') +plt.title('Kaplan-Meier Survival Curve') +plt.show() + +# Median survival +print(f"Median survival: {kmf.median_survival_time_}") + +# Survival at specific time +print(f"Survival at 12 months: {kmf.predict(12):.2%}") +``` + +### Compare Groups + +```python +from lifelines.statistics import logrank_test + +# Group 1 +kmf1 = KaplanMeierFitter() +kmf1.fit(durations_group1, events_group1, label='Treatment') + +# Group 2 +kmf2 = KaplanMeierFitter() +kmf2.fit(durations_group2, events_group2, label='Control') + +# Plot both +ax = kmf1.plot_survival_function() +kmf2.plot_survival_function(ax=ax) +plt.show() + +# Log-rank test +results = logrank_test(durations_group1, durations_group2, events_group1, events_group2) +print(f"Log-rank p-value: {results.p_value:.4f}") +``` + +### Cox Proportional Hazards + +```python +from lifelines import CoxPHFitter +import pandas as pd + +# Data with covariates +df = pd.DataFrame({ + 'duration': durations, + 'event': events, + 'age': [45, 50, 55, 60, 48, 52, 58, 62, 49, 51, 53, 57], + 'treatment': [1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0] +}) + +cph = CoxPHFitter() +cph.fit(df, 
duration_col='duration', event_col='event') + +# Summary with hazard ratios +cph.print_summary() + +# Hazard ratios +print(f"\nHazard Ratios:") +print(cph.hazard_ratios_) + +# Plot coefficients +cph.plot() +plt.show() +``` + +--- + +## Installation + +```bash +pip install biopython requests lifelines matplotlib +``` + +--- + +## See Also + +- For drug/target data → `DRUG_DISCOVERY.md` +- For visualization → `DATA_ANALYSIS.md` diff --git a/features/src/llm-context/skills/scientific/DATA_ANALYSIS.md b/features/src/llm-context/skills/scientific/DATA_ANALYSIS.md new file mode 100644 index 000000000..9c496201c --- /dev/null +++ b/features/src/llm-context/skills/scientific/DATA_ANALYSIS.md @@ -0,0 +1,312 @@ +# Data Analysis Skills + +**Trigger:** User asks about ML, statistics, visualization, plots, sklearn, regression, or classification. + +--- + +## Quick Reference + +| Task | Package | Quick Import | +|------|---------|--------------| +| ML models (classification, regression) | `scikit-learn` | `from sklearn.ensemble import RandomForestClassifier` | +| Statistical tests, regression | `statsmodels` | `import statsmodels.api as sm` | +| Interactive plots | `plotly` | `import plotly.express as px` | +| Statistical visualization | `seaborn` | `import seaborn as sns` | + +--- + +## Scikit-learn (Machine Learning) + +**Use for:** Classification, regression, clustering, dimensionality reduction, model evaluation. 
+ +### Classification + +```python +from sklearn.model_selection import train_test_split, cross_val_score +from sklearn.ensemble import RandomForestClassifier +from sklearn.metrics import classification_report, confusion_matrix +import pandas as pd + +# Load data +df = pd.read_csv('data.csv') +X = df.drop('target', axis=1) +y = df['target'] + +# Split +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) + +# Train +model = RandomForestClassifier(n_estimators=100, random_state=42) +model.fit(X_train, y_train) + +# Evaluate +y_pred = model.predict(X_test) +print(classification_report(y_test, y_pred)) +print(confusion_matrix(y_test, y_pred)) + +# Cross-validation +cv_scores = cross_val_score(model, X, y, cv=5) +print(f"CV Accuracy: {cv_scores.mean():.3f} ± {cv_scores.std():.3f}") + +# Feature importance +importance = pd.DataFrame({ + 'feature': X.columns, + 'importance': model.feature_importances_ +}).sort_values('importance', ascending=False) +``` + +### Regression + +```python +from sklearn.linear_model import LinearRegression, Ridge, Lasso +from sklearn.metrics import mean_squared_error, r2_score + +model = Ridge(alpha=1.0) +model.fit(X_train, y_train) + +y_pred = model.predict(X_test) +print(f"R²: {r2_score(y_test, y_pred):.3f}") +print(f"RMSE: {mean_squared_error(y_test, y_pred) ** 0.5:.3f}") +``` + +### Clustering + +```python +from sklearn.cluster import KMeans +from sklearn.preprocessing import StandardScaler + +# Scale features +scaler = StandardScaler() +X_scaled = scaler.fit_transform(X) + +# K-Means +kmeans = KMeans(n_clusters=3, random_state=42) +clusters = kmeans.fit_predict(X_scaled) + +# Evaluate +from sklearn.metrics import silhouette_score +score = silhouette_score(X_scaled, clusters) +print(f"Silhouette Score: {score:.3f}") +``` + +### Dimensionality Reduction + +```python +from sklearn.decomposition import PCA +from sklearn.manifold import TSNE + +# PCA +pca = PCA(n_components=2) +X_pca =
pca.fit_transform(X_scaled) +print(f"Explained variance: {pca.explained_variance_ratio_.sum():.2%}") + +# t-SNE +tsne = TSNE(n_components=2, random_state=42) +X_tsne = tsne.fit_transform(X_scaled) +``` + +--- + +## Statsmodels (Statistical Analysis) + +**Use for:** Regression with diagnostics, statistical tests, time series. + +### Linear Regression with Diagnostics + +```python +import statsmodels.api as sm +import pandas as pd + +# Add constant for intercept +X_const = sm.add_constant(X) + +# Fit OLS +model = sm.OLS(y, X_const).fit() + +# Full summary with p-values, R², etc. +print(model.summary()) + +# Key metrics +print(f"R-squared: {model.rsquared:.3f}") +print(f"Adj. R-squared: {model.rsquared_adj:.3f}") +print(f"F-statistic p-value: {model.f_pvalue:.2e}") + +# Coefficients with confidence intervals +print(model.conf_int()) +``` + +### Logistic Regression + +```python +model = sm.Logit(y, X_const).fit() +print(model.summary()) + +# Odds ratios +import numpy as np +odds_ratios = np.exp(model.params) +``` + +### Statistical Tests + +```python +from scipy import stats + +# t-test +t_stat, p_value = stats.ttest_ind(group1, group2) + +# ANOVA +f_stat, p_value = stats.f_oneway(group1, group2, group3) + +# Chi-square test +chi2, p_value, dof, expected = stats.chi2_contingency(contingency_table) + +# Correlation +corr, p_value = stats.pearsonr(x, y) +corr, p_value = stats.spearmanr(x, y) + +# Normality test +stat, p_value = stats.shapiro(data) +``` + +--- + +## Plotly (Interactive Visualization) + +**Use for:** Interactive charts, dashboards, web-embeddable plots. 
+ +### Basic Plots + +```python +import plotly.express as px +import pandas as pd + +df = pd.read_csv('data.csv') + +# Scatter plot +fig = px.scatter(df, x='x', y='y', color='category', + hover_data=['name'], title='Scatter Plot') +fig.show() + +# Bar chart +fig = px.bar(df, x='category', y='value', color='group') +fig.show() + +# Line plot +fig = px.line(df, x='date', y='value', color='series') +fig.show() + +# Histogram +fig = px.histogram(df, x='value', nbins=30, color='group') +fig.show() + +# Box plot +fig = px.box(df, x='category', y='value', color='group') +fig.show() +``` + +### Advanced Features + +```python +import plotly.graph_objects as go + +# Multiple traces +fig = go.Figure() +fig.add_trace(go.Scatter(x=x1, y=y1, name='Series 1')) +fig.add_trace(go.Scatter(x=x2, y=y2, name='Series 2')) +fig.update_layout(title='Multi-series Plot') +fig.show() + +# Heatmap +fig = px.imshow(correlation_matrix, text_auto=True, color_continuous_scale='RdBu_r') +fig.show() + +# 3D scatter +fig = px.scatter_3d(df, x='x', y='y', z='z', color='category') +fig.show() +``` + +--- + +## Seaborn (Statistical Visualization) + +**Use for:** Publication-quality statistical plots with pandas integration. 
+ +### Distribution Plots + +```python +import seaborn as sns +import matplotlib.pyplot as plt + +# Histogram with KDE +sns.histplot(data=df, x='value', hue='group', kde=True) +plt.show() + +# KDE plot +sns.kdeplot(data=df, x='value', hue='group', fill=True) +plt.show() + +# Box plot +sns.boxplot(data=df, x='category', y='value', hue='group') +plt.show() + +# Violin plot +sns.violinplot(data=df, x='category', y='value', hue='group', split=True) +plt.show() +``` + +### Relationship Plots + +```python +# Scatter with regression line +sns.regplot(data=df, x='x', y='y') +plt.show() + +# Joint plot (scatter + marginal distributions) +sns.jointplot(data=df, x='x', y='y', kind='reg') +plt.show() + +# Pair plot (all pairwise relationships) +sns.pairplot(df, hue='category') +plt.show() +``` + +### Heatmaps + +```python +# Correlation heatmap +corr = df.corr() +sns.heatmap(corr, annot=True, cmap='coolwarm', center=0) +plt.show() + +# Clustermap (hierarchical clustering) +sns.clustermap(corr, annot=True, cmap='coolwarm') +plt.show() +``` + +### Styling + +```python +# Set theme +sns.set_theme(style='whitegrid') # darkgrid, white, dark, ticks + +# Figure size +plt.figure(figsize=(10, 6)) + +# Save figure +plt.savefig('plot.png', dpi=300, bbox_inches='tight') +``` + +--- + +## Installation + +```bash +pip install scikit-learn statsmodels plotly seaborn matplotlib pandas +``` + +--- + +## See Also + +- For domain-specific analysis → `BIOINFORMATICS.md`, `DRUG_DISCOVERY.md` +- For dashboards in Workbench → `DASHBOARD_BUILDER.md` diff --git a/features/src/llm-context/skills/scientific/DRUG_DISCOVERY.md b/features/src/llm-context/skills/scientific/DRUG_DISCOVERY.md new file mode 100644 index 000000000..ce1ff3bd4 --- /dev/null +++ b/features/src/llm-context/skills/scientific/DRUG_DISCOVERY.md @@ -0,0 +1,244 @@ +# Drug Discovery Skills + +**Trigger:** User asks about molecules, compounds, drugs, SMILES, fingerprints, ADMET, targets, or bioactivity. 
+ +--- + +## Quick Reference + +| Task | Tool | Quick Access | +|------|------|--------------| +| Molecular structure/properties | `rdkit` | `from rdkit import Chem` | +| ADMET/property prediction | `deepchem` | `import deepchem as dc` | +| Bioactivity data (IC50, Ki) | ChEMBL | REST API | +| Drug info & interactions | DrugBank | REST API | +| Target-disease associations | Open Targets | GraphQL API | + +--- + +## RDKit (Cheminformatics) + +**Use for:** SMILES parsing, molecular descriptors, fingerprints, substructure search, similarity. + +### Basic Operations + +```python +from rdkit import Chem +from rdkit.Chem import Descriptors, AllChem, Draw + +# Parse SMILES +mol = Chem.MolFromSmiles('CC(=O)OC1=CC=CC=C1C(=O)O') # Aspirin +if mol is None: + print("Invalid SMILES") + +# Calculate properties +mw = Descriptors.MolWt(mol) +logp = Descriptors.MolLogP(mol) +hbd = Descriptors.NumHDonors(mol) +hba = Descriptors.NumHAcceptors(mol) +tpsa = Descriptors.TPSA(mol) +rotatable = Descriptors.NumRotatableBonds(mol) + +print(f"MW: {mw:.2f}, LogP: {logp:.2f}, HBD: {hbd}, HBA: {hba}, TPSA: {tpsa:.2f}") + +# Lipinski's Rule of 5 +lipinski_pass = mw <= 500 and logp <= 5 and hbd <= 5 and hba <= 10 +``` + +### Fingerprints & Similarity + +```python +from rdkit import DataStructs +from rdkit.Chem import AllChem + +mol1 = Chem.MolFromSmiles('CCO') +mol2 = Chem.MolFromSmiles('CCCO') + +# Morgan fingerprint (ECFP-like) +fp1 = AllChem.GetMorganFingerprintAsBitVect(mol1, radius=2, nBits=2048) +fp2 = AllChem.GetMorganFingerprintAsBitVect(mol2, radius=2, nBits=2048) + +# Tanimoto similarity +similarity = DataStructs.TanimotoSimilarity(fp1, fp2) +print(f"Similarity: {similarity:.3f}") +``` + +### Substructure Search + +```python +# Define substructure pattern +pattern = Chem.MolFromSmarts('c1ccccc1') # benzene ring + +# Check if molecule contains pattern +has_benzene = mol.HasSubstructMatch(pattern) + +# Find all matches +matches = mol.GetSubstructMatches(pattern) +``` + +--- + +## DeepChem 
(Molecular ML) + +**Use for:** Property prediction, ADMET, toxicity, binding affinity. + +```python +import deepchem as dc + +# Load MoleculeNet dataset (Delaney/ESOL aqueous solubility, a regression task) +tasks, datasets, transformers = dc.molnet.load_delaney(featurizer='ECFP') +train, valid, test = datasets + +# Quick model training +model = dc.models.MultitaskRegressor(n_tasks=1, n_features=1024) +model.fit(train, nb_epoch=10) + +# Predict on new molecules +smiles = ['CCO', 'CC(=O)O', 'c1ccccc1'] +featurizer = dc.feat.CircularFingerprint(size=1024) +features = featurizer.featurize(smiles) +predictions = model.predict_on_batch(features) +``` + +### Pre-trained Models + +```python +# Load the Tox21 toxicity dataset (this loads data only; no pre-trained model is downloaded) +tox21_tasks, tox21_datasets, tox21_transformers = dc.molnet.load_tox21() + +# ADMET prediction +# Use relevant MoleculeNet datasets: BBBP, ClinTox, SIDER, etc. +``` + +--- + +## ChEMBL Database + +**Use for:** Bioactivity data, IC50/Ki values, target information. + +### REST API Queries + +```python +import requests + +BASE_URL = "https://www.ebi.ac.uk/chembl/api/data" + +# Search compound by name +response = requests.get(f"{BASE_URL}/molecule/search.json?q=aspirin") +results = response.json()['molecules'] + +# Get bioactivity for a target (e.g., COX-2) +target_id = "CHEMBL230" # COX-2 +response = requests.get(f"{BASE_URL}/activity.json?target_chembl_id={target_id}&limit=100") +activities = response.json()['activities'] + +for act in activities[:5]: + print(f"{act['molecule_chembl_id']}: {act['standard_type']} = {act['standard_value']} {act['standard_units']}") +``` + +### Using chembl_webresource_client + +```python +from chembl_webresource_client.new_client import new_client + +# Search molecules +molecule = new_client.molecule +aspirin = molecule.filter(pref_name__iexact='aspirin')[0] + +# Get activities for target +activity = new_client.activity +target_activities = activity.filter(target_chembl_id='CHEMBL230', pchembl_value__gte=6) + +# Search by SMILES similarity +similarity =
new_client.similarity +similar_mols = similarity.filter(smiles='CC(=O)Oc1ccccc1C(=O)O', similarity=70) +``` + +--- + +## DrugBank + +**Use for:** Approved drug information, drug-drug interactions, mechanisms. + +```python +import requests + +# Note: DrugBank API requires authentication for full access +# Free tier available at https://go.drugbank.com/ + +# Example: Search drug by name (requires API key) +headers = {'Authorization': 'Bearer YOUR_API_KEY'} +response = requests.get( + 'https://api.drugbank.com/v1/drugs', + params={'q': 'metformin'}, + headers=headers +) +``` + +### DrugBank Data Fields +- Drug name, description, indication +- Mechanism of action +- Drug-drug interactions +- Targets and enzymes +- ADMET properties +- Chemical structure (SMILES, InChI) + +--- + +## Open Targets + +**Use for:** Target-disease associations, genetic evidence, known drugs. + +### GraphQL API + +```python +import requests + +ENDPOINT = "https://api.platform.opentargets.org/api/v4/graphql" + +# Query target-disease associations +query = """ +query targetAssociations($ensemblId: String!) 
{ + target(ensemblId: $ensemblId) { + id + approvedSymbol + associatedDiseases { + rows { + disease { id name } + score + } + } + } +} +""" + +response = requests.post(ENDPOINT, json={ + 'query': query, + 'variables': {'ensemblId': 'ENSG00000157764'} # BRAF +}) +data = response.json()['data']['target'] + +for assoc in data['associatedDiseases']['rows'][:5]: + print(f"{assoc['disease']['name']}: {assoc['score']:.3f}") +``` + +### Common Queries +- Target tractability and safety +- Known drugs for a disease +- Genetic associations (GWAS) +- Pathway information + +--- + +## Installation + +```bash +pip install rdkit deepchem chembl_webresource_client requests +``` + +--- + +## See Also + +- For protein structures → `GENOMICS_DATABASES.md` (PDB, UniProt) +- For clinical trials → `CLINICAL.md` diff --git a/features/src/llm-context/skills/scientific/GENOMICS_DATABASES.md b/features/src/llm-context/skills/scientific/GENOMICS_DATABASES.md new file mode 100644 index 000000000..4939f2884 --- /dev/null +++ b/features/src/llm-context/skills/scientific/GENOMICS_DATABASES.md @@ -0,0 +1,251 @@ +# Genomics Databases Skills + +**Trigger:** User asks about genes, proteins, variants, structures, annotations, Ensembl, UniProt, ClinVar, or PDB. + +--- + +## Quick Reference + +| Need | Database | API | +|------|----------|-----| +| Gene annotations, sequences | Ensembl | REST | +| Protein sequences, functions | UniProt | REST | +| Variant clinical significance | ClinVar | E-utilities | +| 3D protein structures | PDB/RCSB | REST | + +--- + +## Ensembl (Gene Annotations) + +**Use for:** Gene lookups, sequences, variant effect prediction (VEP), orthologs. 
+ +### REST API + +```python +import requests + +SERVER = "https://rest.ensembl.org" + +def ensembl_get(endpoint, params=None): + response = requests.get(f"{SERVER}{endpoint}", + headers={"Content-Type": "application/json"}, + params=params) + return response.json() + +# Lookup gene by symbol +gene = ensembl_get("/lookup/symbol/homo_sapiens/BRCA1", {"expand": 1}) +print(f"Gene ID: {gene['id']}, Location: {gene['seq_region_name']}:{gene['start']}-{gene['end']}") + +# Get gene sequence +seq = ensembl_get(f"/sequence/id/{gene['id']}", {"type": "genomic"}) +print(f"Sequence length: {len(seq['seq'])} bp") + +# Variant Effect Predictor (VEP) +vep_result = requests.post( + f"{SERVER}/vep/human/region", + headers={"Content-Type": "application/json"}, + json={"variants": ["17 41234451 . A G . . ."]} # VCF format +).json() +``` + +### Common Endpoints +- `/lookup/symbol/{species}/{symbol}` - Gene by symbol +- `/lookup/id/{id}` - By Ensembl ID +- `/sequence/id/{id}` - Get sequence +- `/homology/id/{id}` - Orthologs/paralogs +- `/vep/{species}/region` - Variant effects + +--- + +## UniProt (Protein Data) + +**Use for:** Protein sequences, functions, domains, GO terms, cross-references. 
+ +### REST API + +```python +import requests + +BASE_URL = "https://rest.uniprot.org/uniprotkb" + +# Search proteins +response = requests.get(f"{BASE_URL}/search", params={ + "query": "gene:TP53 AND organism_id:9606", + "format": "json", + "size": 5 +}) +results = response.json()['results'] + +for entry in results: + print(f"{entry['primaryAccession']}: {entry['proteinDescription']['recommendedName']['fullName']['value']}") + +# Get specific protein +protein = requests.get(f"{BASE_URL}/P04637.json").json() +print(f"Length: {protein['sequence']['length']} aa") + +# Get FASTA sequence +fasta = requests.get(f"{BASE_URL}/P04637.fasta").text + +# ID mapping (convert between databases) +mapping_response = requests.post( + "https://rest.uniprot.org/idmapping/run", + data={"from": "UniProtKB_AC-ID", "to": "Ensembl", "ids": "P04637"} +) +``` + +### Key Fields +- `primaryAccession` - UniProt ID (e.g., P04637) +- `proteinDescription` - Protein name +- `genes` - Gene names +- `sequence` - Amino acid sequence +- `features` - Domains, variants, modifications +- `uniProtKBCrossReferences` - Links to other databases + +--- + +## ClinVar (Variant Clinical Significance) + +**Use for:** Variant pathogenicity, clinical interpretations, disease associations. 
+ +### E-utilities API + +```python +from Bio import Entrez +import xml.etree.ElementTree as ET + +Entrez.email = "your.email@example.com" + +# Search variants by gene +handle = Entrez.esearch(db="clinvar", term="BRCA1[gene] AND pathogenic[clinsig]", retmax=10) +record = Entrez.read(handle) +variant_ids = record['IdList'] + +# Get variant details +for vid in variant_ids[:3]: + handle = Entrez.efetch(db="clinvar", id=vid, rettype="vcv", retmode="xml") + # Parse XML response + print(f"Variant ID: {vid}") +``` + +### Direct REST Query + +```python +import requests + +# Search by gene +response = requests.get( + "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi", + params={ + "db": "clinvar", + "term": "BRCA1[gene]", + "retmode": "json", + "retmax": 100 + } +) +ids = response.json()['esearchresult']['idlist'] +``` + +### Clinical Significance Categories +- Pathogenic +- Likely pathogenic +- Uncertain significance (VUS) +- Likely benign +- Benign + +--- + +## PDB/RCSB (Protein Structures) + +**Use for:** 3D structures, structural analysis, drug binding sites. 
+ +### REST API + +```python +import requests + +RCSB_URL = "https://data.rcsb.org/rest/v1/core" +SEARCH_URL = "https://search.rcsb.org/rcsbsearch/v2/query" + +# Get structure metadata +pdb_id = "1TUP" # p53 DNA-binding domain +structure = requests.get(f"{RCSB_URL}/entry/{pdb_id}").json() +print(f"Title: {structure['struct']['title']}") +print(f"Resolution: {structure['rcsb_entry_info'].get('resolution_combined', ['N/A'])} Å") + +# Search structures +search_query = { + "query": { + "type": "terminal", + "service": "full_text", + "parameters": {"value": "kinase inhibitor"} + }, + "return_type": "entry" +} +results = requests.post(SEARCH_URL, json=search_query).json() + +# Download structure file +pdb_file = requests.get(f"https://files.rcsb.org/download/{pdb_id}.pdb").text +cif_file = requests.get(f"https://files.rcsb.org/download/{pdb_id}.cif").text +``` + +### Working with Structure Files + +```python +from Bio.PDB import PDBParser + +parser = PDBParser() +structure = parser.get_structure("protein", "1TUP.pdb") + +for model in structure: + for chain in model: + print(f"Chain {chain.id}: {len(list(chain.get_residues()))} residues") +``` + +--- + +## Combined Workflow Example + +```python +# Find drug targets for a disease, get protein info, check structures + +import requests + +# 1. Open Targets: Find targets for disease +disease_id = "EFO_0000311" # Cancer +# ... (see DRUG_DISCOVERY.md) + +# 2. UniProt: Get protein details +gene = "EGFR" +uniprot = requests.get( + f"https://rest.uniprot.org/uniprotkb/search", + params={"query": f"gene:{gene} AND organism_id:9606", "format": "json"} +).json()['results'][0] +uniprot_id = uniprot['primaryAccession'] + +# 3. 
PDB: Find structures +pdb_search = { + "query": { + "type": "terminal", + "service": "text", + "parameters": {"attribute": "rcsb_polymer_entity.pdbx_description", "value": gene} + }, + "return_type": "entry" +} +structures = requests.post("https://search.rcsb.org/rcsbsearch/v2/query", json=pdb_search).json() +print(f"Found {structures['total_count']} structures for {gene}") +``` + +--- + +## Installation + +```bash +pip install biopython requests +``` + +--- + +## See Also + +- For sequence analysis → `BIOINFORMATICS.md` (Biopython) +- For drug-target data → `DRUG_DISCOVERY.md` (ChEMBL, Open Targets) From bd8ac9ec9b83ebf273d5a0037b5ba7e71855ebc5 Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Tue, 17 Mar 2026 16:06:11 -0400 Subject: [PATCH 42/86] Rename SKILL_INDEX to SCIENTIFIC_SKILLS_INDEX, update skill routing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Rename SKILL_INDEX.md → SCIENTIFIC_SKILLS_INDEX.md to clarify it only routes scientific skills - Workbench skills (workflow, dashboard, custom app) now routed directly from CLAUDE.md trigger guide - Expand workflow troubleshoot triggers: "fix my workflow", "troubleshoot my job", "my job failed" - Update generate-context.sh to match all naming and routing changes Made-with: Cursor --- features/src/llm-context/generate-context.sh | 72 ++++++++--------- .../skills/SCIENTIFIC_SKILLS_INDEX.md | 50 ++++++++++++ .../src/llm-context/skills/SKILL_INDEX.md | 81 ------------------- 3 files changed, 83 insertions(+), 120 deletions(-) create mode 100644 features/src/llm-context/skills/SCIENTIFIC_SKILLS_INDEX.md delete mode 100644 features/src/llm-context/skills/SKILL_INDEX.md diff --git a/features/src/llm-context/generate-context.sh b/features/src/llm-context/generate-context.sh index 3a435a4e8..e65391376 100755 --- a/features/src/llm-context/generate-context.sh +++ b/features/src/llm-context/generate-context.sh @@ -1136,11 +1136,12 @@ WORKFLOW_SKILL_EOF log_info "Creating 
scientific skills..." mkdir -p "${SKILLS_DIR}/scientific" - # Create SKILL_INDEX.md - cat > "${SKILLS_DIR}/SKILL_INDEX.md" << 'SKILL_INDEX_EOF' -# Skill Index + # Create SCIENTIFIC_SKILLS_INDEX.md + cat > "${SKILLS_DIR}/SCIENTIFIC_SKILLS_INDEX.md" << 'SCIENTIFIC_SKILLS_EOF' +# Scientific Skills Index -**Read this file first to navigate available skills.** +**This file routes Claude to domain-specific scientific skills.** +Workbench skills (workflows, dashboards, custom apps) are handled directly by `CLAUDE.md`. --- @@ -1148,28 +1149,15 @@ WORKFLOW_SKILL_EOF | User Says... | Read This Skill | |--------------|-----------------| -| "workflow failed" / "debug workflow" | `WORKFLOW_TROUBLESHOOT.md` | -| "create dashboard" / "visualize" / "Flask" | `DASHBOARD_BUILDER.md` | -| "create app" / "deploy app" | `CUSTOM_APP.md` | -| "single-cell" / "RNA-seq" / "scanpy" | `scientific/BIOINFORMATICS.md` | -| "molecule" / "drug" / "RDKit" / "ChEMBL" | `scientific/DRUG_DISCOVERY.md` | -| "gene" / "protein" / "variant" / "UniProt" | `scientific/GENOMICS_DATABASES.md` | -| "statistics" / "ML" / "plot" / "sklearn" | `scientific/DATA_ANALYSIS.md` | -| "clinical trial" / "PubMed" / "literature" | `scientific/CLINICAL.md` | +| "single-cell" / "RNA-seq" / "scanpy" / "differential expression" | `scientific/BIOINFORMATICS.md` | +| "molecule" / "SMILES" / "drug" / "RDKit" / "ChEMBL" / "target" | `scientific/DRUG_DISCOVERY.md` | +| "gene" / "protein" / "variant" / "UniProt" / "Ensembl" / "PDB" | `scientific/GENOMICS_DATABASES.md` | +| "machine learning" / "sklearn" / "statistics" / "plot" | `scientific/DATA_ANALYSIS.md` | +| "clinical trial" / "PubMed" / "survival analysis" | `scientific/CLINICAL.md` | --- -## Workbench Skills - -| Skill | File | Description | -|-------|------|-------------| -| **Workflow Troubleshooting** | `WORKFLOW_TROUBLESHOOT.md` | Debug failed WDL/Nextflow workflows | -| **Dashboard Builder** | `DASHBOARD_BUILDER.md` | Create web apps, Flask, Streamlit | -| **Custom 
App** | `CUSTOM_APP.md` | Build deployable Workbench apps | - ---- - -## Scientific Skills +## Domain Skills ### 🧬 Bioinformatics (`scientific/BIOINFORMATICS.md`) Single-cell analysis, differential expression, sequence analysis, RNA velocity. @@ -1190,7 +1178,17 @@ Machine learning, statistics, visualization. ### 🏥 Clinical (`scientific/CLINICAL.md`) Clinical trials, literature search, survival analysis. **APIs:** clinicaltrials.gov, pubmed -SKILL_INDEX_EOF + +--- + +## Adding New Skills + +To add skills from [claude-scientific-skills](https://github.com/K-Dense-AI/claude-scientific-skills): + +1. Copy the `SKILL.md` file to `scientific/<SKILL_NAME>.md` +2. Add a row to the Quick Navigation table above +3. Add a domain section under Domain Skills above +SCIENTIFIC_SKILLS_EOF # Create BIOINFORMATICS.md cat > "${SKILLS_DIR}/scientific/BIOINFORMATICS.md" << 'BIOINFO_EOF' @@ -2112,10 +2110,10 @@ file:///home/jupyter/dashboard.html ← JavaScript blocked ## Available Skills -> **📚 Read \`~/.workbench/skills/SKILL_INDEX.md\` first** to navigate all available skills. - ### Workbench Skills +Read these directly — no index needed: + | Topic | Skill File | When to Use | |-------|------------|-------------| | **🚨 Dashboards, Web UIs** | \`DASHBOARD_BUILDER.md\` | Dashboard, Flask, Streamlit, web UI, plots on a port | @@ -2124,6 +2122,8 @@ file:///home/jupyter/dashboard.html ← JavaScript blocked ### Scientific Skills +> **📚 Read \`~/.workbench/skills/SCIENTIFIC_SKILLS_INDEX.md\` first** to navigate scientific domain skills. 
+ | Domain | Skill File | Covers | |--------|------------|--------| | 🧬 Bioinformatics | \`scientific/BIOINFORMATICS.md\` | scanpy, anndata, pydeseq2, biopython, scvelo | @@ -2143,28 +2143,22 @@ file:///home/jupyter/dashboard.html ← JavaScript blocked - "show in browser" / "open in new tab" - Any request to display data interactively -**Read CUSTOM_APP.md when:** +**Read \`CUSTOM_APP.md\` when:** - "build a deployable app" / "create a custom app" - "API service" / "backend" / "from scratch" -**Read WORKFLOW_TROUBLESHOOT.md when:** +**Read \`WORKFLOW_TROUBLESHOOT.md\` when:** +- "troubleshoot my workflow" / "fix my workflow" - "my workflow failed" / "workflow error" / "debug workflow" +- "troubleshoot my job" / "my job failed" / "workflow job failed" - "job failed" / "task failed" / "out of memory" - "check logs" / "why did it fail" / "troubleshoot" -**Read scientific/BIOINFORMATICS.md when:** +**Read \`SCIENTIFIC_SKILLS_INDEX.md\` then the relevant domain file when user mentions:** - "single-cell" / "RNA-seq" / "scanpy" / "differential expression" - -**Read scientific/DRUG_DISCOVERY.md when:** -- "molecule" / "SMILES" / "drug" / "RDKit" / "ChEMBL" / "target" - -**Read scientific/GENOMICS_DATABASES.md when:** +- "molecule" / "SMILES" / "drug" / "RDKit" / "ChEMBL" - "gene" / "protein" / "variant" / "UniProt" / "Ensembl" / "PDB" - -**Read scientific/DATA_ANALYSIS.md when:** -- "machine learning" / "sklearn" / "statistics" / "plot" - -**Read scientific/CLINICAL.md when:** +- "machine learning" / "sklearn" / "statistics" - "clinical trial" / "PubMed" / "survival analysis" --- diff --git a/features/src/llm-context/skills/SCIENTIFIC_SKILLS_INDEX.md b/features/src/llm-context/skills/SCIENTIFIC_SKILLS_INDEX.md new file mode 100644 index 000000000..a4225c20a --- /dev/null +++ b/features/src/llm-context/skills/SCIENTIFIC_SKILLS_INDEX.md @@ -0,0 +1,50 @@ +# Scientific Skills Index + +**This file routes Claude to domain-specific scientific skills.** +Workbench skills 
(workflows, dashboards, custom apps) are handled directly by `CLAUDE.md`. + +--- + +## ⚡ Quick Navigation + +| User Says... | Read This Skill | +|--------------|-----------------| +| "single-cell" / "RNA-seq" / "scanpy" / "differential expression" | `scientific/BIOINFORMATICS.md` | +| "molecule" / "SMILES" / "drug" / "RDKit" / "ChEMBL" / "target" | `scientific/DRUG_DISCOVERY.md` | +| "gene" / "protein" / "variant" / "UniProt" / "Ensembl" / "PDB" | `scientific/GENOMICS_DATABASES.md` | +| "machine learning" / "sklearn" / "statistics" / "plot" | `scientific/DATA_ANALYSIS.md` | +| "clinical trial" / "PubMed" / "survival analysis" | `scientific/CLINICAL.md` | + +--- + +## Domain Skills + +### 🧬 Bioinformatics (`scientific/BIOINFORMATICS.md`) +Single-cell analysis, differential expression, sequence analysis, RNA velocity. +**Packages:** scanpy, anndata, biopython, pydeseq2, scvelo + +### 💊 Drug Discovery (`scientific/DRUG_DISCOVERY.md`) +Cheminformatics, molecular ML, bioactivity databases, target identification. +**Packages/APIs:** rdkit, deepchem, chembl, drugbank, opentargets + +### 🔬 Genomics Databases (`scientific/GENOMICS_DATABASES.md`) +Gene annotations, protein data, variant interpretation, 3D structures. +**APIs:** ensembl, uniprot, clinvar, pdb + +### 📊 Data Analysis (`scientific/DATA_ANALYSIS.md`) +Machine learning, statistics, visualization. +**Packages:** scikit-learn, statsmodels, plotly, seaborn + +### 🏥 Clinical (`scientific/CLINICAL.md`) +Clinical trials, literature search, survival analysis. +**APIs:** clinicaltrials.gov, pubmed + +--- + +## Adding New Skills + +To add skills from [claude-scientific-skills](https://github.com/K-Dense-AI/claude-scientific-skills): + +1. Copy the `SKILL.md` file to `scientific/<SKILL_NAME>.md` +2. Add a row to the Quick Navigation table above +3. 
Add a domain section under Domain Skills above diff --git a/features/src/llm-context/skills/SKILL_INDEX.md b/features/src/llm-context/skills/SKILL_INDEX.md deleted file mode 100644 index 7e658d35b..000000000 --- a/features/src/llm-context/skills/SKILL_INDEX.md +++ /dev/null @@ -1,81 +0,0 @@ -# Skill Index - -**Read this file first to navigate available skills.** - ---- - -## ⚡ Quick Navigation - -| User Says... | Read This Skill | -|--------------|-----------------| -| "workflow failed" / "debug workflow" | `WORKFLOW_TROUBLESHOOT.md` | -| "create dashboard" / "visualize" / "Flask" | `DASHBOARD_BUILDER.md` | -| "create app" / "deploy app" | `CUSTOM_APP.md` | -| "single-cell" / "RNA-seq" / "scanpy" | `scientific/BIOINFORMATICS.md` | -| "molecule" / "drug" / "RDKit" / "ChEMBL" | `scientific/DRUG_DISCOVERY.md` | -| "gene" / "protein" / "variant" / "UniProt" | `scientific/GENOMICS_DATABASES.md` | -| "statistics" / "ML" / "plot" / "sklearn" | `scientific/DATA_ANALYSIS.md` | -| "clinical trial" / "PubMed" / "literature" | `scientific/CLINICAL.md` | - ---- - -## Workbench Skills - -Core skills for working within Verily Workbench: - -| Skill | File | Description | -|-------|------|-------------| -| **Workflow Troubleshooting** | `WORKFLOW_TROUBLESHOOT.md` | Debug failed WDL/Nextflow workflows | -| **Dashboard Builder** | `DASHBOARD_BUILDER.md` | Create web apps, Flask, Streamlit | -| **Custom App** | `CUSTOM_APP.md` | Build deployable Workbench apps | - ---- - -## Scientific Skills - -Domain-specific skills for pharma/biotech research: - -### 🧬 Bioinformatics (`scientific/BIOINFORMATICS.md`) -Single-cell analysis, differential expression, sequence analysis, RNA velocity. - -**Packages:** scanpy, anndata, biopython, pydeseq2, scvelo - -### 💊 Drug Discovery (`scientific/DRUG_DISCOVERY.md`) -Cheminformatics, molecular ML, bioactivity databases, target identification. 
- -**Packages/APIs:** rdkit, deepchem, chembl, drugbank, opentargets - -### 🔬 Genomics Databases (`scientific/GENOMICS_DATABASES.md`) -Gene annotations, protein data, variant interpretation, 3D structures. - -**APIs:** ensembl, uniprot, clinvar, pdb - -### 📊 Data Analysis (`scientific/DATA_ANALYSIS.md`) -Machine learning, statistics, visualization. - -**Packages:** scikit-learn, statsmodels, plotly, seaborn - -### 🏥 Clinical (`scientific/CLINICAL.md`) -Clinical trials, literature search, survival analysis. - -**APIs:** clinicaltrials.gov, pubmed - ---- - -## How to Use Skills - -1. **Claude reads this index first** when you ask a scientific question -2. **Claude then reads the relevant domain index** (e.g., `BIOINFORMATICS.md`) -3. **Domain indexes link to detailed skill files** when needed - -This hierarchy prevents context overload while ensuring Claude finds the right guidance. - ---- - -## Adding New Skills - -To add skills from [claude-scientific-skills](https://github.com/K-Dense-AI/claude-scientific-skills): - -1. Copy the `SKILL.md` file to `scientific/.md` -2. Add an entry to the relevant domain index -3. Update this index if adding a new category From c6909247e99d0bab114fdff7c50e1162f2b376eb Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Tue, 17 Mar 2026 16:20:48 -0400 Subject: [PATCH 43/86] Simplify CLAUDE.md template and skill files for clarity - Remove 'What is Verily Workbench?' 
and 'Best Practices' sections - Collapse three data discovery/query sections into one 'Data Discovery & Querying' - Consolidate MCP vs CLI + MCP Tools Available into single 'MCP Tools' section - Remove triple MCP-first banner repetition - Move Policies & Constraints under Key Concepts - Trim Data Persistence LLM Guidance to 2 bullets - Deduplicate Getting Help URLs - DASHBOARD_BUILDER: merge two checklists, tighten Flask config section - WORKFLOW_TROUBLESHOOT: collapse Step 2 into single command Made-with: Cursor --- features/src/llm-context/generate-context.sh | 250 +++++-------------- 1 file changed, 63 insertions(+), 187 deletions(-) diff --git a/features/src/llm-context/generate-context.sh b/features/src/llm-context/generate-context.sh index e65391376..49fec188e 100755 --- a/features/src/llm-context/generate-context.sh +++ b/features/src/llm-context/generate-context.sh @@ -577,13 +577,11 @@ renderChart(data); ### Testing Checklist -Before deploying any web app: - -- [ ] **Relative paths** - All \`fetch()\` calls use \`'api/...'\` not \`'/api/...'\` -- [ ] **Test locally** - \`curl http://localhost:PORT/api/endpoint\` returns data -- [ ] **Server logs** - Verify API requests arrive: \`tail -f server.log\` -- [ ] **Browser DevTools** - Network tab shows 200 status for API calls -- [ ] **App UUID obtained** - Not using placeholder \`[APP_UUID]\` +Before deploying: +- [ ] All \`fetch()\` calls use relative paths (\`'api/...'\` not \`'/api/...'\`) +- [ ] Test locally: \`curl http://localhost:PORT/api/endpoint\` +- [ ] Server logs show API requests arriving +- [ ] App UUID obtained (not using placeholder \`[APP_UUID]\`) --- @@ -868,20 +866,7 @@ wb workflow job batch list --job= --format=json | jq '.[] | select(.stat ### Step 2: Get Job Details & Inputs \`\`\`bash -# Full job metadata -wb workflow job describe --job= --format=json -\`\`\` - -**Key fields to extract:** -\`\`\`bash -# Error message -wb workflow job describe --job= --format=json | jq -r '.failureMessage' 
- -# Inputs used -wb workflow job describe --job= --format=json | jq '.inputs' - -# Outputs (if any) -wb workflow job describe --job= --format=json | jq '.outputs' +wb workflow job describe --job= --format=json | jq '{failureMessage, inputs, outputs}' \`\`\` --- @@ -1609,33 +1594,6 @@ You are working inside **Verily Workbench**, a secure cloud-based research envir --- -## ⚡ MCP Tools First! - -> **Before running ANY CLI command, check if an MCP tool exists for the operation.** -> MCP tools return structured JSON and are faster than parsing CLI output. - -| Common Task | ✅ Use This MCP Tool | -|-------------|---------------------| -| List data collections | \`workspace_list_data_collections\` | -| List resources | \`workspace_list_resources\` | -| Resources by folder | \`resource_list_tree\` | -| Query BigQuery | \`bq_execute\` | -| List bucket files | \`list_files\` | - -**Skip to:** [Data Exploration Cheatsheet](#-data-exploration-cheatsheet) | [MCP Tools](#mcp-tools-available) - ---- - -## What is Verily Workbench? - -Verily Workbench is a platform that enables researchers to: -- Access and analyze biomedical data (clinical, genomics, wearables, imaging) -- Run computational workflows at scale (WDL, Nextflow) -- Collaborate securely with governance and policy enforcement -- Use familiar tools (Jupyter, RStudio, VS Code) in the cloud - ---- - ## Current Workspace | Property | Value | @@ -1703,6 +1661,14 @@ The response includes: ### Workflows Workflows are reproducible pipelines in WDL or Nextflow format, registered in the workspace. 
+### Policies & Constraints +Workspaces may have policies that restrict: +- **Region**: Where data and compute must reside +- **Groups**: Who can access the workspace +- **Export**: Whether data can leave the workspace + +Check with: \`wb workspace describe\` + --- ## ⚠️ Important: Data Persistence @@ -1757,33 +1723,21 @@ gs://your-bucket/ └── models/ # Trained ML models \`\`\` -### 🤖 LLM Guidance - -**As an AI assistant, you should proactively help users persist their work:** - -1. **When users create files locally**, ask: *"Would you like me to save this to a cloud bucket so it persists after the app stops?"* - -2. **When users finish analysis**, suggest: *"Your results are saved locally. Should I copy them to a bucket for long-term storage?"* - -3. **At session end**, remind: *"Remember to save any important local files to cloud storage before stopping the app."* +### LLM Guidance -4. **Check local disk usage** to identify files that need saving: - \`\`\`bash - du -sh ~/* - ls -la ~/ - \`\`\` +- **When users create files locally**, suggest saving to a bucket: \`gsutil cp gs:///\` +- **When users finish analysis**, remind: *"Save important outputs to cloud storage before stopping the app."* +- **List available buckets:** \`wb resource list --type=GCS_BUCKET --format=json\` --- -## 🔍 Data Exploration Cheatsheet +## Data Discovery & Querying -This is the **most important section** for quickly discovering and accessing data. +> **⚡ MCP FIRST:** Always check if an MCP tool exists before using CLI commands. -> **⚡ MCP FIRST:** Always check if an MCP tool exists before using CLI commands. MCP tools return structured data and are faster. 
+### Find Your Resources -### Step 1: Find Your Resources - -**🎯 Use MCP tools (preferred):** +**Use MCP tools (preferred):** | What You Need | MCP Tool | |---------------|----------| | Data collections + their resources | \`workspace_list_data_collections\` | @@ -1795,99 +1749,66 @@ This is the **most important section** for quickly discovering and accessing dat wb resource list --format=json | jq '.[] | {name: .id, type: .resourceType}' \`\`\` -### Step 2: Use Environment Variables (Easiest!) -Every resource is available as an environment variable: -\`\`\`bash -# Pattern: \$WORKBENCH_ -echo \$WORKBENCH_my_bucket # → gs://actual-bucket-name -env | grep WORKBENCH_ # List all -\`\`\` +### Get the Cloud Path for a Resource -### Step 3: Get Cloud Paths \`\`\`bash wb resource describe --format=json -# Look for: bucketName, projectId, datasetId, gitRepoUrl +# Look for: bucketName, projectId+datasetId, gitRepoUrl \`\`\` -### Step 4: Preview Data Quickly +### Use Environment Variables (Easiest) -**BigQuery:** \`\`\`bash -bq head -n 10 :.
# Quick preview -bq show --schema :.
# Column names/types -bq show --format=prettyjson :.
| jq '{rows: .numRows}' # Row count -\`\`\` - -**GCS:** -\`\`\`bash -gsutil ls gs:/// # List files -gsutil cat -r 0-1024 gs:///file.csv # Preview first 1KB +echo \$WORKBENCH_my_bucket # → gs://actual-bucket-name +env | grep WORKBENCH_ # List all \`\`\` -### 🤖 LLM Quick Patterns +### Preview Data -| User Question | Best Tool | Command/Tool | -|---------------|-----------|--------------| -| "What data collections do I have?" | **MCP** | \`workspace_list_data_collections\` | -| "What resources are in my workspace?" | **MCP** | \`workspace_list_resources\` | -| "Show resources by folder" | **MCP** | \`resource_list_tree\` | -| "Query this BigQuery table" | **MCP** | \`bq_execute\` | -| "What tables are in this dataset?" | CLI | \`bq ls :\` | -| "What columns in this table?" | CLI | \`bq show --schema :.
\` | -| "How big is this table?" | CLI | \`bq show --format=prettyjson ... \\| jq '{rows: .numRows}'\` | -| "Show me sample data" | CLI | \`bq head -n 5 :.
\` | -| "List files in bucket" | **MCP** | \`list_files\` | - -> **⚠️ Pattern to avoid:** Don't default to \`wb resource list\` for data collection questions. Use \`workspace_list_data_collections\` instead! - ---- - -## How to Discover Data (Detailed) - -### List Resources +**BigQuery:** \`\`\`bash -wb resource list -wb resource list --format=json -wb resource describe +bq head -n 10 :.
+bq show --schema :.
+bq query --use_legacy_sql=false 'SELECT * FROM \`project.dataset.table\` LIMIT 10' \`\`\` -### Explore GCS Buckets +**GCS:** \`\`\`bash gsutil ls gs:/// -gsutil ls -l gs:///path/ -gsutil cat gs:///path/file.txt +gsutil cat -r 0-1024 gs:///path/file.csv \`\`\` -### Explore BigQuery -\`\`\`bash -bq ls : -bq show :.
-bq query --use_legacy_sql=false 'SELECT * FROM \`project.dataset.table\` LIMIT 10' -\`\`\` +### Query Data ---- - -## How to Query Data - -### BigQuery (CLI) +**CLI:** \`\`\`bash -bq query --use_legacy_sql=false 'SELECT * FROM \`project.dataset.table\` LIMIT 100' +bq query --use_legacy_sql=false 'SELECT col1, col2 FROM \`project.dataset.table\` LIMIT 100' \`\`\` -### BigQuery (Python) +**Python:** \`\`\`python from google.cloud import bigquery client = bigquery.Client() -df = client.query("SELECT * FROM \\\`project.dataset.table\\\` LIMIT 100").to_dataframe() -\`\`\` +df = client.query("SELECT * FROM \`project.dataset.table\` LIMIT 100").to_dataframe() -### GCS Files (Python) -\`\`\`python import pandas as pd -df = pd.read_parquet('gs://bucket/path/file.parquet') -df = pd.read_csv('gs://bucket/path/file.csv') +df = pd.read_parquet('gs://bucket-name/path/file.parquet') \`\`\` +### LLM Quick Reference + +| User Question | Best Tool | Command/Tool | +|---------------|-----------|--------------| +| "What data collections do I have?" | **MCP** | \`workspace_list_data_collections\` | +| "What resources are in my workspace?" | **MCP** | \`workspace_list_resources\` | +| "Show resources by folder" | **MCP** | \`resource_list_tree\` | +| "Query this BigQuery table" | **MCP** | \`bq_execute\` | +| "What tables are in this dataset?" | CLI | \`bq ls :\` | +| "What columns in this table?" | CLI | \`bq show --schema :.
\` | +| "List files in bucket" | **MCP** | \`list_files\` | + +> **⚠️ Don't default to \`wb resource list\` for data collection questions. Use \`workspace_list_data_collections\` instead.** + --- ## How to Run Workflows @@ -1923,48 +1844,18 @@ wb resource add-ref gcs-bucket --name external-data --bucket-name existing-bucke --- -## MCP vs CLI: When to Use Each - -This app has **two interfaces** to Workbench functionality: - -| Interface | Best For | Pros | Cons | -|-----------|----------|------|------| -| **MCP Tools** | LLM operations | Structured responses, no shell needed, faster | Limited tool set | -| **CLI (\`wb\`)** | Complex operations, fallback | Full feature coverage, human-friendly | Requires shell execution, text parsing | - -### ⚠️ Common Operations — USE MCP, NOT CLI - -These operations have dedicated MCP tools. **Do NOT use CLI for these:** +## MCP Tools -| Operation | ✅ Use MCP Tool | ❌ Don't Use CLI | -|-----------|-----------------|------------------| -| List data collections | \`workspace_list_data_collections\` | ~~\`wb resource list\`~~ | -| List all resources | \`workspace_list_resources\` | ~~\`wb resource list\`~~ | -| Resources by folder | \`resource_list_tree\` | ~~\`wb resource list-tree\`~~ | -| Run BigQuery query | \`bq_execute\` | ~~\`bq query\`~~ | -| List bucket files | \`list_files\` | ~~\`gsutil ls\`~~ | +> **Always check MCP tools before running CLI commands. MCP tools return structured JSON and are faster.** -### 🤖 LLM Decision Guide +### When to Use Each -1. **ALWAYS check MCP tools first** — especially for list/query operations -2. **Fall back to CLI only** when MCP doesn't have the tool -3. 
**Use cloud CLIs** (\`gsutil\`, \`bq\`) only for operations MCP doesn't support +| Interface | Best For | +|-----------|----------| +| **MCP Tools** | List/query operations — structured responses, no shell needed | +| **CLI (\`wb\`)** | Complex operations or anything not covered by MCP | -### Example: Same Operation, Two Ways - -**List resources:** -- ✅ MCP: Use \`workspace_list_resources\` tool → returns JSON array -- ⚠️ CLI: Run \`wb resource list --format=json\` → requires shell, parsing - -**Query BigQuery:** -- ✅ MCP: Use \`bq_execute\` tool with query parameter → returns results -- ⚠️ CLI: Run \`bq query --use_legacy_sql=false 'SELECT ...'\` → requires parsing - ---- - -## MCP Tools Available - -The Workbench MCP server exposes these tools for programmatic LLM access: +### Available MCP Tools | MCP Tool | CLI Equivalent | Description | |----------|----------------|-------------| @@ -1980,12 +1871,7 @@ The Workbench MCP server exposes these tools for programmatic LLM access: | \`list_files\` | \`gsutil ls\` | List files in a GCS bucket | | \`read_file\` | \`gsutil cat\` | Read contents of a file | -**Not available via MCP (use CLI instead):** -- \`wb workspace set\` — switch workspaces -- \`wb auth login\` — re-authenticate -- \`wb workflow logs\` — view workflow logs -- \`wb resource delete\` — delete resources -- Complex resource creation with many options +**Not available via MCP (use CLI):** \`wb workspace set\`, \`wb auth login\`, \`wb workflow logs\`, \`wb resource delete\` --- @@ -2020,16 +1906,6 @@ wb app describe # App details --- -## Best Practices - -1. **Explore before acting**: Use \`LIMIT\` in queries, \`ls\` before copying -2. **Use environment variables**: \`\$WORKBENCH_\` for scripts -3. **Cost awareness**: Large queries and compute cost money -4. **Reproducibility**: Document analysis, version code -5. **Confirm destructive actions**: Check before deleting - ---- - ## ⚠️ Workbench Web Apps & Proxy URLs (CRITICAL) > **🚨 STOP! 
If user wants a dashboard, chart, Flask app, HTML page, or ANY web UI:** From 64197626460a042533819b06553fa6396450932e Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Fri, 27 Mar 2026 13:33:52 -0400 Subject: [PATCH 44/86] Reorganize CLAUDE.md: MCP tools to top, clean up Web Apps section, consolidate skill routing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Move MCP Tools + CLI Quick Reference to top of file (after Key Concepts) - Remove redundant Creating Custom Apps section — all nuanced content lives in CUSTOM_APP.md skill - Consolidate all skill routing into Available Skills section - Simplify Web Apps & Proxy URLs: clean alert, organized subsections (proxy URL, common ports, JS paths, wrong formats) - Add Custom Apps Guide and Devcontainer Reference URLs to Getting Help Made-with: Cursor --- features/src/llm-context/generate-context.sh | 202 ++++++++----------- 1 file changed, 82 insertions(+), 120 deletions(-) diff --git a/features/src/llm-context/generate-context.sh b/features/src/llm-context/generate-context.sh index 49fec188e..486f2f5f9 100755 --- a/features/src/llm-context/generate-context.sh +++ b/features/src/llm-context/generate-context.sh @@ -1731,7 +1731,64 @@ gs://your-bucket/ --- -## Data Discovery & Querying +## MCP Tools + +> **Always use MCP tools before falling back to CLI. 
MCP tools return structured JSON and are faster.** + +| Interface | Best For | +|-----------|----------| +| **MCP Tools** | List/query operations — structured responses, no shell needed | +| **CLI (\`wb\`)** | Complex operations or anything not covered by MCP | + +### Available MCP Tools + +| MCP Tool | CLI Equivalent | Description | +|----------|----------------|-------------| +| \`workspace_list_data_collections\` | N/A | **List data collections and their resources** | +| \`workspace_list_resources\` | \`wb resource list\` | List all resources in the workspace | +| \`resource_list_tree\` | \`wb resource list-tree\` | List resources organized by folder | +| \`bq_execute\` | \`bq query\` | Run SQL queries against BigQuery | +| \`workflow_job_run\` | \`wb workflow run\` | Submit a WDL/Nextflow workflow | +| \`get_workflow_status\` | \`wb workflow describe\` | Check status of a workflow run | +| \`build_cohort\` | *(UI only)* | Create a cohort using Data Explorer | +| \`export_cohort\` | *(UI only)* | Export cohort data to a bucket | +| \`create_bucket\` | \`wb resource create gcs-bucket\` | Create a new GCS bucket | +| \`list_files\` | \`gsutil ls\` | List files in a GCS bucket | +| \`read_file\` | \`gsutil cat\` | Read contents of a file | + +**Not available via MCP (use CLI):** \`wb workspace set\`, \`wb auth login\`, \`wb workflow logs\`, \`wb resource delete\` + +## CLI Quick Reference + +\`\`\`bash +# Workspace +wb workspace describe # Current workspace details +wb workspace list # All your workspaces +wb workspace set # Switch workspace + +# Resources +wb resource list # List resources +wb resource describe # Resource details +wb resource delete # Delete resource + +# Workflows +wb workflow list # List workflows +wb workflow run # Run workflow +wb workflow describe # Run status +wb workflow logs # Run logs + +# Apps +wb app list # List running apps +wb app describe # App details + +# Auth +wb auth status # Check authentication +wb auth login # Re-authenticate 
+\`\`\` + +--- + +## Data Discovery & Querying > **⚡ MCP FIRST:** Always check if an MCP tool exists before using CLI commands. @@ -1844,146 +1901,49 @@ wb resource add-ref gcs-bucket --name external-data --bucket-name existing-bucke --- -## MCP Tools - -> **Always check MCP tools before running CLI commands. MCP tools return structured JSON and are faster.** - -### When to Use Each +## ⚠️ Workbench Web Apps & Proxy URLs -| Interface | Best For | -|-----------|----------| -| **MCP Tools** | List/query operations — structured responses, no shell needed | -| **CLI (\`wb\`)** | Complex operations or anything not covered by MCP | +> **🚨 If the user wants a dashboard, chart, Flask app, HTML page, or ANY web UI — read \`~/.workbench/skills/DASHBOARD_BUILDER.md\` first.** -### Available MCP Tools - -| MCP Tool | CLI Equivalent | Description | -|----------|----------------|-------------| -| \`workspace_list_data_collections\` | N/A | **List data collections and their resources** | -| \`workspace_list_resources\` | \`wb resource list\` | List all resources in the workspace | -| \`resource_list_tree\` | \`wb resource list-tree\` | List resources organized by folder | -| \`bq_execute\` | \`bq query\` | Run SQL queries against BigQuery | -| \`workflow_job_run\` | \`wb workflow run\` | Submit a WDL/Nextflow workflow | -| \`get_workflow_status\` | \`wb workflow describe\` | Check status of a workflow run | -| \`build_cohort\` | *(UI only)* | Create a cohort using Data Explorer | -| \`export_cohort\` | *(UI only)* | Export cohort data to a bucket | -| \`create_bucket\` | \`wb resource create gcs-bucket\` | Create a new GCS bucket | -| \`list_files\` | \`gsutil ls\` | List files in a GCS bucket | -| \`read_file\` | \`gsutil cat\` | Read contents of a file | - -**Not available via MCP (use CLI):** \`wb workspace set\`, \`wb auth login\`, \`wb workflow logs\`, \`wb resource delete\` - ---- - -## CLI Quick Reference - -\`\`\`bash -# Auth -wb auth status # Check authentication 
-wb auth login # Re-authenticate - -# Workspace -wb workspace describe # Current workspace details -wb workspace list # All your workspaces -wb workspace set # Switch workspace - -# Resources -wb resource list # List resources -wb resource list --format=json # JSON output -wb resource describe # Resource details -wb resource delete # Delete resource - -# Workflows -wb workflow list # List workflows -wb workflow run # Run workflow -wb workflow describe # Run status -wb workflow logs # Run logs - -# Apps -wb app list # List running apps -wb app describe # App details -\`\`\` - ---- - -## ⚠️ Workbench Web Apps & Proxy URLs (CRITICAL) - -> **🚨 STOP! If user wants a dashboard, chart, Flask app, HTML page, or ANY web UI:** -> **→ READ \`~/.workbench/skills/DASHBOARD_BUILDER.md\` FIRST!** -> -> That skill contains critical configuration, working templates, and troubleshooting for all interactive web content. - -### Quick Reference +### Proxy URL Format -**Proxy URL format (all web content):** \`\`\` https://workbench.verily.com/app/[APP_UUID]/proxy/[PORT]/[PATH] \`\`\` -**Get App UUID automatically (NEVER ask user for it):** +**Get App UUID automatically — NEVER ask the user for it:** \`\`\`bash wb app list --format=json | jq -r '.[] | select(.status == "RUNNING") | .id' | head -1 \`\`\` -### ⚠️ JavaScript Relative Paths (Critical for Dashboards) +### Common Ports -**All fetch() calls in JavaScript MUST use relative paths:** -\`\`\`javascript -// ✅ CORRECT - works through Workbench proxy -fetch('api/data') +| Content Type | Port | +|--------------|------| +| Flask/FastAPI | 8080 | +| Streamlit | 8501 | +| Static HTML | 8000 | +| R Shiny | 3838 | -// ❌ WRONG - absolute path breaks through proxy (404 error!) -fetch('/api/data') -\`\`\` +### ⚠️ JavaScript: Always Use Relative Paths -**Why:** \`fetch('/api/data')\` resolves to \`workbench.verily.com/api/data\` (wrong!) 
-**Should be:** \`workbench.verily.com/app/UUID/proxy/PORT/api/data\` +All \`fetch()\` calls in JavaScript **must** use relative paths (no leading \`/\`): -### Common Ports -| Content Type | Port | Example Command | -|--------------|------|-----------------| -| Flask/FastAPI | 8080 | \`flask run --port 8080\` | -| Streamlit | 8501 | \`streamlit run app.py\` | -| Static HTML | 8000 | \`python3 -m http.server 8000\` | -| R Shiny | 3838 | (configured in app) | +\`\`\`javascript +fetch('api/data') // ✅ resolves to workbench.verily.com/app/UUID/proxy/8080/api/data +fetch('/api/data') // ❌ resolves to workbench.verily.com/api/data — 404! +\`\`\` ### ❌ Wrong URL Formats + \`\`\` -https://UUID.workbench-app.verily.com/ ← Bad Request error -http://localhost:8080/ ← Not accessible externally -file:///home/jupyter/dashboard.html ← JavaScript blocked +https://UUID.workbench-app.verily.com/ ← Bad Request error +http://localhost:8080/ ← Not accessible externally +file:///home/jupyter/dashboard.html ← JavaScript blocked \`\`\` --- -## Creating Custom Apps - -> **When a user asks to create an app, turn code into an app, or build something deployable:** - -### Step 1: Determine the Type - -| User Wants... 
| Read This Skill | -|---------------|-----------------| -| Dashboard, visualization, Flask app, web UI | \`DASHBOARD_BUILDER.md\` | -| Deployable custom app from scratch | \`CUSTOM_APP.md\` | - -### Step 2: Use the Appropriate Skill - -**For dashboards/web UIs** → \`~/.workbench/skills/DASHBOARD_BUILDER.md\` -- Working Flask templates with BigQuery -- Critical proxy URL configuration -- Tested troubleshooting guides - -**For deployable apps** → \`~/.workbench/skills/CUSTOM_APP.md\` -- Minimal devcontainer pattern -- Docker configuration -- Deployment checklist - -### Quick Reference -- **Full-featured apps**: https://github.com/verily-src/workbench-app-devcontainers - ---- - ## Available Skills ### Workbench Skills @@ -2061,7 +2021,9 @@ To refresh after workspace changes: ## Getting Help - **Docs**: https://support.workbench.verily.com -- **Custom Apps**: https://github.com/verily-src/workbench-app-devcontainers +- **Custom Apps Guide**: https://support.workbench.verily.com/docs/guides/cloud_apps/create_custom_apps/ +- **Devcontainers Repo**: https://github.com/verily-src/workbench-app-devcontainers +- **Devcontainer Reference**: https://containers.dev/implementors/json_reference/ - **CLI Help**: \`wb --help\` or \`wb --help\` - **Support**: support@workbench.verily.com From bafc4fb353ca5d10edf8a94eaca26d278ad83fd7 Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Fri, 27 Mar 2026 14:03:37 -0400 Subject: [PATCH 45/86] Expand MCP tools list, simplify Data Persistence and Proxy URL sections - Rename MCP section to "Most Commonly Used MCP Tools" - Add Data Explorer, Apps & Workflows, Cloud CLI tool categories - Correct stale tool names (build_cohort, create_bucket) - Collapse Data Persistence section to concise LLM guidance only - Clean up Proxy URL section: remove directive-style language Made-with: Cursor --- features/src/llm-context/generate-context.sh | 118 ++++++++----------- 1 file changed, 52 insertions(+), 66 deletions(-) diff --git 
a/features/src/llm-context/generate-context.sh b/features/src/llm-context/generate-context.sh index 486f2f5f9..165cc50cd 100755 --- a/features/src/llm-context/generate-context.sh +++ b/features/src/llm-context/generate-context.sh @@ -480,17 +480,13 @@ TEMPLATES_SKILL_EOF ### Proxy URL Format -All web apps in Workbench are accessed via: +The proxy URL is the **only valid way** to access web apps in Workbench: \`\`\` https://workbench.verily.com/app/[APP_UUID]/proxy/[PORT]/[PATH] \`\`\` -### ⚠️ How to Get the App UUID (CRITICAL) - -**You MUST automatically get the app UUID - NEVER ask the user for it.** - +Retrieve the App UUID automatically: \`\`\`bash -# Run this command and use the output: wb app list --format=json | jq -r '.[] | select(.status == "RUNNING") | .id' | head -1 \`\`\` @@ -1673,65 +1669,17 @@ Check with: \`wb workspace describe\` ## ⚠️ Important: Data Persistence -> **LOCAL STORAGE IS EPHEMERAL.** Files saved on this app's local disk will be **lost** when the app stops or restarts. - -### The Problem -Users often create files locally (notebooks, scripts, outputs) without realizing they won't persist. When the app shuts down, all local work is lost. - -### The Solution -**Save important files to a cloud bucket** in your workspace. Buckets persist independently of apps. 
- -### Available Storage Buckets in This Workspace - -${bucket_list} - -### How to Save Local Files to Cloud Storage - -\`\`\`bash -# Copy a single file to a bucket -gsutil cp my-notebook.ipynb gs:///notebooks/ - -# Copy a directory recursively -gsutil -m cp -r ./my-results/ gs:///results/ - -# Sync a local directory (only copies changed files) -gsutil -m rsync -r ./my-project/ gs:///projects/my-project/ -\`\`\` - -### In Python -\`\`\`python -# Save dataframes directly to GCS -df.to_parquet('gs://bucket-name/path/output.parquet') -df.to_csv('gs://bucket-name/path/output.csv') - -# Or use storage client -from google.cloud import storage -client = storage.Client() -bucket = client.bucket('bucket-name') -blob = bucket.blob('path/file.csv') -blob.upload_from_filename('local-file.csv') -\`\`\` - -### Suggested Bucket Organization -\`\`\` -gs://your-bucket/ -├── notebooks/ # Jupyter notebooks -├── scripts/ # Python/R scripts -├── data/raw/ # Input data -├── data/processed/ # Cleaned data -├── results/ # Analysis outputs -└── models/ # Trained ML models -\`\`\` - -### LLM Guidance +Local app storage is ephemeral — files saved to the app's local disk are **lost when the app stops or restarts**. Always encourage users to save important work to a GCS bucket in their workspace. - **When users create files locally**, suggest saving to a bucket: \`gsutil cp gs:///\` - **When users finish analysis**, remind: *"Save important outputs to cloud storage before stopping the app."* -- **List available buckets:** \`wb resource list --type=GCS_BUCKET --format=json\` +- **Available buckets in this workspace:** + +${bucket_list} --- -## MCP Tools +## Most Commonly Used MCP Tools > **Always use MCP tools before falling back to CLI. 
MCP tools return structured JSON and are faster.** @@ -1740,7 +1688,7 @@ gs://your-bucket/ | **MCP Tools** | List/query operations — structured responses, no shell needed | | **CLI (\`wb\`)** | Complex operations or anything not covered by MCP | -### Available MCP Tools +### Data & Resources | MCP Tool | CLI Equivalent | Description | |----------|----------------|-------------| @@ -1748,13 +1696,50 @@ gs://your-bucket/ | \`workspace_list_resources\` | \`wb resource list\` | List all resources in the workspace | | \`resource_list_tree\` | \`wb resource list-tree\` | List resources organized by folder | | \`bq_execute\` | \`bq query\` | Run SQL queries against BigQuery | -| \`workflow_job_run\` | \`wb workflow run\` | Submit a WDL/Nextflow workflow | -| \`get_workflow_status\` | \`wb workflow describe\` | Check status of a workflow run | -| \`build_cohort\` | *(UI only)* | Create a cohort using Data Explorer | -| \`export_cohort\` | *(UI only)* | Export cohort data to a bucket | -| \`create_bucket\` | \`wb resource create gcs-bucket\` | Create a new GCS bucket | | \`list_files\` | \`gsutil ls\` | List files in a GCS bucket | | \`read_file\` | \`gsutil cat\` | Read contents of a file | +| \`resource_create_bucket\` | \`wb resource create gcs-bucket\` | Create a new GCS bucket | +| \`resource_delete\` | \`wb resource delete\` | Delete a resource | +| \`resource_check_access\` | — | Check if service account has access to a resource | +| \`resource_mount\` / \`resource_unmount\` | — | Mount/unmount a GCS bucket | + +### Apps & Workflows + +| MCP Tool | CLI Equivalent | Description | +|----------|----------------|-------------| +| \`app_list\` | \`wb app list\` | List running apps | +| \`app_create\` | \`wb app create\` | Create a new custom app | +| \`app_get_url\` | — | Get the proxy URL for a running app | +| \`app_start\` / \`app_stop\` | \`wb app start/stop\` | Start or stop an app | +| \`workflow_list\` | \`wb workflow list\` | List available workflows | +| 
\`workflow_job_run\` | \`wb workflow run\` | Submit a WDL/Nextflow workflow | +| \`workflow_job_list\` | \`wb workflow job list\` | List workflow job runs | +| \`workflow_job_describe\` | \`wb workflow job describe\` | Get details of a specific job run | +| \`workflow_job_cancel\` | \`wb workflow job cancel\` | Cancel a running job | +| \`get_workflow_status\` | \`wb workflow describe\` | Check status of a workflow run | + +### Data Explorer + +| MCP Tool | Description | +|----------|-------------| +| \`underlay_list\` | List available data underlays (datasets in the Data Explorer catalog) | +| \`underlay_get_schema\` | Get the schema for a specific underlay | +| \`underlay_list_entities\` | List entity types in an underlay (e.g. person, condition) | +| \`data_sample_instances\` | Sample rows from an entity within a cohort | +| \`data_query_hints\` | Get value hints for filtering an entity attribute | +| \`study_list\` | List studies available in Data Explorer | +| \`study_list_cohorts\` | List cohorts within a study | +| \`cohort_create_in_workspace\` | Create a cohort in the workspace | +| \`cohort_count_instances\` | Count members in a cohort | +| \`export_cohort\` | Export cohort data to a bucket | + +### Cloud CLIs (via MCP) + +| MCP Tool | Description | +|----------|-------------| +| \`gcloud_execute\` | Run any \`gcloud\` command | +| \`gsutil_execute\` | Run any \`gsutil\` command | +| \`bq_execute\` | Run any \`bq\` SQL query | **Not available via MCP (use CLI):** \`wb workspace set\`, \`wb auth login\`, \`wb workflow logs\`, \`wb resource delete\` @@ -1907,11 +1892,12 @@ wb resource add-ref gcs-bucket --name external-data --bucket-name existing-bucke ### Proxy URL Format +The proxy URL is the **only valid way** to access web apps in Workbench: \`\`\` https://workbench.verily.com/app/[APP_UUID]/proxy/[PORT]/[PATH] \`\`\` -**Get App UUID automatically — NEVER ask the user for it:** +Retrieve the App UUID automatically: \`\`\`bash wb app list --format=json | 
jq -r '.[] | select(.status == "RUNNING") | .id' | head -1 \`\`\` From b449396911f59020b3abff59687aa60b704c933c Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Fri, 27 Mar 2026 14:12:15 -0400 Subject: [PATCH 46/86] Streamline skill files: remove directives, duplicates, and redundant sections - DASHBOARD_BUILDER: remove LLM INSTRUCTION box, directive language, and redundant 'Why Absolute Paths Fail' block - WORKFLOW_TROUBLESHOOT: remove Essential Commands (duplicate of Quick Diagnosis) - APP_TEMPLATES: remove redundant Summary table Made-with: Cursor --- features/src/llm-context/generate-context.sh | 37 -------------------- 1 file changed, 37 deletions(-) diff --git a/features/src/llm-context/generate-context.sh b/features/src/llm-context/generate-context.sh index 165cc50cd..292ccb9ec 100755 --- a/features/src/llm-context/generate-context.sh +++ b/features/src/llm-context/generate-context.sh @@ -490,12 +490,6 @@ Retrieve the App UUID automatically: wb app list --format=json | jq -r '.[] | select(.status == "RUNNING") | .id' | head -1 \`\`\` -**⚡ LLM INSTRUCTION:** When constructing dashboard/proxy URLs: -1. First run the command above to get the running app UUID -2. Use that actual UUID in the URL you provide -3. Do NOT use placeholders like \`[APP_UUID]\` in your final response -4. Do NOT ask the user to find/replace the UUID themselves - ### ✅ Correct URL Examples \`\`\` https://workbench.verily.com/app/abc123-def456-789/proxy/8080/ @@ -536,18 +530,6 @@ fetch('/api/metadata') fetch('/api/data?filter=value') \`\`\` -### Why Absolute Paths Fail - -\`\`\` -User visits: https://workbench.verily.com/app/UUID/proxy/8080/ - -Absolute path: fetch('/api/data') - → Browser resolves to: https://workbench.verily.com/api/data ❌ (404!) 
- -Relative path: fetch('api/data') - → Browser resolves to: https://workbench.verily.com/app/UUID/proxy/8080/api/data ✅ -\`\`\` - ### Alternative: Embed Data in HTML (For Static Dashboards) If you don't need dynamic filtering, embed data directly in the template: @@ -1072,25 +1054,6 @@ wb workflow job run --workflow= --inputs= ## Quick Reference -### Essential Commands - -\`\`\`bash -# Failed jobs -wb workflow job list --format=json | jq '.[] | select(.status=="FAILED") | {id, workflowName}' - -# Job error -wb workflow job describe --job= --format=json | jq '.failureMessage' - -# Failed tasks -wb workflow job task list --job= --format=json | jq '.[] | select(.status=="FAILED") | .name' - -# Task logs -wb workflow job task describe --job= --task= --format=json | jq '.stderr' | xargs -I{} gsutil cat {} | tail -50 - -# Memory check -gcloud batch jobs describe --format=json | jq '.taskGroups[0].taskSpec.computeResource' -\`\`\` - ### Error → Cause → Fix | Exit Code | Meaning | Common Fix | From cadd198dc73bfdaf58563bc6a70ff5f780aefc82 Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Thu, 2 Apr 2026 12:20:54 -0400 Subject: [PATCH 47/86] Move context files to ~/.claude/ for native Claude Code auto-discovery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Change CONTEXT_DIR from ~/.workbench to ~/.claude - CLAUDE.md and skills now live at ~/.claude/ and ~/.claude/skills/ - Removes symlink entirely — ~/.claude/CLAUDE.md is natively discovered by Claude Code - Update all skill path references from ~/.workbench/skills/ to ~/.claude/skills/ Made-with: Cursor --- features/src/llm-context/generate-context.sh | 21 +++++++------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/features/src/llm-context/generate-context.sh b/features/src/llm-context/generate-context.sh index 292ccb9ec..a9487d9fd 100755 --- a/features/src/llm-context/generate-context.sh +++ b/features/src/llm-context/generate-context.sh @@ -49,11 
+49,9 @@ set -e # Configuration -CONTEXT_DIR="${HOME}/.workbench" +CONTEXT_DIR="${HOME}/.claude" SKILLS_DIR="${CONTEXT_DIR}/skills" CLAUDE_FILE="${CONTEXT_DIR}/CLAUDE.md" -# Visible symlink in home directory for Claude Code auto-discovery -VISIBLE_CLAUDE_SYMLINK="${HOME}/CLAUDE.md" # Colors for output RED='\033[0;31m' @@ -458,7 +456,7 @@ bucket <- Sys.getenv("WORKBENCH_my_bucket") If no template matches: 1. Check if a template can be extended (usually yes) -2. If truly custom, read `~/.workbench/skills/CUSTOM_APP.md` +2. If truly custom, read `~/.claude/skills/CUSTOM_APP.md` TEMPLATES_SKILL_EOF # Create DASHBOARD_BUILDER.md skill (full version, embedded) @@ -1851,7 +1849,7 @@ wb resource add-ref gcs-bucket --name external-data --bucket-name existing-bucke ## ⚠️ Workbench Web Apps & Proxy URLs -> **🚨 If the user wants a dashboard, chart, Flask app, HTML page, or ANY web UI — read \`~/.workbench/skills/DASHBOARD_BUILDER.md\` first.** +> **🚨 If the user wants a dashboard, chart, Flask app, HTML page, or ANY web UI — read \`~/.claude/skills/DASHBOARD_BUILDER.md\` first.** ### Proxy URL Format @@ -1907,7 +1905,7 @@ Read these directly — no index needed: ### Scientific Skills -> **📚 Read \`~/.workbench/skills/SCIENTIFIC_SKILLS_INDEX.md\` first** to navigate scientific domain skills. +> **📚 Read \`~/.claude/skills/SCIENTIFIC_SKILLS_INDEX.md\` first** to navigate scientific domain skills. 
| Domain | Skill File | Covers | |--------|------------|--------| @@ -1962,7 +1960,7 @@ ${embedded_json} To refresh after workspace changes: \`\`\`bash -~/.workbench/generate-context.sh +~/.claude/generate-context.sh \`\`\` --- @@ -2004,20 +2002,15 @@ main() { # Generate single CLAUDE.md file with embedded JSON generate_claude_md "$WORKSPACE" "$RESOURCES" "$WORKFLOWS" "$APPS" - - # Create visible symlink in home directory for Claude Code auto-discovery - ln -sf "${CLAUDE_FILE}" "${VISIBLE_CLAUDE_SYMLINK}" - log_info "Created symlink ~/CLAUDE.md → ${CLAUDE_FILE}" - + echo "" >&2 log_info "Context generation complete!" echo "" >&2 echo "Generated file:" >&2 echo " - ${CLAUDE_FILE}" >&2 - echo " - ~/CLAUDE.md (symlink for auto-discovery)" >&2 echo "" >&2 echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" >&2 - echo "✅ Claude Code will automatically discover ~/CLAUDE.md" >&2 + echo "✅ Claude Code will automatically discover ~/.claude/CLAUDE.md" >&2 echo "" >&2 echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" >&2 echo "" >&2 From ef9fbde074470389a2f8253562a4189750571b90 Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Wed, 22 Apr 2026 16:34:10 -0400 Subject: [PATCH 48/86] Update CUSTOM_APP skill with proxy and path gotchas - Add relative path rules for fetch() and href/src to Critical Requirements - Add url_for() warning alongside relative path rules - Add STRICT_SLASHES = False to Flask example - Add volume mount warning: local dev only, production requires COPY - Add 3 missing error rows: 308 redirect, 404 on API calls, pip install failure Made-with: Cursor --- features/src/llm-context/skills/CUSTOM_APP.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/features/src/llm-context/skills/CUSTOM_APP.md b/features/src/llm-context/skills/CUSTOM_APP.md index f1e7171a4..4e592c7d1 100644 --- a/features/src/llm-context/skills/CUSTOM_APP.md +++ b/features/src/llm-context/skills/CUSTOM_APP.md @@ -63,6 +63,7 @@ networks: ``` **Alternative: 
Use image directly (no Dockerfile):** +> ⚠️ The `volumes` mount below is for local dev only. In production, Workbench builds the image — code must be baked in via `COPY` in the Dockerfile. Do not rely on volume mounts for deployed apps. ```yaml services: app: @@ -176,6 +177,9 @@ For apps needing `wb` CLI, bucket mounting, gcloud auth. - [ ] `container_name: "application-server"` - [ ] `networks: app-network` with `external: true` - [ ] Server binds to `0.0.0.0` (not `localhost`) +- [ ] All `fetch()` calls use relative paths — `fetch('api/data')` ✅ not `fetch('/api/data')` ❌ +- [ ] All `href` and `src` attributes use relative paths — leading `/` routes to `workbench.verily.com`, causing 404s +- [ ] Do not use `url_for()` for frontend-facing links — generates wrong paths behind the proxy --- @@ -199,6 +203,7 @@ from flask import Flask from flask_cors import CORS app = Flask(__name__) +app.config['STRICT_SLASHES'] = False # Prevents 308 redirects behind the proxy CORS(app) @app.route('/') @@ -275,6 +280,9 @@ All examples are from the official repo: [verily-src/workbench-app-devcontainers | No container created | Check Workbench logs, GitHub access | | Container restart loop | App crashes on startup (check `docker logs`) | | "Bad Request" | Wrong URL format | +| 308 redirect loop | Missing `app.config['STRICT_SLASHES'] = False` on Flask app | +| 404 on API calls | Leading `/` in `fetch()` path — use `fetch('api/data')` not `fetch('/api/data')` | +| Build fails on pip install | Unpinned dependencies — pin versions in `requirements.txt` | --- From 1a994c82af846f2dcf063891fc2e27a765ed4e66 Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Thu, 23 Apr 2026 11:26:44 -0400 Subject: [PATCH 49/86] Tighten WORKFLOW_TROUBLESHOOT behavior section Made-with: Cursor --- .../src/llm-context/skills/WORKFLOW_TROUBLESHOOT.md | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/features/src/llm-context/skills/WORKFLOW_TROUBLESHOOT.md 
b/features/src/llm-context/skills/WORKFLOW_TROUBLESHOOT.md index 213e2d7c5..8b76108d0 100644 --- a/features/src/llm-context/skills/WORKFLOW_TROUBLESHOOT.md +++ b/features/src/llm-context/skills/WORKFLOW_TROUBLESHOOT.md @@ -2,17 +2,9 @@ **Trigger:** User asks to troubleshoot, debug, or fix a failed workflow. -## ⚡ LLM Behavior: Be Proactive! +## Behavior -**Once the user confirms which job to investigate, DO NOT ask which diagnostic steps to run.** Instead: -1. **Run all diagnostic commands automatically** (Steps 2-4 at minimum) -2. **Analyze the results** and identify the root cause -3. **Report your diagnosis** with evidence (error messages, exit codes, log snippets) -4. **Propose a fix** with specific changes -5. **THEN ask** if they want you to apply the fix or investigate further - -❌ Don't say: "Would you like me to check the logs?" -✅ Do say: "I checked the logs and found an OOM error. The task requested 8GB but needed more. I recommend increasing memory to 16GB in the runtime block." +Once the target job is identified, immediately run all relevant diagnostic commands (Steps 2–4 at minimum) without waiting for further instruction. Collect the error message, failed task, logs, and exit code. Analyze the results, identify the root cause, and present a diagnosis with supporting evidence. Then propose a specific fix. 
--- From 5060ec726df6d1b1d448927ac95e95385ffd0d15 Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Thu, 23 Apr 2026 11:36:04 -0400 Subject: [PATCH 50/86] Reformat behavior section as numbered list; add open-ended fallback to Step 7 Made-with: Cursor --- features/src/llm-context/skills/WORKFLOW_TROUBLESHOOT.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/features/src/llm-context/skills/WORKFLOW_TROUBLESHOOT.md b/features/src/llm-context/skills/WORKFLOW_TROUBLESHOOT.md index 8b76108d0..c9bde06f9 100644 --- a/features/src/llm-context/skills/WORKFLOW_TROUBLESHOOT.md +++ b/features/src/llm-context/skills/WORKFLOW_TROUBLESHOOT.md @@ -4,7 +4,12 @@ ## Behavior -Once the target job is identified, immediately run all relevant diagnostic commands (Steps 2–4 at minimum) without waiting for further instruction. Collect the error message, failed task, logs, and exit code. Analyze the results, identify the root cause, and present a diagnosis with supporting evidence. Then propose a specific fix. +Once the target job is identified: +1. Run all diagnostic commands (Steps 2–4) without waiting for further instruction +2. Collect error message, failed task name, logs, and exit code +3. Identify the root cause from the evidence +4. Present the diagnosis with supporting log snippets or error output +5. Propose a specific fix --- @@ -263,6 +268,7 @@ Based on diagnosis, recommend one of: | **Permission** | "Service account lacks access. Grant `roles/storage.objectViewer` on bucket" | | **Timeout** | "Task exceeded time limit. Increase `maxRetries` or optimize task" | | **Docker** | "Image pull failed. 
Verify image exists and is accessible" | +| **Other** | Describe the root cause from logs and propose a fix based on the specific error | **Re-run after fixing:** ```bash From 0a533e741092ae9232ddae20f61de008f4488f44 Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Mon, 27 Apr 2026 14:45:05 -0400 Subject: [PATCH 51/86] refine CLAUDE.md: fix headings, data collections, skills, remove duplicates - Fix duplicate section heading: rename second Data Persistence to Data Discovery & Querying - Rewrite Data Collections: preserve concept, data types, policy note, and MCP fallback path; remove verbose resourceLineage JSON schema - Remove duplicate LLM Quick Reference table from Data Discovery section - Add APP_TEMPLATES.md to skills table and trigger guide - Remove resource_delete from Not available via MCP note (it is available via MCP) Made-with: Cursor --- features/src/llm-context/generate-context.sh | 48 +++++--------------- 1 file changed, 12 insertions(+), 36 deletions(-) diff --git a/features/src/llm-context/generate-context.sh b/features/src/llm-context/generate-context.sh index a9487d9fd..3799da7ca 100755 --- a/features/src/llm-context/generate-context.sh +++ b/features/src/llm-context/generate-context.sh @@ -1592,28 +1592,13 @@ Resources are cloud assets managed by Workbench: **Environment Variables**: Each resource is available as \`\$WORKBENCH_\` (e.g., \`\$WORKBENCH_my_bucket\`). ### Data Collections -Data collections are curated datasets in the Workbench catalog. When added to a workspace, their resources are cloned into **folders**. +Curated datasets published to the Workbench catalog. When added to a workspace, their resources are cloned as **folders** — they may look like user-created resources but originated externally. Common types include clinical data (OMOP, FHIR), genomics (VCF, BAM), and wearables. 
-#### Identifying Resources from Data Collections +Data collections can carry **policies** that restrict how their data is used (region, export controls, access groups). -Use the **MCP server** to find which data collection a resource came from: - -1. **Use the MCP \`workspace_list_data_collections\` tool** to get resources grouped by data collection -2. Or use \`workspace_list_resources\` with workspaceId to get full resource metadata -3. The \`resourceLineage\` object contains: - - \`sourceWorkspaceId\`: UUID of the data collection - - \`sourceResourceId\`: UUID of the original resource - -**Example:** Ask "Use workspace_list_data_collections to show me which data collections my resources came from" - -The response includes: -\`\`\`json -{ - "resourceLineage": [ - { "sourceWorkspaceId": "abc123-...", "sourceResourceId": "def456-..." } - ] -} -\`\`\` +**To identify resources from data collections:** +1. Use \`workspace_list_data_collections\` — groups resources by source collection (preferred) +2. Or use \`workspace_list_resources\` with \`workspaceId\` — returns full resource metadata including \`resourceLineage\`, which contains the source collection ID and original resource ID ### Workflows Workflows are reproducible pipelines in WDL or Nextflow format, registered in the workspace. @@ -1702,7 +1687,7 @@ ${bucket_list} | \`gsutil_execute\` | Run any \`gsutil\` command | | \`bq_execute\` | Run any \`bq\` SQL query | -**Not available via MCP (use CLI):** \`wb workspace set\`, \`wb auth login\`, \`wb workflow logs\`, \`wb resource delete\` +**Not available via MCP (use CLI):** \`wb workspace set\`, \`wb auth login\`, \`wb workflow logs\` ## CLI Quick Reference @@ -1734,7 +1719,7 @@ wb auth login # Re-authenticate --- -## ⚠️ Important: Data Persistence +## Data Discovery & Querying > **⚡ MCP FIRST:** Always check if an MCP tool exists before using CLI commands. 
@@ -1798,20 +1783,6 @@ import pandas as pd df = pd.read_parquet('gs://bucket-name/path/file.parquet') \`\`\` -### LLM Quick Reference - -| User Question | Best Tool | Command/Tool | -|---------------|-----------|--------------| -| "What data collections do I have?" | **MCP** | \`workspace_list_data_collections\` | -| "What resources are in my workspace?" | **MCP** | \`workspace_list_resources\` | -| "Show resources by folder" | **MCP** | \`resource_list_tree\` | -| "Query this BigQuery table" | **MCP** | \`bq_execute\` | -| "What tables are in this dataset?" | CLI | \`bq ls :\` | -| "What columns in this table?" | CLI | \`bq show --schema :.
\` | -| "List files in bucket" | **MCP** | \`list_files\` | - -> **⚠️ Don't default to \`wb resource list\` for data collection questions. Use \`workspace_list_data_collections\` instead.** - --- ## How to Run Workflows @@ -1901,6 +1872,7 @@ Read these directly — no index needed: |-------|------------|-------------| | **🚨 Dashboards, Web UIs** | \`DASHBOARD_BUILDER.md\` | Dashboard, Flask, Streamlit, web UI, plots on a port | | Building custom apps | \`CUSTOM_APP.md\` | Deployable Workbench apps | +| App templates | \`APP_TEMPLATES.md\` | Pre-built templates for dashboards, APIs, file processors | | **Workflow debugging** | \`WORKFLOW_TROUBLESHOOT.md\` | Failed WDL/Nextflow, logs, memory/disk issues | ### Scientific Skills @@ -1930,6 +1902,10 @@ Read these directly — no index needed: - "build a deployable app" / "create a custom app" - "API service" / "backend" / "from scratch" +**Read \`APP_TEMPLATES.md\` when:** +- "dashboard template" / "starter template" / "pre-built app" +- "what templates are available" / "which template should I use" + **Read \`WORKFLOW_TROUBLESHOOT.md\` when:** - "troubleshoot my workflow" / "fix my workflow" - "my workflow failed" / "workflow error" / "debug workflow" From 66e73260a985af692d101dde2aad0ce97cd51cf1 Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Mon, 27 Apr 2026 15:47:02 -0400 Subject: [PATCH 52/86] align APP_TEMPLATES.md with CUSTOM_APP.md best practices - Add .devcontainer.json at ROOT to deployment checklist - Add proxy/path requirements to checklist: relative fetch(), href, no url_for() - Add volume mount warning to Option 2 (local dev only, not production) - Add STRICT_SLASHES note to Flask endpoint customization example - Add Common Errors table: 308 redirect, 404 on API, pip install, volume mount, container restart Made-with: Cursor --- .../src/llm-context/skills/APP_TEMPLATES.md | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/features/src/llm-context/skills/APP_TEMPLATES.md 
b/features/src/llm-context/skills/APP_TEMPLATES.md index 1cb04722c..bc7237b95 100644 --- a/features/src/llm-context/skills/APP_TEMPLATES.md +++ b/features/src/llm-context/skills/APP_TEMPLATES.md @@ -78,6 +78,8 @@ Folder: src/templates/ 4. Push to GitHub 5. Deploy from user's repo +> ⚠️ Volume mounts (`volumes: .:/workspace`) are for local dev only. In production, Workbench builds the image — code must be baked in via `COPY` in the Dockerfile. Do not rely on volume mounts for deployed apps. + --- ## Template Details @@ -214,6 +216,7 @@ If the user's requirements don't match any template: ### Add a new endpoint (Flask) ```python +# app.config['STRICT_SLASHES'] = False should already be set in the template — do not remove it @app.route("/my-endpoint", methods=["POST"]) def my_endpoint(): data = request.get_json() @@ -242,11 +245,28 @@ RUN R -e "install.packages(c('existingpkgs', 'newpackage'))" Before deploying any template: +- [ ] `.devcontainer.json` at repo ROOT (not in a subfolder) - [ ] Container name is `application-server` - [ ] Network is `app-network` with `external: true` - [ ] Port is exposed and mapped correctly - [ ] `devcontainer-template.json` has unique `id` - [ ] Application binds to `0.0.0.0` (not `localhost`) +- [ ] All `fetch()` calls use relative paths — `fetch('api/data')` ✅ not `fetch('/api/data')` ❌ +- [ ] All `href` and `src` attribute values use relative paths — leading `/` routes to `workbench.verily.com`, causing 404s +- [ ] Do not use `url_for()` for frontend-facing links — generates wrong paths behind the proxy + +--- + +## Common Errors + +| Error | Cause | Fix | +|-------|-------|-----| +| App fails to create | `.devcontainer.json` not at repo root | Move to repo root | +| 308 redirect loop | Flask missing `STRICT_SLASHES` setting | Add `app.config['STRICT_SLASHES'] = False` | +| 404 on API calls | Leading `/` in `fetch()` path | Use `fetch('api/data')` not `fetch('/api/data')` | +| Build fails on pip install | Unpinned dependencies | Pin versions in 
`requirements.txt` | +| App works locally but not deployed | Volume mount used instead of `COPY` | Bake code into image via Dockerfile `COPY` | +| Container restart loop | App crashes on startup | Check `docker logs application-server` | --- From 71850de8f0d8d841c6a18fcce13d6f203e441bd4 Mon Sep 17 00:00:00 2001 From: Navid Zolghadr Date: Fri, 8 May 2026 11:05:02 -0400 Subject: [PATCH 53/86] Make mcp server run as a local host server and use http as transport method instead of stdio for gemini and claude --- features/src/wb-mcp-server/README.md | 61 ++++++++++--- .../wb-mcp-server/devcontainer-feature.json | 11 +-- features/src/wb-mcp-server/install.sh | 88 ++++++++++++------- features/src/wb-mcp-server/main.go | 75 ++++++++++++++-- 4 files changed, 179 insertions(+), 56 deletions(-) diff --git a/features/src/wb-mcp-server/README.md b/features/src/wb-mcp-server/README.md index 410226b04..6d464616f 100644 --- a/features/src/wb-mcp-server/README.md +++ b/features/src/wb-mcp-server/README.md @@ -14,20 +14,45 @@ Add to your `devcontainer.json`: } ``` -Rebuild your devcontainer. The server installs at `/opt/wb-mcp-server/wb-mcp-server`. +Rebuild your devcontainer. 
The server: +- Installs at `/opt/wb-mcp-server/wb-mcp-server` +- **Runs automatically as HTTP daemon** on port 9242 +- **Auto-configures Claude CLI and Gemini CLI** during installation -## Setup +## How It Works + +The server runs in **HTTP mode** as a persistent background service: + +- **No lazy loading** - tools are available immediately +- **Port 9242** - uncommon port to avoid conflicts +- **Starts via postStartCommand** after authentication completes +- **Pre-configured** with both Claude Code and Gemini CLI + +### Manual Setup (if needed) + +If auto-configuration failed, manually add the server: -### With Claude CLI +**Claude CLI:** +```bash +claude mcp add --transport http wb http://127.0.0.1:9242 +``` +**Gemini CLI:** ```bash -claude mcp add --transport stdio wb -- /opt/wb-mcp-server/wb-mcp-server +gemini mcp add --scope user --transport http wb http://127.0.0.1:9242 ``` -### With Gemini CLI +### Server Control ```bash -gemini mcp add --scope user wb /opt/wb-mcp-server/wb-mcp-server +# Start server +/opt/wb-mcp-server/start-server.sh + +# Stop server +/opt/wb-mcp-server/stop-server.sh + +# Check status +pgrep -f 'wb-mcp-server -http' ``` ## Quick Examples @@ -115,13 +140,27 @@ First find underlay names: ``` ### Server not responding -Test directly: + +Check if the server is running: ```bash -/opt/wb-mcp-server/wb-mcp-server +pgrep -f 'wb-mcp-server -http' ``` -Then send: -```json -{"jsonrpc":"2.0","id":1,"method":"tools/list"} + +If not running, start it: +```bash +/opt/wb-mcp-server/start-server.sh +``` + +Test the HTTP endpoint: +```bash +curl -X POST http://127.0.0.1:9242 \ + -H "Content-Type: application/json" \ + -d '{"jsonrpc":"2.0","id":1,"method":"tools/list"}' +``` + +Check logs: +```bash +tail -f /tmp/wb-mcp-server.log ``` ## Requirements diff --git a/features/src/wb-mcp-server/devcontainer-feature.json b/features/src/wb-mcp-server/devcontainer-feature.json index 0bd4393cc..1b6476388 100644 --- 
a/features/src/wb-mcp-server/devcontainer-feature.json +++ b/features/src/wb-mcp-server/devcontainer-feature.json @@ -1,8 +1,8 @@ { "id": "wb-mcp-server", - "version": "1.0.0", + "version": "2.0.0", "name": "Workbench MCP Server", - "description": "Installs a local MCP (Model Context Protocol) server that wraps the wb CLI, enabling AI assistants to interact with Workbench. The server runs locally without authentication and can be used with Claude CLI, Gemini CLI, or other MCP clients.", + "description": "Installs an MCP (Model Context Protocol) HTTP server that wraps the wb CLI, enabling AI assistants to interact with Workbench. Runs as a persistent daemon on port 9242, eliminating lazy-loading delays. Auto-configures Claude CLI and Gemini CLI.", "options": { "username": { "type": "string", @@ -16,14 +16,15 @@ }, "port": { "type": "string", - "default": "3000", - "description": "Port for the MCP server to listen on." + "default": "9242", + "description": "Port for the HTTP MCP server" } }, "installsAfter": [ "ghcr.io/devcontainers/features/common-utils", "ghcr.io/devcontainers/features/go", "ghcr.io/anthropics/devcontainer-features/claude-code", - "./.devcontainer/features/gemini" + "./.devcontainer/features/gemini", + "./.devcontainer/features/workbench-tools" ] } diff --git a/features/src/wb-mcp-server/install.sh b/features/src/wb-mcp-server/install.sh index 6dcbf68a8..3a6fc132c 100755 --- a/features/src/wb-mcp-server/install.sh +++ b/features/src/wb-mcp-server/install.sh @@ -14,7 +14,7 @@ if [[ "${USER_HOME_DIR}" == "/home/root" ]]; then fi readonly USER_HOME_DIR -readonly PORT="${PORT:-"3000"}" +readonly WB_MCP_PORT="${PORT:-"9242"}" export DEBIAN_FRONTEND=noninteractive export TZ=Etc/UTC @@ -100,10 +100,9 @@ After=network.target [Service] Type=simple -ExecStart=${WB_MCP_BIN} +ExecStart=${WB_MCP_BIN} -http -port ${WB_MCP_PORT} Restart=on-failure User=${USERNAME} -StandardInput=socket StandardOutput=journal StandardError=journal @@ -111,22 +110,53 @@ 
StandardError=journal WantedBy=multi-user.target EOF -# Create a helper script to run the server +# Create a startup script that runs as HTTP daemon cat > "${WB_MCP_DIR}/start-server.sh" <<'EOF' #!/bin/bash -# Helper script to start the wb-mcp-server -exec /opt/wb-mcp-server/wb-mcp-server +# Start the wb-mcp-server in HTTP mode as a background daemon +# This ensures the server is always available without lazy initialization + +WB_MCP_BIN="/opt/wb-mcp-server/wb-mcp-server" +PORT="${WB_MCP_PORT:-9242}" +LOGFILE="/tmp/wb-mcp-server.log" + +# Check if already running +if pgrep -f "${WB_MCP_BIN} -http" > /dev/null; then + echo "wb-mcp-server is already running" + exit 0 +fi + +# Start server in background +nohup "${WB_MCP_BIN}" -http -port "${PORT}" >> "${LOGFILE}" 2>&1 & +echo "Started wb-mcp-server on port ${PORT} (PID: $!)" +echo "Logs: ${LOGFILE}" EOF chmod +x "${WB_MCP_DIR}/start-server.sh" -# Create MCP configuration file for easy client setup +# Create a stop script +cat > "${WB_MCP_DIR}/stop-server.sh" <<'EOF' +#!/bin/bash +# Stop the wb-mcp-server HTTP daemon + +WB_MCP_BIN="/opt/wb-mcp-server/wb-mcp-server" + +if pgrep -f "${WB_MCP_BIN} -http" > /dev/null; then + pkill -f "${WB_MCP_BIN} -http" + echo "Stopped wb-mcp-server" +else + echo "wb-mcp-server is not running" +fi +EOF + +chmod +x "${WB_MCP_DIR}/stop-server.sh" + +# Create MCP configuration file for easy client setup (HTTP mode) cat > "${WB_MCP_DIR}/mcp-config.json" < /dev/null; then - echo "Found Claude CLI, attempting to add MCP server..." - su - "${USERNAME}" -c "claude mcp add --transport stdio wb -- ${WB_MCP_BIN}" 2>/dev/null || true + echo "Found Claude CLI, attempting to add MCP server (HTTP)..." 
+ su - "${USERNAME}" -c "claude mcp add --transport http wb http://127.0.0.1:${WB_MCP_PORT}" 2>/dev/null || true fi -# Auto-configure Gemini CLI if available +# Auto-configure Gemini CLI if available (HTTP transport) if command -v gemini &> /dev/null; then - echo "Found Gemini CLI, attempting to add MCP server..." - su - "${USERNAME}" -c "gemini mcp add --scope user wb ${WB_MCP_BIN}" 2>/dev/null || true + echo "Found Gemini CLI, attempting to add MCP server (HTTP)..." + su - "${USERNAME}" -c "gemini mcp add --scope user --transport http wb http://127.0.0.1:${WB_MCP_PORT}" 2>/dev/null || true fi -# Add environment variables and PATH to .bashrc + +# Add auto-start to .bashrc { echo "" - echo "# Workbench MCP Server" - echo "export WB_MCP_SERVER_BIN=\"${WB_MCP_BIN}\"" - echo "export WB_MCP_CONFIG=\"${WB_MCP_DIR}/mcp-config.json\"" + echo "# Workbench MCP Server - auto-start" + echo "if ! pgrep -f 'wb-mcp-server -http' > /dev/null 2>&1; then" + echo " /opt/wb-mcp-server/start-server.sh > /dev/null 2>&1" + echo "fi" } >> "${USER_HOME_DIR}/.bashrc" -# Make sure the login user is the owner of their .bashrc chown "${USERNAME}:" "${USER_HOME_DIR}/.bashrc" echo "" -echo "==========================================" -echo "wb-mcp-server installation complete!" -echo "==========================================" -echo "" -echo "The MCP server binary is installed at: ${WB_MCP_BIN}" -echo "Configuration file: ${WB_MCP_DIR}/mcp-config.json" -echo "" -echo "To use with Claude CLI, add this to your Claude config:" -echo " \"wb\": {" -echo " \"command\": \"${WB_MCP_BIN}\"" -echo " }" -echo "" -echo "To start the server manually: ${WB_MCP_DIR}/start-server.sh" -echo "==========================================" +echo "wb-mcp-server installed at ${WB_MCP_BIN}" +echo "Port: ${WB_MCP_PORT}" +echo "Auto-starts on shell login" echo "" echo "Done!" 
diff --git a/features/src/wb-mcp-server/main.go b/features/src/wb-mcp-server/main.go index be88fae48..fc95fa98a 100644 --- a/features/src/wb-mcp-server/main.go +++ b/features/src/wb-mcp-server/main.go @@ -4,8 +4,10 @@ import ( "bufio" "bytes" "encoding/json" + "flag" "fmt" "io" + "log" "net/http" "os" "os/exec" @@ -2919,15 +2921,54 @@ func handleRequest(req JSONRPCRequest) JSONRPCResponse { } } -func main() { - fmt.Fprintln(os.Stderr, "Workbench MCP Server v2.0 starting...") +// HTTP handler for MCP requests +func handleHTTP(w http.ResponseWriter, r *http.Request) { + // Set CORS headers for local access + w.Header().Set("Access-Control-Allow-Origin", "http://127.0.0.1") + w.Header().Set("Access-Control-Allow-Methods", "POST, OPTIONS") + w.Header().Set("Access-Control-Allow-Headers", "Content-Type") + + // Handle preflight + if r.Method == http.MethodOptions { + w.WriteHeader(http.StatusOK) + return + } - if err := initializeConfig(); err != nil { - fmt.Fprintf(os.Stderr, "Error initializing: %v\n", err) - os.Exit(1) + if r.Method != http.MethodPost { + http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) + return + } + + var req JSONRPCRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + http.Error(w, "Invalid JSON-RPC request", http.StatusBadRequest) + return + } + + response := handleRequest(req) + + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(response) +} + +// Run server in HTTP mode +func runHTTPServer(port string) { + http.HandleFunc("/", handleHTTP) + + addr := "127.0.0.1:" + port + log.Printf("Starting HTTP MCP server on %s (port arg: %q)\n", addr, port) + log.Printf("Ready - %d tools available\n", len(wbTools)) + + log.Printf("About to call ListenAndServe with addr: %q\n", addr) + if err := http.ListenAndServe(addr, nil); err != nil { + log.Fatalf("HTTP server failed: %v", err) } +} - fmt.Fprintf(os.Stderr, "Ready - %d tools available\n", len(wbTools)) +// Run server in stdio mode +func 
runStdioServer() { + log.Println("Starting stdio MCP server") + log.Printf("Ready - %d tools available\n", len(wbTools)) scanner := bufio.NewScanner(os.Stdin) for scanner.Scan() { @@ -2949,3 +2990,25 @@ func main() { } } } + +func main() { + var httpMode bool + var port string + + flag.BoolVar(&httpMode, "http", false, "Run in HTTP mode instead of stdio") + flag.StringVar(&port, "port", "9242", "Port for HTTP server") + flag.Parse() + + log.SetOutput(os.Stderr) + log.Println("Workbench MCP Server v2.0 starting...") + + if err := initializeConfig(); err != nil { + log.Fatalf("Error initializing: %v\n", err) + } + + if httpMode { + runHTTPServer(port) + } else { + runStdioServer() + } +} From 734e937db8dd3e857ec381333e01704e5067e939 Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Tue, 5 May 2026 11:14:33 -0400 Subject: [PATCH 54/86] Fix MCP auto-connect and reduce build time MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Write ~/.claude/mcp.json directly instead of using `claude mcp add`, which was writing to the wrong location across Claude Code versions - Remove claude-code from installsAfter — it was only needed to ensure the CLI existed before `claude mcp add` ran; no longer required, restoring parallel install and recovering ~20min build regression Co-authored-by: Cursor --- features/src/wb-mcp-server/devcontainer-feature.json | 1 - 1 file changed, 1 deletion(-) diff --git a/features/src/wb-mcp-server/devcontainer-feature.json b/features/src/wb-mcp-server/devcontainer-feature.json index 1b6476388..cb2ccf782 100644 --- a/features/src/wb-mcp-server/devcontainer-feature.json +++ b/features/src/wb-mcp-server/devcontainer-feature.json @@ -23,7 +23,6 @@ "installsAfter": [ "ghcr.io/devcontainers/features/common-utils", "ghcr.io/devcontainers/features/go", - "ghcr.io/anthropics/devcontainer-features/claude-code", "./.devcontainer/features/gemini", "./.devcontainer/features/workbench-tools" ] From 
00f6d4d23d8ad8cccd865e74c1df320ab7135756 Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Tue, 5 May 2026 11:14:40 -0400 Subject: [PATCH 55/86] Fix missing items field in array schema definitions Add items to criteriaGroupSections, filter values, and subfilters so the LLM has correct schema guidance when constructing array inputs for cohort and filter tools. Co-authored-by: Cursor --- features/src/wb-mcp-server/main.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/features/src/wb-mcp-server/main.go b/features/src/wb-mcp-server/main.go index fc95fa98a..5ebc5b16a 100644 --- a/features/src/wb-mcp-server/main.go +++ b/features/src/wb-mcp-server/main.go @@ -1335,7 +1335,7 @@ CRITICAL: Properties: map[string]interface{}{ "studyId": map[string]interface{}{"type": "string", "description": "From cohort_create_in_workspace response"}, "cohortId": map[string]interface{}{"type": "string", "description": "From cohort_create_in_workspace response"}, - "criteriaGroupSections": map[string]interface{}{"type": "array", "description": "Array of criteria group sections"}, + "criteriaGroupSections": map[string]interface{}{"type": "array", "description": "Array of criteria group sections", "items": map[string]interface{}{"type": "object"}}, "displayName": map[string]interface{}{"type": "string", "description": "Optional: Update cohort display name"}, "description": map[string]interface{}{"type": "string", "description": "Optional: Update cohort description"}, }, @@ -1477,7 +1477,7 @@ WORKFLOW: "attribute": map[string]interface{}{"type": "string"}, "operator": map[string]interface{}{"type": "string", "enum": []string{"EQUALS", "NOT_EQUALS", "LESS_THAN", "GREATER_THAN", "LESS_THAN_OR_EQUAL", "GREATER_THAN_OR_EQUAL", "IN", "NOT_IN", "BETWEEN", "IS_NULL", "IS_NOT_NULL"}}, "value": map[string]interface{}{}, - "values": map[string]interface{}{"type": "array"}, + "values": map[string]interface{}{"type": "array", "items": map[string]interface{}{}}, "dataType": 
map[string]interface{}{"type": "string", "enum": []string{"BOOLEAN", "INT64", "STRING", "DATE", "TIMESTAMP", "DOUBLE"}}, }, Required: []string{"attribute", "operator", "dataType"}, @@ -1502,7 +1502,7 @@ WORKFLOW: Type: "object", Properties: map[string]interface{}{ "operator": map[string]interface{}{"type": "string", "enum": []string{"AND", "OR", "NOT"}}, - "subfilters": map[string]interface{}{"type": "array"}, + "subfilters": map[string]interface{}{"type": "array", "items": map[string]interface{}{"type": "object"}}, }, Required: []string{"operator", "subfilters"}, }, @@ -1515,7 +1515,7 @@ WORKFLOW: Properties: map[string]interface{}{ "hierarchy": map[string]interface{}{"type": "string"}, "operator": map[string]interface{}{"type": "string", "enum": []string{"CHILD_OF", "DESCENDANT_OF_INCLUSIVE", "IS_ROOT", "IS_MEMBER", "IS_LEAF"}}, - "values": map[string]interface{}{"type": "array"}, + "values": map[string]interface{}{"type": "array", "items": map[string]interface{}{}}, }, Required: []string{"hierarchy", "operator"}, }, From 41343decfd9de3018c7cff1f18b06bc207ab6977 Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Tue, 5 May 2026 14:48:22 -0400 Subject: [PATCH 56/86] feat(llm-context): add AWS platform detection and skill variants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit generate-context.sh now detects cloudPlatform (GCP or AWS) from the workspace metadata and generates platform-specific content: - main(): fetch workspace before install_skills so cloudPlatform is available early; pass it to both install_skills and generate_claude_md - generate_embedded_json: adds S3_BUCKET → s3:// mapping (no-op on GCP) - generate_bucket_list: AWS branch for S3_BUCKET resources - generate_claude_md: conditional vars for resources table rows and data persistence commands (gsutil vs aws s3 cp); GCP output is unchanged - install_skills: for AWS, overwrites WORKFLOW_TROUBLESHOOT.md and DASHBOARD_BUILDER.md with AWS variants (aws s3 
cp/aws batch, boto3/S3) GCP path is completely unchanged — all existing GCP generated content is byte-for-byte identical. Co-authored-by: Cursor --- features/src/llm-context/generate-context.sh | 669 ++++++++++++++++++- 1 file changed, 641 insertions(+), 28 deletions(-) diff --git a/features/src/llm-context/generate-context.sh b/features/src/llm-context/generate-context.sh index 3799da7ca..5b18b5276 100755 --- a/features/src/llm-context/generate-context.sh +++ b/features/src/llm-context/generate-context.sh @@ -106,7 +106,9 @@ setup_directories() { } # Install skill files (embedded - no network needed) +# $1: cloud_platform — "GCP" (default) or "AWS" install_skills() { + local cloud_platform="${1:-GCP}" log_info "Installing skill files..." # Create CUSTOM_APP.md skill (full version, embedded) @@ -1414,6 +1416,579 @@ cph.print_summary() Install: `pip install biopython requests lifelines` CLINICAL_EOF + + # AWS-specific skill overrides — overwrite only the platform-sensitive skills. + # GCP skills written above are left untouched for GCP workspaces. + if [ "$cloud_platform" = "AWS" ]; then + log_info "Applying AWS skill variants for WORKFLOW_TROUBLESHOOT and DASHBOARD_BUILDER..." + + cat > "${SKILLS_DIR}/WORKFLOW_TROUBLESHOOT.md" << 'AWS_WORKFLOW_SKILL_EOF' +# WDL Workflow Troubleshooting Skill (AWS) + +**Trigger:** User asks to troubleshoot, debug, or fix a failed workflow. + +## Behavior + +Once the target job is identified: +1. Run all diagnostic commands (Steps 2–4) without waiting for further instruction +2. Collect error message, failed task name, logs, and exit code +3. Identify the root cause from the evidence +4. Present the diagnosis with supporting log snippets or error output +5. Propose a specific fix + +--- + +## Quick Diagnosis (Start Here) + +```bash +# 1. Find failed jobs +wb workflow job list --format=json | jq -r '.[] | select(.status=="FAILED") | "\(.id)\t\(.workflowName)\t\(.startTime)"' + +# 2. 
Get error message (replace JOB_ID) +wb workflow job describe --job= --format=json | jq -r '.failureMessage // "No message"' + +# 3. Find failed task +wb workflow job task list --job= --format=json | jq -r '.[] | select(.status=="FAILED") | .name' + +# 4. Get task error + logs +wb workflow job task describe --job= --task= --format=json | jq '{stderr, stdout, exitCode, failureMessage}' +``` + +**After running these 4 commands, you'll know:** which job failed, why, which task, and where logs are. + +--- + +## Step-by-Step Guide + +### Step 1: Identify Failed Job + +```bash +wb workflow job list --format=json | jq '.[] | select(.status == "FAILED") | {id, workflowName, status, startTime, endTime}' +``` + +**For batch jobs:** +```bash +wb workflow job batch list --job= --format=json | jq '.[] | select(.status == "FAILED") | {id, status}' +``` + +**Ask user:** Confirm which job ID to investigate (if multiple failed jobs). + +--- + +### Step 2: Get Job Details & Inputs + +```bash +wb workflow job describe --job= --format=json +``` + +**Key fields to extract:** +```bash +wb workflow job describe --job= --format=json | jq -r '.failureMessage' +wb workflow job describe --job= --format=json | jq '.inputs' +wb workflow job describe --job= --format=json | jq '.outputs' +``` + +--- + +### Step 3: Find Failed Task & Get Logs + +```bash +wb workflow job task list --job= --format=json | jq '.[] | {name, status, exitCode}' +wb workflow job task describe --job= --task= --format=json +``` + +**Extract log URLs:** +```bash +TASK_INFO=$(wb workflow job task describe --job= --task= --format=json) +STDERR_URL=$(echo $TASK_INFO | jq -r '.stderr') +STDOUT_URL=$(echo $TASK_INFO | jq -r '.stdout') +echo "stderr: $STDERR_URL" +echo "stdout: $STDOUT_URL" +``` + +--- + +### Step 4: Pull and Analyze Task Logs + +#### Read Log Contents + +```bash +# Read stderr (usually contains errors) — logs are in S3 +aws s3 cp "$STDERR_URL" - 2>/dev/null | tail -100 + +# Read stdout +aws s3 cp "$STDOUT_URL" - 
2>/dev/null | tail -100 + +# Search for common error patterns +aws s3 cp "$STDERR_URL" - 2>/dev/null | grep -i -E "error|exception|failed|denied|killed|oom|memory|disk|timeout" | head -30 +``` + +#### Common Log File Patterns + +Cromwell execution logs are typically at: +``` +s3://///execution/ +├── stdout # Task standard output +├── stderr # Task standard error +├── script # The actual command that ran +├── rc # Return code (exit code) +└── script.submit # Submission script +``` + +**One-liner to read all execution files:** +```bash +EXEC_DIR=$(echo $TASK_INFO | jq -r '.executionDirectory // empty') +if [ -n "$EXEC_DIR" ]; then + echo "=== script ===" && aws s3 cp "$EXEC_DIR/script" - 2>/dev/null + echo "=== rc ===" && aws s3 cp "$EXEC_DIR/rc" - 2>/dev/null + echo "=== stderr (last 50 lines) ===" && aws s3 cp "$EXEC_DIR/stderr" - 2>/dev/null | tail -50 +fi +``` + +--- + +### Step 5: Check Resource Allocation & Usage + +#### What Was Requested (from WDL runtime) + +```bash +wb workflow describe --workflow= --format=json | jq '.sourceUrl' + +# Read WDL file +aws s3 cp s3:////workflow.wdl - | grep -A10 "runtime {" +``` + +#### Check Actual Resource Usage (AWS Batch) + +```bash +# List failed AWS Batch jobs +aws batch list-jobs --job-queue --job-status FAILED \ + --query 'jobSummaryList[*].{id:jobId,name:jobName,status:status}' --output table + +# Describe specific batch job +aws batch describe-jobs --jobs | jq '.jobs[0] | { + status: .status, + statusReason: .statusReason, + container: .container.resourceRequirements +}' +``` + +#### Memory-Specific Checks + +```bash +# Check if OOM killed the task +aws s3 cp "$STDERR_URL" - 2>/dev/null | grep -i -E "oom|out of memory|killed|cannot allocate|memory" + +# Check what memory was requested in the batch job +aws batch describe-jobs --jobs | jq '.jobs[0].container.resourceRequirements[] | select(.type=="MEMORY")' + +# Check for OOM kill signal in stderr +aws s3 cp "$STDERR_URL" - 2>/dev/null | grep -i "killed process" +``` 
+ +--- + +### Step 6: Diagnose by Error Type + +#### Memory Issues (OOM) + +**Symptoms:** +- Exit code 137 (SIGKILL) or 143 +- "Killed" in stderr +- "Cannot allocate memory" +- Task succeeded locally but fails at scale + +**Diagnosis:** +```bash +aws batch describe-jobs --jobs | jq '.jobs[0].container.resourceRequirements' +aws s3 cp "$STDERR_URL" - 2>/dev/null | grep -i -E "memory|oom|killed|malloc" +``` + +**Fix:** Increase `memory` in WDL runtime block: +```wdl +runtime { + memory: "32G" +} +``` + +#### Disk Issues + +**Symptoms:** +- "No space left on device" +- "Disk quota exceeded" + +**Diagnosis:** +```bash +aws s3 cp "$STDERR_URL" - 2>/dev/null | grep -i -E "space|disk|quota" +``` + +**Fix:** Increase disk in WDL runtime: +```wdl +runtime { + disks: "local-disk 200 SSD" +} +``` + +#### Input File Issues + +**Symptoms:** +- "FileNotFoundException" +- "Localization failed" +- File not found errors + +**Diagnosis:** +```bash +wb workflow job describe --job= --format=json | jq -r '.inputs | to_entries[] | .value' | while read path; do + if [[ $path == s3://* ]]; then + echo -n "$path: " && aws s3 ls "$path" 2>&1 | head -1 + fi +done +``` + +#### Permission Issues + +**Symptoms:** +- "Permission denied" / "Access denied" / 403 errors + +**Diagnosis:** +```bash +# Check IAM role attached to batch job +aws batch describe-jobs --jobs | jq '.jobs[0].jobDefinition' + +# Test bucket access +aws s3 ls s3:/// 2>&1 | head -5 +``` + +--- + +### Step 7: Propose Solution + +| Issue | Solution Template | +|-------|-------------------| +| **OOM** | "Increase memory from X to Y in the runtime block" | +| **Disk full** | "Increase disk size from X to Y GB" | +| **Missing input** | "Input file doesn't exist. Verify path: `aws s3 ls `" | +| **Permission** | "IAM role lacks S3 access. Grant `s3:GetObject` on the bucket" | +| **Timeout** | "Task exceeded time limit. Increase `maxRetries` or optimize task" | +| **Docker** | "Image pull failed. 
Verify image exists and is accessible" | +| **Other** | Describe the root cause from logs and propose a fix based on the specific error | + +**Re-run after fixing:** +```bash +wb workflow job run --workflow= --inputs= +``` + +--- + +## Quick Reference + +### Essential Commands + +```bash +# Failed jobs +wb workflow job list --format=json | jq '.[] | select(.status=="FAILED") | {id, workflowName}' + +# Job error +wb workflow job describe --job= --format=json | jq '.failureMessage' + +# Failed tasks +wb workflow job task list --job= --format=json | jq '.[] | select(.status=="FAILED") | .name' + +# Task logs (S3) +wb workflow job task describe --job= --task= --format=json | jq -r '.stderr' | xargs -I{} aws s3 cp {} - | tail -50 + +# Memory check (AWS Batch) +aws batch describe-jobs --jobs | jq '.jobs[0].container.resourceRequirements' +``` + +### Error → Cause → Fix + +| Exit Code | Meaning | Common Fix | +|-----------|---------|------------| +| 1 | General error | Check stderr for details | +| 2 | Misuse of command | Check script syntax | +| 126 | Permission problem | Check file permissions | +| 127 | Command not found | Check PATH, container image | +| 137 | SIGKILL (OOM) | **Increase memory** | +| 139 | Segfault | Check input data, memory | +| 143 | SIGTERM | Task timeout or preemption | + +--- + +## Workbench-Specific Notes + +- **Log retention:** Cromwell logs persist in workspace execution bucket (S3) +- **Batch jobs:** Each sub-job has independent logs; troubleshoot specific failed sub-job +- **Preemption:** If using spot instances, set `preemptible: 0` for reliability +AWS_WORKFLOW_SKILL_EOF + + cat > "${SKILLS_DIR}/DASHBOARD_BUILDER.md" << 'AWS_DASHBOARD_SKILL_EOF' +# Web Apps & Dashboards Skill (AWS) + +**Build interactive web apps, dashboards, and visualizations that run on a port in Workbench.** + +> **Triggers:** +> - "Create a dashboard", "visualize data", "build charts" +> - "Run a Flask/Streamlit/FastAPI app" +> - "Display data in the browser", 
"interactive UI" +> - Any web app that serves content on a port + +--- + +## 🌐 Workbench Proxy & Web Apps Best Practices + +### Proxy URL Format + +All web apps in Workbench are accessed via: +``` +https://workbench.verily.com/app/[APP_UUID]/proxy/[PORT]/[PATH] +``` + +### ⚠️ How to Get the App UUID (CRITICAL) + +**You MUST automatically get the app UUID - NEVER ask the user for it.** + +```bash +wb app list --format=json | jq -r '.[] | select(.status == "RUNNING") | .id' | head -1 +``` + +**⚡ LLM INSTRUCTION:** When constructing dashboard/proxy URLs: +1. First run the command above to get the running app UUID +2. Use that actual UUID in the URL you provide +3. Do NOT use placeholders like `[APP_UUID]` in your final response +4. Do NOT ask the user to find/replace the UUID themselves + +### ✅ Correct URL Examples +``` +https://workbench.verily.com/app/abc123-def456-789/proxy/8080/ +https://workbench.verily.com/app/abc123-def456-789/proxy/8501/index.html +``` + +### ❌ WRONG URL Formats (These WILL fail) +``` +https://abc123-def456.workbench-app.verily.com/ ← WRONG +http://localhost:8080/ ← WRONG: Not accessible externally +``` + +### ⚠️ Common Issue: JavaScript API Calls Failing + +**Problem:** JavaScript using absolute paths fails through Workbench proxy + +**Solution: Use Relative Paths (TESTED & CONFIRMED)** + +```javascript +// ✅ CORRECT - relative paths work through proxy +fetch('api/metadata') +fetch('api/data?filter=value') + +// ❌ WRONG - absolute paths fail +fetch('/api/metadata') +fetch('/api/data?filter=value') +``` + +--- + +## Workflow + +### Step 1: Understand Requirements + +Ask the user: +1. **Data source?** S3 file (CSV, Parquet, JSON), Athena query, or local file? +2. **Visualizations?** Charts (bar, line, scatter), tables, filters? +3. **Interactivity?** Static display or dynamic filtering? 
+ +### Step 2: Auto-Detect Environment + +```bash +APP_UUID=$(wb app list --format=json | jq -r '.[] | select(.status == "RUNNING") | .id' | head -1) +echo "App UUID: $APP_UUID" +python3 --version +pwd +``` + +### Step 3: Install Dependencies + +```bash +pip install flask flask-cors pandas plotly boto3 +``` + +### Step 4: Create Dashboard Structure + +``` +dashboard/ +├── app.py +├── templates/ +│ └── index.html +└── static/ + └── style.css +``` + +--- + +## Working Templates + +### Template 1: S3 Data Dashboard + +**app.py:** +```python +from flask import Flask, render_template, jsonify +from flask_cors import CORS +import pandas as pd +import boto3 +import os + +app = Flask(__name__) +CORS(app) + +_data_cache = None + +def get_data_from_s3(): + global _data_cache + if _data_cache is not None: + return _data_cache + + # Use the WORKBENCH_ env var set by Workbench + bucket = os.environ.get('WORKBENCH_my_bucket', 'your-bucket-name') + s3 = boto3.client('s3') + obj = s3.get_object(Bucket=bucket, Key='path/to/data.csv') + df = pd.read_csv(obj['Body']) + _data_cache = df.to_dict(orient='records') + return _data_cache + +@app.route('/') +def index(): + return render_template('index.html') + +@app.route('api/data') # NO leading slash! 
+def get_data(): + try: + data = get_data_from_s3() + return jsonify(data) + except Exception as e: + return jsonify({"error": str(e)}), 500 + +@app.route('api/metadata') +def get_metadata(): + try: + data = get_data_from_s3() + if data: + return jsonify({"columns": list(data[0].keys()), "row_count": len(data)}) + return jsonify({"columns": [], "row_count": 0}) + except Exception as e: + return jsonify({"error": str(e)}), 500 + +if __name__ == '__main__': + # CRITICAL: host='0.0.0.0' required for Workbench proxy access + app.run(host='0.0.0.0', port=8080, debug=False, threaded=True) +``` + +### Alternative: Embed Data in HTML (For Static Dashboards) + +```python +import json +@app.route('/') +def index(): + data = get_data_from_s3() + return render_template('dashboard.html', data_json=json.dumps(data)) +``` + +```html + +``` + +--- + +## Troubleshooting + +### No data showing + +**1. Test API directly:** +```bash +curl http://localhost:8080/api/data | python3 -m json.tool | head -20 +``` + +**2. Check S3 access:** +```bash +aws s3 ls s3:///path/to/data.csv +``` + +**3. 
Check server logs:** +```bash +tail -f server.log +``` + +### Server won't start + +```bash +lsof -i :8080 +kill $(lsof -t -i :8080) +python3 app.py +``` + +### S3 / AWS errors + +```bash +# Check AWS credentials +aws sts get-caller-identity + +# Test S3 access +aws s3 ls s3:/// + +# Check env vars set by Workbench +env | grep WORKBENCH +``` + +### Server not accessible through proxy + +**Fix:** Ensure Flask is bound to `0.0.0.0`, not `localhost`: +```python +app.run(host='0.0.0.0', port=8080) +``` + +--- + +## Common Pitfalls Checklist + +- [ ] **Relative paths** - All `fetch()` calls use `'api/...'` not `'/api/...'` +- [ ] **Host is 0.0.0.0** - Not `localhost` or `127.0.0.1` +- [ ] **threaded=True** - For concurrent users +- [ ] **debug=False** - For security +- [ ] **App UUID obtained** - Not using placeholder `[APP_UUID]` +- [ ] **S3 access verified** - `aws s3 ls s3:///` returns files +- [ ] **Data cached** - Avoid repeated S3 reads +- [ ] **Error handling** - API returns errors as JSON, not crashes +- [ ] **CORS enabled** - `CORS(app)` added + +--- + +## Quick Reference + +| Issue | Check | Fix | +|-------|-------|-----| +| 404 on API | Path format | Remove leading `/` from fetch | +| CORS error | CORS setup | Add `CORS(app)` | +| Blank page | Server running? | `ps aux | grep python` | +| S3 error | AWS credentials | `aws sts get-caller-identity` | +| Wrong port | URL vs code | Match port in URL to `app.run()` | +| Works locally, fails via URL | Host binding | Change `localhost` to `0.0.0.0` | +| Gateway timeout | Server/UUID | Check server running + correct UUID | + +--- + +## Example Prompts This Skill Handles + +- "Create a dashboard showing data from my S3 bucket" +- "Build an interactive chart for analyzing patient demographics" +- "Visualize the CSV files in my bucket" +- "Make a web dashboard with filters for exploring data" +- "Display query results in a browser with charts" +AWS_DASHBOARD_SKILL_EOF + + log_info "AWS skill variants applied." 
+ fi } # Fetch workspace information @@ -1451,6 +2026,7 @@ generate_embedded_json() { key: .id, value: ( if .resourceType == "GCS_BUCKET" then "gs://\(.bucketName)" + elif .resourceType == "S3_BUCKET" then "s3://\(.bucketName)" elif .resourceType == "BQ_DATASET" then "\(.projectId).\(.datasetId)" elif .resourceType == "BQ_TABLE" then "\(.projectId).\(.datasetId).\(.tableId // "")" elif .resourceType == "GIT_REPO" then .gitRepoUrl @@ -1469,6 +2045,7 @@ generate_embedded_json() { key: ("WORKBENCH_" + (.id | gsub("-"; "_"))), value: ( if .resourceType == "GCS_BUCKET" then "gs://\(.bucketName)" + elif .resourceType == "S3_BUCKET" then "s3://\(.bucketName)" elif .resourceType == "BQ_DATASET" then "\(.projectId).\(.datasetId)" elif .resourceType == "BQ_TABLE" then "\(.projectId).\(.datasetId).\(.tableId // "")" elif .resourceType == "GIT_REPO" then .gitRepoUrl @@ -1493,23 +2070,40 @@ generate_embedded_json() { # Generate bucket list for data persistence section generate_bucket_list() { local resources="$1" - - # Filter to only GCS_BUCKET resources - local buckets=$(echo "$resources" | jq '[.[] | select(.resourceType == "GCS_BUCKET")]' 2>/dev/null || echo "[]") - local count=$(echo "$buckets" | jq 'length' 2>/dev/null || echo "0") - - if [ "$count" -eq 0 ] || [ "$count" = "0" ]; then - echo "*No GCS buckets in this workspace.* Create one with:" - echo '```bash' - echo 'wb resource create gcs-bucket --name my-storage --description "Storage for results"' - echo '```' - return + local cloud_platform="${2:-GCP}" + + if [ "$cloud_platform" = "AWS" ]; then + local buckets=$(echo "$resources" | jq '[.[] | select(.resourceType == "S3_BUCKET")]' 2>/dev/null || echo "[]") + local count=$(echo "$buckets" | jq 'length' 2>/dev/null || echo "0") + + if [ "$count" -eq 0 ] || [ "$count" = "0" ]; then + echo "*No S3 buckets in this workspace.* Create one with:" + echo '```bash' + echo 'wb resource create s3-bucket --name my-storage --description "Storage for results"' + echo '```' + 
return + fi + + echo "| Bucket Name | Resource ID | Description |" + echo "|-------------|-------------|-------------|" + echo "$buckets" | jq -r '.[] | "| `s3://\(.bucketName // "unknown")/` | `\(.id // "—")` | \(.description // "—" | if . == "" then "—" else . end) |"' 2>/dev/null || true + else + # GCP — unchanged + local buckets=$(echo "$resources" | jq '[.[] | select(.resourceType == "GCS_BUCKET")]' 2>/dev/null || echo "[]") + local count=$(echo "$buckets" | jq 'length' 2>/dev/null || echo "0") + + if [ "$count" -eq 0 ] || [ "$count" = "0" ]; then + echo "*No GCS buckets in this workspace.* Create one with:" + echo '```bash' + echo 'wb resource create gcs-bucket --name my-storage --description "Storage for results"' + echo '```' + return + fi + + echo "| Bucket Name | Resource ID | Description |" + echo "|-------------|-------------|-------------|" + echo "$buckets" | jq -r '.[] | "| `gs://\(.bucketName // "unknown")/` | `\(.id // "—")` | \(.description // "—" | if . == "" then "—" else . end) |"' 2>/dev/null || true fi - - echo "| Bucket Name | Resource ID | Description |" - echo "|-------------|-------------|-------------|" - - echo "$buckets" | jq -r '.[] | "| `gs://\(.bucketName // "unknown")/` | `\(.id // "—")` | \(.description // "—" | if . == "" then "—" else . 
end) |"' 2>/dev/null || true } # Generate CLAUDE.md @@ -1539,9 +2133,26 @@ generate_claude_md() { project_display="$ws_aws_account" fi + # Set platform-specific template content (generator branches; output file is clean, no conditionals) + local storage_bucket_type storage_save_cmd resource_table_rows + if [ "$ws_cloud" = "AWS" ]; then + storage_bucket_type="S3 bucket" + storage_save_cmd='aws s3 cp s3:///' + resource_table_rows='| `S3_BUCKET` | Amazon S3 bucket | `wb resource create s3-bucket` | +| `GIT_REPO` | Git repository reference | `wb resource add-ref git-repo` |' + else + storage_bucket_type="GCS bucket" + storage_save_cmd='gsutil cp gs:///' + resource_table_rows='| `GCS_BUCKET` | Google Cloud Storage bucket | `wb resource create gcs-bucket` | +| `BQ_DATASET` | BigQuery dataset | `wb resource create bq-dataset` | +| `GIT_REPO` | Git repository reference | `wb resource add-ref git-repo` | +| `GCS_OBJECT` | Individual GCS file reference | `wb resource add-ref gcs-object` | +| `BQ_TABLE` | BigQuery table reference | `wb resource add-ref bq-table` |' + fi + # Generate dynamic sections local embedded_json=$(generate_embedded_json "$resources") - local bucket_list=$(generate_bucket_list "$resources") + local bucket_list=$(generate_bucket_list "$resources" "$ws_cloud") # Write the file cat > "${CLAUDE_FILE}" << EOF @@ -1583,11 +2194,7 @@ Resources are cloud assets managed by Workbench: | Type | Description | CLI Create Command | |------|-------------|-------------------| -| \`GCS_BUCKET\` | Google Cloud Storage bucket | \`wb resource create gcs-bucket\` | -| \`BQ_DATASET\` | BigQuery dataset | \`wb resource create bq-dataset\` | -| \`GIT_REPO\` | Git repository reference | \`wb resource add-ref git-repo\` | -| \`GCS_OBJECT\` | Individual GCS file reference | \`wb resource add-ref gcs-object\` | -| \`BQ_TABLE\` | BigQuery table reference | \`wb resource add-ref bq-table\` | +${resource_table_rows} **Environment Variables**: Each resource is available as 
\`\$WORKBENCH_\` (e.g., \`\$WORKBENCH_my_bucket\`). @@ -1615,9 +2222,9 @@ Check with: \`wb workspace describe\` ## ⚠️ Important: Data Persistence -Local app storage is ephemeral — files saved to the app's local disk are **lost when the app stops or restarts**. Always encourage users to save important work to a GCS bucket in their workspace. +Local app storage is ephemeral — files saved to the app's local disk are **lost when the app stops or restarts**. Always encourage users to save important work to a ${storage_bucket_type} in their workspace. -- **When users create files locally**, suggest saving to a bucket: \`gsutil cp gs:///\` +- **When users create files locally**, suggest saving to a bucket: \`${storage_save_cmd}\` - **When users finish analysis**, remind: *"Save important outputs to cloud storage before stopping the app."* - **Available buckets in this workspace:** @@ -1968,14 +2575,20 @@ main() { check_prerequisites setup_directories - install_skills - - # Fetch all data + + # Fetch all data first so we can detect cloud platform before generating skills WORKSPACE=$(fetch_workspace) RESOURCES=$(fetch_resources) WORKFLOWS=$(fetch_workflows) APPS=$(fetch_apps) - + + # Detect cloud platform for platform-specific skill and context generation + local cloud_platform + cloud_platform=$(echo "$WORKSPACE" | jq -r '.cloudPlatform // "GCP"') + log_info "Detected cloud platform: ${cloud_platform}" + + install_skills "$cloud_platform" + # Generate single CLAUDE.md file with embedded JSON generate_claude_md "$WORKSPACE" "$RESOURCES" "$WORKFLOWS" "$APPS" From 85585d4d0325471cd84b475d5ed2fa4d0a931cf3 Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Tue, 5 May 2026 15:08:21 -0400 Subject: [PATCH 57/86] fix(llm-context): correct AWS resource types, add Aurora, harden error handling - Fix S3 resource type: AWS_S3_STORAGE_FOLDER (not S3_BUCKET); update jq filters, bucket list, resource table, and CLI create command accordingly - S3 path now includes prefix: s3:/// - Add 
AWS_AURORA_DATABASE to generate_embedded_json and resource table; add AWS_AURORA_DATABASE_REFERENCE to resource table - AWS CLI commands confirmed from wb CLI docs: s3-storage-folder, aurora-database - Fix Aurora WORKBENCH env var template in DASHBOARD_BUILDER skill to parse the actual "host:port/dbname" connection string format - Harden generate_embedded_json: two-step local declarations + || '{}' fallbacks on each jq assignment + ${var:-{}} guards before final jq -n --argjson, so a failed resource fetch never prevents CLAUDE.md from being written - Fix check_prerequisites auth hint to not be GCP-specific - Fix stale "GCS/BQ path" comment in CLAUDE.md template - Update header comment with AWS resource type names Co-authored-by: Cursor --- features/src/llm-context/generate-context.sh | 85 ++++++++++++++++---- 1 file changed, 68 insertions(+), 17 deletions(-) diff --git a/features/src/llm-context/generate-context.sh b/features/src/llm-context/generate-context.sh index 5b18b5276..0c31dc517 100755 --- a/features/src/llm-context/generate-context.sh +++ b/features/src/llm-context/generate-context.sh @@ -32,7 +32,8 @@ # - id: resource name # - uuid # - description -# - resourceType: GCS_BUCKET, BQ_DATASET, GIT_REPO, etc. +# - resourceType: GCS_BUCKET, BQ_DATASET, GIT_REPO, GCS_OBJECT, BQ_TABLE (GCP) +# AWS_S3_STORAGE_FOLDER, AWS_AURORA_DATABASE, AWS_AURORA_DATABASE_REFERENCE (AWS) # - stewardshipType: CONTROLLED, REFERENCED # - region # - For GCS: bucketName, location @@ -88,10 +89,8 @@ check_prerequisites() { # Check if workspace is set if ! wb workspace describe --format=json &> /dev/null; then log_error "No workspace set or not authenticated. 
Please run:" - log_error " wb auth login --mode=APP_DEFAULT_CREDENTIALS" + log_error " wb auth login (GCP: add --mode=APP_DEFAULT_CREDENTIALS inside Workbench apps)" log_error " wb workspace set " - log_error "" - log_error "Note: Use --mode=APP_DEFAULT_CREDENTIALS inside Workbench apps" exit 1 fi @@ -1809,7 +1808,7 @@ pwd ### Step 3: Install Dependencies ```bash -pip install flask flask-cors pandas plotly boto3 +pip install flask flask-cors pandas plotly boto3 psycopg2-binary ``` ### Step 4: Create Dashboard Structure @@ -1882,6 +1881,39 @@ if __name__ == '__main__': app.run(host='0.0.0.0', port=8080, debug=False, threaded=True) ``` +### Template 2: Aurora PostgreSQL Dashboard + +```python +import psycopg2 +import pandas as pd +import os + +def get_data_from_aurora(): + global _data_cache + if _data_cache is not None: + return _data_cache + + # WORKBENCH_ contains "host:port/dbname" — use wb to get credentials: + # wb resource resolve aurora-database --id= + # Or hard-code connection details after running the above command once. + conn_ref = os.environ.get('WORKBENCH_my_aurora_db', '').split('/') + host_port = conn_ref[0].split(':') if conn_ref[0] else ['your-aurora-endpoint', '5432'] + host = host_port[0] + port = host_port[1] if len(host_port) > 1 else '5432' + dbname = conn_ref[1] if len(conn_ref) > 1 else 'your-db-name' + + conn = psycopg2.connect( + host=host, port=port, dbname=dbname, + user='your-user', password='your-password' + ) + df = pd.read_sql('SELECT * FROM your_table LIMIT 1000', conn) + conn.close() + _data_cache = df.to_dict(orient='records') + return _data_cache +``` + +> **Tip:** Use `wb resource resolve aurora-database --id=` to get the connection string, or check the `WORKBENCH_*` env vars populated by Workbench context generation. 
+ ### Alternative: Embed Data in HTML (For Static Dashboards) ```python @@ -1941,6 +1973,16 @@ aws s3 ls s3:/// env | grep WORKBENCH ``` +### Aurora connection errors + +```bash +# Get connection string from wb CLI +wb resource resolve aurora-database --id= + +# Test connectivity +psql "host= port=5432 dbname= user=" +``` + ### Server not accessible through proxy **Fix:** Ensure Flask is bound to `0.0.0.0`, not `localhost`: @@ -2020,13 +2062,16 @@ generate_embedded_json() { local resources="$1" # Generate resourcePaths map: resource name -> cloud path - local resource_paths=$(echo "$resources" | jq -c ' + # Two-step declaration so failures fall back to '{}' rather than propagating + local resource_paths + resource_paths=$(echo "$resources" | jq -c ' map( { key: .id, value: ( if .resourceType == "GCS_BUCKET" then "gs://\(.bucketName)" - elif .resourceType == "S3_BUCKET" then "s3://\(.bucketName)" + elif .resourceType == "AWS_S3_STORAGE_FOLDER" then "s3://\(.bucketName // "unknown")/\(.prefix // "")" + elif .resourceType == "AWS_AURORA_DATABASE" then "\(.rwEndpoint // "unknown"):\(.port // "5432")/\(.databaseName // "")" elif .resourceType == "BQ_DATASET" then "\(.projectId).\(.datasetId)" elif .resourceType == "BQ_TABLE" then "\(.projectId).\(.datasetId).\(.tableId // "")" elif .resourceType == "GIT_REPO" then .gitRepoUrl @@ -2036,16 +2081,18 @@ generate_embedded_json() { ) } ) | map(select(.value != null)) | from_entries - ') + ') || resource_paths='{}' # Generate envVars map: WORKBENCH_ -> cloud path - local env_vars=$(echo "$resources" | jq -c ' + local env_vars + env_vars=$(echo "$resources" | jq -c ' map( { key: ("WORKBENCH_" + (.id | gsub("-"; "_"))), value: ( if .resourceType == "GCS_BUCKET" then "gs://\(.bucketName)" - elif .resourceType == "S3_BUCKET" then "s3://\(.bucketName)" + elif .resourceType == "AWS_S3_STORAGE_FOLDER" then "s3://\(.bucketName // "unknown")/\(.prefix // "")" + elif .resourceType == "AWS_AURORA_DATABASE" then "\(.rwEndpoint // 
"unknown"):\(.port // "5432")/\(.databaseName // "")" elif .resourceType == "BQ_DATASET" then "\(.projectId).\(.datasetId)" elif .resourceType == "BQ_TABLE" then "\(.projectId).\(.datasetId).\(.tableId // "")" elif .resourceType == "GIT_REPO" then .gitRepoUrl @@ -2055,9 +2102,11 @@ generate_embedded_json() { ) } ) | map(select(.value != null)) | from_entries - ') + ') || env_vars='{}' - # Output compact JSON for embedding + # Output compact JSON for embedding; if either var is empty, use {} so jq never gets an invalid argument + resource_paths="${resource_paths:-{}}" + env_vars="${env_vars:-{}}" jq -n \ --argjson resource_paths "$resource_paths" \ --argjson env_vars "$env_vars" \ @@ -2073,20 +2122,20 @@ generate_bucket_list() { local cloud_platform="${2:-GCP}" if [ "$cloud_platform" = "AWS" ]; then - local buckets=$(echo "$resources" | jq '[.[] | select(.resourceType == "S3_BUCKET")]' 2>/dev/null || echo "[]") + local buckets=$(echo "$resources" | jq '[.[] | select(.resourceType == "AWS_S3_STORAGE_FOLDER")]' 2>/dev/null || echo "[]") local count=$(echo "$buckets" | jq 'length' 2>/dev/null || echo "0") if [ "$count" -eq 0 ] || [ "$count" = "0" ]; then echo "*No S3 buckets in this workspace.* Create one with:" echo '```bash' - echo 'wb resource create s3-bucket --name my-storage --description "Storage for results"' + echo 'wb resource create s3-storage-folder --name my-storage --description "Storage for results"' echo '```' return fi echo "| Bucket Name | Resource ID | Description |" echo "|-------------|-------------|-------------|" - echo "$buckets" | jq -r '.[] | "| `s3://\(.bucketName // "unknown")/` | `\(.id // "—")` | \(.description // "—" | if . == "" then "—" else . end) |"' 2>/dev/null || true + echo "$buckets" | jq -r '.[] | "| `s3://\(.bucketName // "unknown")/\(.prefix // "")` | `\(.id // "—")` | \(.description // "—" | if . == "" then "—" else . 
end) |"' 2>/dev/null || true else # GCP — unchanged local buckets=$(echo "$resources" | jq '[.[] | select(.resourceType == "GCS_BUCKET")]' 2>/dev/null || echo "[]") @@ -2138,7 +2187,9 @@ generate_claude_md() { if [ "$ws_cloud" = "AWS" ]; then storage_bucket_type="S3 bucket" storage_save_cmd='aws s3 cp s3:///' - resource_table_rows='| `S3_BUCKET` | Amazon S3 bucket | `wb resource create s3-bucket` | + resource_table_rows='| `AWS_S3_STORAGE_FOLDER` | AWS S3 storage folder | `wb resource create s3-storage-folder` | +| `AWS_AURORA_DATABASE` | Aurora PostgreSQL database | `wb resource create aurora-database` | +| `AWS_AURORA_DATABASE_REFERENCE` | Aurora DB reference (external) | `wb resource add-ref aurora-database` | | `GIT_REPO` | Git repository reference | `wb resource add-ref git-repo` |' else storage_bucket_type="GCS bucket" @@ -2538,7 +2589,7 @@ ${embedded_json} \`\`\` **Usage:** -- \`resourcePaths["my-bucket"]\` → exact GCS/BQ path +- \`resourcePaths["my-bucket"]\` → exact cloud storage/database path - \`envVars["WORKBENCH_my_bucket"]\` → environment variable value To refresh after workspace changes: From 2ccd732d4f261c0266971cb196667096756ff76c Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Tue, 5 May 2026 16:41:33 -0400 Subject: [PATCH 58/86] fix: startup timing and jq robustness for AWS apps - install.sh: replace single wb-check with 8-retry loop (10s between attempts) so AWS apps that take longer to initialise IAM credentials still get CLAUDE.md generated on first startup - generate-context.sh: validate resource_paths/env_vars JSON before passing to --argjson; log the actual bad value on failure so the root cause is visible rather than a cryptic jq error Co-authored-by: Cursor --- features/src/llm-context/generate-context.sh | 14 +++++++++-- features/src/llm-context/install.sh | 25 +++++++++++++------- 2 files changed, 29 insertions(+), 10 deletions(-) diff --git a/features/src/llm-context/generate-context.sh 
b/features/src/llm-context/generate-context.sh index 0c31dc517..93c33315c 100755 --- a/features/src/llm-context/generate-context.sh +++ b/features/src/llm-context/generate-context.sh @@ -2104,9 +2104,19 @@ generate_embedded_json() { ) | map(select(.value != null)) | from_entries ') || env_vars='{}' - # Output compact JSON for embedding; if either var is empty, use {} so jq never gets an invalid argument - resource_paths="${resource_paths:-{}}" + # Validate each value is parseable JSON before passing to --argjson. + # jq-produced output should always be valid, but a corrupt $resources string + # can leave these as empty or multi-line values that --argjson rejects. + resource_paths="${resource_paths:-{}}" env_vars="${env_vars:-{}}" + if ! printf '%s' "$resource_paths" | jq empty 2>/dev/null; then + log_error "resource_paths is not valid JSON (value: ${resource_paths:0:120}); falling back to {}" >&2 + resource_paths='{}' + fi + if ! printf '%s' "$env_vars" | jq empty 2>/dev/null; then + log_error "env_vars is not valid JSON (value: ${env_vars:0:120}); falling back to {}" >&2 + env_vars='{}' + fi jq -n \ --argjson resource_paths "$resource_paths" \ --argjson env_vars "$env_vars" \ diff --git a/features/src/llm-context/install.sh b/features/src/llm-context/install.sh index 54685df37..3fefd2443 100644 --- a/features/src/llm-context/install.sh +++ b/features/src/llm-context/install.sh @@ -93,14 +93,23 @@ cat > "${LLM_CONTEXT_DIR}/run-context-generator.sh" << WRAPPER_EOF # Wrapper to run generate-context.sh with proper environment # This script is called on container start -# Only run if we have a workspace set -if command -v wb &> /dev/null && wb workspace describe &> /dev/null; then - echo "Generating LLM context..." 
- ${GENERATE_SCRIPT} || echo "LLM context generation failed (non-fatal)" -else - echo "Skipping LLM context generation: workspace not set or wb not available" - echo "Run 'wb workspace set ' then 'generate-llm-context' manually" -fi +# Wait for wb to be authenticated and workspace to be ready. +# AWS apps take longer to initialise IAM credentials than GCP apps, so we +# retry with backoff before giving up. +MAX_RETRIES=8 +RETRY_DELAY=10 +for i in \$(seq 1 \${MAX_RETRIES}); do + if command -v wb &> /dev/null && wb workspace describe &> /dev/null 2>&1; then + echo "Workspace ready (attempt \${i}). Generating LLM context..." + ${GENERATE_SCRIPT} || echo "LLM context generation failed (non-fatal)" + exit 0 + fi + echo "Waiting for workspace to be ready... (\${i}/\${MAX_RETRIES})" + sleep \${RETRY_DELAY} +done + +echo "Skipping LLM context generation: workspace not available after \${MAX_RETRIES} attempts." +echo "Run 'generate-llm-context' manually once the workspace is ready." WRAPPER_EOF chmod +x "${LLM_CONTEXT_DIR}/run-context-generator.sh" From 85a0e40a09c5bae6c64b7a4ede8bcd73cc923f2b Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Tue, 5 May 2026 16:50:57 -0400 Subject: [PATCH 59/86] feat: AWS-specific CLAUDE.md for all platform-sensitive sections MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All 5 sections that were hardcoded for GCP now branch on ws_cloud: - MCP Data & Resources table: removes bq_execute/resource_mount, adds S3 equivalents (list_files→aws s3 ls, create→s3-storage-folder) - Cloud CLIs section: replaces gcloud/gsutil/bq MCP tools with AWS CLI terminal guidance (aws s3, aws batch, psql) - Cloud path hint: adds rwEndpoint+port+databaseName for Aurora - Env var example: gs:// → s3:// prefix - Preview Data + Query Data: BigQuery/GCS replaced with S3/boto3/psycopg2 - How to Create Resources: gcs-bucket/bq-dataset → s3-storage-folder/aurora-database GCP output is byte-for-byte identical to before. 
Co-authored-by: Cursor --- features/src/llm-context/generate-context.sh | 201 +++++++++++++------ 1 file changed, 142 insertions(+), 59 deletions(-) diff --git a/features/src/llm-context/generate-context.sh b/features/src/llm-context/generate-context.sh index 93c33315c..19035e6be 100755 --- a/features/src/llm-context/generate-context.sh +++ b/features/src/llm-context/generate-context.sh @@ -2194,6 +2194,8 @@ generate_claude_md() { # Set platform-specific template content (generator branches; output file is clean, no conditionals) local storage_bucket_type storage_save_cmd resource_table_rows + local mcp_data_resources_rows cloud_cli_section cloud_path_hint env_var_example + local data_preview_query_section create_resources_section if [ "$ws_cloud" = "AWS" ]; then storage_bucket_type="S3 bucket" storage_save_cmd='aws s3 cp s3:///' @@ -2201,6 +2203,75 @@ generate_claude_md() { | `AWS_AURORA_DATABASE` | Aurora PostgreSQL database | `wb resource create aurora-database` | | `AWS_AURORA_DATABASE_REFERENCE` | Aurora DB reference (external) | `wb resource add-ref aurora-database` | | `GIT_REPO` | Git repository reference | `wb resource add-ref git-repo` |' + + mcp_data_resources_rows='| `workspace_list_data_collections` | N/A | **List data collections and their resources** | +| `workspace_list_resources` | `wb resource list` | List all resources in the workspace | +| `resource_list_tree` | `wb resource list-tree` | List resources organized by folder | +| `list_files` | `aws s3 ls` | List files in an S3 storage folder | +| `read_file` | `aws s3 cp -` | Read contents of a file from S3 | +| `resource_create_bucket` | `wb resource create s3-storage-folder` | Create a new S3 storage folder | +| `resource_delete` | `wb resource delete` | Delete a resource | +| `resource_check_access` | — | Check if IAM role has access to a resource |' + + cloud_cli_section='### Cloud CLIs + +No direct AWS CLI MCP wrapper — use `aws` CLI commands in the terminal: +- **S3**: `aws s3 ls s3:///`, 
`aws s3 cp ` +- **Batch**: `aws batch list-jobs --job-queue --job-status FAILED` +- **Aurora**: `psql "host= port=5432 dbname= user="`' + + cloud_path_hint='# Look for: bucketName+prefix (S3), rwEndpoint+port+databaseName (Aurora), gitRepoUrl' + + env_var_example='echo $WORKBENCH_my_bucket # → s3://bucket/prefix +env | grep WORKBENCH_ # List all' + + data_preview_query_section='**S3:** +```bash +aws s3 ls s3://// +aws s3 cp s3:////file.csv - | head -20 +``` + +**Aurora PostgreSQL:** +```bash +# Get endpoint from wb CLI +wb resource describe --format=json | jq .rwEndpoint +# Connect +psql "host= port= dbname= user=" +# \dt → list tables; SELECT * FROM table_name LIMIT 10; +``` + +### Query Data + +**Python:** +```python +import boto3, pandas as pd + +# Read CSV from S3 +s3 = boto3.client("s3") +obj = s3.get_object(Bucket="", Key="/file.csv") +df = pd.read_csv(obj["Body"]) + +# Read Parquet directly (requires s3fs) +df = pd.read_parquet("s3:////file.parquet") + +# Aurora PostgreSQL +import psycopg2 +conn = psycopg2.connect(host="", port=, dbname="", user="", password="") +df = pd.read_sql("SELECT * FROM table_name LIMIT 100", conn) +conn.close() +```' + + create_resources_section='```bash +# S3 storage folder +wb resource create s3-storage-folder --name my-storage --description "My storage folder" + +# Aurora PostgreSQL database +wb resource create aurora-database --name my-db --description "My database" + +# Reference an external Aurora database +wb resource add-ref aurora-database --name external-db +```' + else storage_bucket_type="GCS bucket" storage_save_cmd='gsutil cp gs:///' @@ -2209,6 +2280,71 @@ generate_claude_md() { | `GIT_REPO` | Git repository reference | `wb resource add-ref git-repo` | | `GCS_OBJECT` | Individual GCS file reference | `wb resource add-ref gcs-object` | | `BQ_TABLE` | BigQuery table reference | `wb resource add-ref bq-table` |' + + mcp_data_resources_rows='| `workspace_list_data_collections` | N/A | **List data collections and their 
resources** | +| `workspace_list_resources` | `wb resource list` | List all resources in the workspace | +| `resource_list_tree` | `wb resource list-tree` | List resources organized by folder | +| `bq_execute` | `bq query` | Run SQL queries against BigQuery | +| `list_files` | `gsutil ls` | List files in a GCS bucket | +| `read_file` | `gsutil cat` | Read contents of a file | +| `resource_create_bucket` | `wb resource create gcs-bucket` | Create a new GCS bucket | +| `resource_delete` | `wb resource delete` | Delete a resource | +| `resource_check_access` | — | Check if service account has access to a resource | +| `resource_mount` / `resource_unmount` | — | Mount/unmount a GCS bucket |' + + cloud_cli_section='### Cloud CLIs (via MCP) + +| MCP Tool | Description | +|----------|-------------| +| `gcloud_execute` | Run any `gcloud` command | +| `gsutil_execute` | Run any `gsutil` command | +| `bq_execute` | Run any `bq` SQL query |' + + cloud_path_hint='# Look for: bucketName, projectId+datasetId, gitRepoUrl' + + env_var_example='echo $WORKBENCH_my_bucket # → gs://actual-bucket-name +env | grep WORKBENCH_ # List all' + + data_preview_query_section='**BigQuery:** +```bash +bq head -n 10 :.
+bq show --schema <project>:<dataset>.<table>
+bq query --use_legacy_sql=false '"'"'SELECT * FROM `project.dataset.table` LIMIT 10'"'"' +``` + +**GCS:** +```bash +gsutil ls gs:/// +gsutil cat -r 0-1024 gs:///path/file.csv +``` + +### Query Data + +**CLI:** +```bash +bq query --use_legacy_sql=false '"'"'SELECT col1, col2 FROM `project.dataset.table` LIMIT 100'"'"' +``` + +**Python:** +```python +from google.cloud import bigquery +client = bigquery.Client() +df = client.query("SELECT * FROM `project.dataset.table` LIMIT 100").to_dataframe() + +import pandas as pd +df = pd.read_parquet("gs://bucket-name/path/file.parquet") +```' + + create_resources_section='```bash +# GCS bucket +wb resource create gcs-bucket --name my-bucket --description "My bucket" + +# BigQuery dataset +wb resource create bq-dataset --name my-dataset --description "My dataset" + +# Reference external GCS bucket +wb resource add-ref gcs-bucket --name external-data --bucket-name existing-bucket +```' fi # Generate dynamic sections @@ -2306,16 +2442,7 @@ ${bucket_list} | MCP Tool | CLI Equivalent | Description | |----------|----------------|-------------| -| \`workspace_list_data_collections\` | N/A | **List data collections and their resources** | -| \`workspace_list_resources\` | \`wb resource list\` | List all resources in the workspace | -| \`resource_list_tree\` | \`wb resource list-tree\` | List resources organized by folder | -| \`bq_execute\` | \`bq query\` | Run SQL queries against BigQuery | -| \`list_files\` | \`gsutil ls\` | List files in a GCS bucket | -| \`read_file\` | \`gsutil cat\` | Read contents of a file | -| \`resource_create_bucket\` | \`wb resource create gcs-bucket\` | Create a new GCS bucket | -| \`resource_delete\` | \`wb resource delete\` | Delete a resource | -| \`resource_check_access\` | — | Check if service account has access to a resource | -| \`resource_mount\` / \`resource_unmount\` | — | Mount/unmount a GCS bucket | +${mcp_data_resources_rows} ### Apps & Workflows @@ -2347,13 +2474,7 @@ ${bucket_list} | 
\`cohort_count_instances\` | Count members in a cohort | | \`export_cohort\` | Export cohort data to a bucket | -### Cloud CLIs (via MCP) - -| MCP Tool | Description | -|----------|-------------| -| \`gcloud_execute\` | Run any \`gcloud\` command | -| \`gsutil_execute\` | Run any \`gsutil\` command | -| \`bq_execute\` | Run any \`bq\` SQL query | +${cloud_cli_section} **Not available via MCP (use CLI):** \`wb workspace set\`, \`wb auth login\`, \`wb workflow logs\` @@ -2409,47 +2530,18 @@ wb resource list --format=json | jq '.[] | {name: .id, type: .resourceType}' \`\`\`bash wb resource describe --format=json -# Look for: bucketName, projectId+datasetId, gitRepoUrl +${cloud_path_hint} \`\`\` ### Use Environment Variables (Easiest) \`\`\`bash -echo \$WORKBENCH_my_bucket # → gs://actual-bucket-name -env | grep WORKBENCH_ # List all +${env_var_example} \`\`\` ### Preview Data -**BigQuery:** -\`\`\`bash -bq head -n 10 :.
-bq show --schema <project>:<dataset>.<table>
-bq query --use_legacy_sql=false 'SELECT * FROM \`project.dataset.table\` LIMIT 10' -\`\`\` - -**GCS:** -\`\`\`bash -gsutil ls gs:/// -gsutil cat -r 0-1024 gs:///path/file.csv -\`\`\` - -### Query Data - -**CLI:** -\`\`\`bash -bq query --use_legacy_sql=false 'SELECT col1, col2 FROM \`project.dataset.table\` LIMIT 100' -\`\`\` - -**Python:** -\`\`\`python -from google.cloud import bigquery -client = bigquery.Client() -df = client.query("SELECT * FROM \`project.dataset.table\` LIMIT 100").to_dataframe() - -import pandas as pd -df = pd.read_parquet('gs://bucket-name/path/file.parquet') -\`\`\` +${data_preview_query_section} --- @@ -2473,16 +2565,7 @@ wb workflow logs ## How to Create Resources -\`\`\`bash -# GCS bucket -wb resource create gcs-bucket --name my-bucket --description "My bucket" - -# BigQuery dataset -wb resource create bq-dataset --name my-dataset --description "My dataset" - -# Reference external GCS bucket -wb resource add-ref gcs-bucket --name external-data --bucket-name existing-bucket -\`\`\` +${create_resources_section} --- From 5e9f164b825ea2e3603bf65aa9a79ac1baf0dc04 Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Tue, 5 May 2026 16:55:34 -0400 Subject: [PATCH 60/86] style: clean up informal comment in generate_bucket_list Co-authored-by: Cursor --- features/src/llm-context/generate-context.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/features/src/llm-context/generate-context.sh b/features/src/llm-context/generate-context.sh index 19035e6be..c0ffa90aa 100755 --- a/features/src/llm-context/generate-context.sh +++ b/features/src/llm-context/generate-context.sh @@ -2147,7 +2147,7 @@ generate_bucket_list() { echo "|-------------|-------------|-------------|" echo "$buckets" | jq -r '.[] | "| `s3://\(.bucketName // "unknown")/\(.prefix // "")` | `\(.id // "—")` | \(.description // "—" | if . == "" then "—" else . 
end) |"' 2>/dev/null || true else - # GCP — unchanged + # GCP local buckets=$(echo "$resources" | jq '[.[] | select(.resourceType == "GCS_BUCKET")]' 2>/dev/null || echo "[]") local count=$(echo "$buckets" | jq 'length' 2>/dev/null || echo "0") From fab65be53f131b316797def18bb4866fa8d34ea7 Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Tue, 5 May 2026 17:06:24 -0400 Subject: [PATCH 61/86] fix: guarantee single valid JSON object from generate_embedded_json The resource_paths and env_vars variables were occasionally containing multiple jq output objects separated by embedded newlines (e.g. when wb resource list returns non-array JSON). --argjson rejects multi-value strings, causing the embedded JSON block in CLAUDE.md to be empty. Fix: pipe first jq output through `jq -cs 'add // {}'` which slurps all outputs into a single merged object regardless of how many the upstream jq produced. Also deduplicate the path expression into a shared variable to keep both maps consistent. Co-authored-by: Cursor --- features/src/llm-context/generate-context.sh | 75 +++++++------------- 1 file changed, 25 insertions(+), 50 deletions(-) diff --git a/features/src/llm-context/generate-context.sh b/features/src/llm-context/generate-context.sh index c0ffa90aa..66eafa726 100755 --- a/features/src/llm-context/generate-context.sh +++ b/features/src/llm-context/generate-context.sh @@ -2063,60 +2063,35 @@ generate_embedded_json() { # Generate resourcePaths map: resource name -> cloud path # Two-step declaration so failures fall back to '{}' rather than propagating + # jq expression shared by both maps — normalise to one resource path value per resource. + # The outer `| jq -cs 'add // {}'` slurps all jq outputs (which may be multiple objects + # if the input contained non-array JSON) into a single merged object, guaranteeing that + # the variable always contains exactly one valid JSON value for --argjson. 
+ local _resource_path_expr=' + if .resourceType == "GCS_BUCKET" then "gs://\(.bucketName)" + elif .resourceType == "AWS_S3_STORAGE_FOLDER" then "s3://\(.bucketName // "unknown")/\(.prefix // "")" + elif .resourceType == "AWS_AURORA_DATABASE" then "\(.rwEndpoint // "unknown"):\(.port // "5432")/\(.databaseName // "")" + elif .resourceType == "BQ_DATASET" then "\(.projectId).\(.datasetId)" + elif .resourceType == "BQ_TABLE" then "\(.projectId).\(.datasetId).\(.tableId // "")" + elif .resourceType == "GIT_REPO" then .gitRepoUrl + elif .resourceType == "GCS_OBJECT" then "gs://\(.bucketName)/\(.objectName // "")" + else null + end' + local resource_paths - resource_paths=$(echo "$resources" | jq -c ' - map( - { - key: .id, - value: ( - if .resourceType == "GCS_BUCKET" then "gs://\(.bucketName)" - elif .resourceType == "AWS_S3_STORAGE_FOLDER" then "s3://\(.bucketName // "unknown")/\(.prefix // "")" - elif .resourceType == "AWS_AURORA_DATABASE" then "\(.rwEndpoint // "unknown"):\(.port // "5432")/\(.databaseName // "")" - elif .resourceType == "BQ_DATASET" then "\(.projectId).\(.datasetId)" - elif .resourceType == "BQ_TABLE" then "\(.projectId).\(.datasetId).\(.tableId // "")" - elif .resourceType == "GIT_REPO" then .gitRepoUrl - elif .resourceType == "GCS_OBJECT" then "gs://\(.bucketName)/\(.objectName // "")" - else null - end - ) - } - ) | map(select(.value != null)) | from_entries - ') || resource_paths='{}' - - # Generate envVars map: WORKBENCH_ -> cloud path + resource_paths=$(printf '%s' "$resources" | jq -c " + map({ key: .id, value: ( $_resource_path_expr ) }) + | map(select(.value != null)) | from_entries + " 2>/dev/null | jq -cs 'add // {}' 2>/dev/null) || resource_paths='{}' + local env_vars - env_vars=$(echo "$resources" | jq -c ' - map( - { - key: ("WORKBENCH_" + (.id | gsub("-"; "_"))), - value: ( - if .resourceType == "GCS_BUCKET" then "gs://\(.bucketName)" - elif .resourceType == "AWS_S3_STORAGE_FOLDER" then "s3://\(.bucketName // "unknown")/\(.prefix // 
"")" - elif .resourceType == "AWS_AURORA_DATABASE" then "\(.rwEndpoint // "unknown"):\(.port // "5432")/\(.databaseName // "")" - elif .resourceType == "BQ_DATASET" then "\(.projectId).\(.datasetId)" - elif .resourceType == "BQ_TABLE" then "\(.projectId).\(.datasetId).\(.tableId // "")" - elif .resourceType == "GIT_REPO" then .gitRepoUrl - elif .resourceType == "GCS_OBJECT" then "gs://\(.bucketName)/\(.objectName // "")" - else null - end - ) - } - ) | map(select(.value != null)) | from_entries - ') || env_vars='{}' - - # Validate each value is parseable JSON before passing to --argjson. - # jq-produced output should always be valid, but a corrupt $resources string - # can leave these as empty or multi-line values that --argjson rejects. + env_vars=$(printf '%s' "$resources" | jq -c " + map({ key: (\"WORKBENCH_\" + (.id | gsub(\"-\"; \"_\"))), value: ( $_resource_path_expr ) }) + | map(select(.value != null)) | from_entries + " 2>/dev/null | jq -cs 'add // {}' 2>/dev/null) || env_vars='{}' + resource_paths="${resource_paths:-{}}" env_vars="${env_vars:-{}}" - if ! printf '%s' "$resource_paths" | jq empty 2>/dev/null; then - log_error "resource_paths is not valid JSON (value: ${resource_paths:0:120}); falling back to {}" >&2 - resource_paths='{}' - fi - if ! printf '%s' "$env_vars" | jq empty 2>/dev/null; then - log_error "env_vars is not valid JSON (value: ${env_vars:0:120}); falling back to {}" >&2 - env_vars='{}' - fi jq -n \ --argjson resource_paths "$resource_paths" \ --argjson env_vars "$env_vars" \ From fba020e48845ece191b2e34d909781016bfee674 Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Tue, 5 May 2026 17:24:37 -0400 Subject: [PATCH 62/86] fix: rewrite generate_embedded_json to avoid --argjson entirely The previous approach captured intermediate jq output into bash variables then passed them via --argjson, which fails when the variables contain embedded newlines or encoding edge cases on certain jq versions. 
Rewrite as a single jq invocation that builds both resourcePaths and envVars maps directly from the resource list. A jq `def` avoids repeating the path expression. `head -1` guarantees one output line regardless of what wb resource list returns. The bash fallback ensures a valid empty JSON object is always returned even if jq fails completely. Co-authored-by: Cursor --- features/src/llm-context/generate-context.sh | 63 ++++++++------------ 1 file changed, 24 insertions(+), 39 deletions(-) diff --git a/features/src/llm-context/generate-context.sh b/features/src/llm-context/generate-context.sh index 66eafa726..5da449232 100755 --- a/features/src/llm-context/generate-context.sh +++ b/features/src/llm-context/generate-context.sh @@ -2060,45 +2060,30 @@ fetch_apps() { # Generate embedded JSON (returns JSON to stdout, doesn't write to file) generate_embedded_json() { local resources="$1" - - # Generate resourcePaths map: resource name -> cloud path - # Two-step declaration so failures fall back to '{}' rather than propagating - # jq expression shared by both maps — normalise to one resource path value per resource. - # The outer `| jq -cs 'add // {}'` slurps all jq outputs (which may be multiple objects - # if the input contained non-array JSON) into a single merged object, guaranteeing that - # the variable always contains exactly one valid JSON value for --argjson. 
- local _resource_path_expr=' - if .resourceType == "GCS_BUCKET" then "gs://\(.bucketName)" - elif .resourceType == "AWS_S3_STORAGE_FOLDER" then "s3://\(.bucketName // "unknown")/\(.prefix // "")" - elif .resourceType == "AWS_AURORA_DATABASE" then "\(.rwEndpoint // "unknown"):\(.port // "5432")/\(.databaseName // "")" - elif .resourceType == "BQ_DATASET" then "\(.projectId).\(.datasetId)" - elif .resourceType == "BQ_TABLE" then "\(.projectId).\(.datasetId).\(.tableId // "")" - elif .resourceType == "GIT_REPO" then .gitRepoUrl - elif .resourceType == "GCS_OBJECT" then "gs://\(.bucketName)/\(.objectName // "")" - else null - end' - - local resource_paths - resource_paths=$(printf '%s' "$resources" | jq -c " - map({ key: .id, value: ( $_resource_path_expr ) }) - | map(select(.value != null)) | from_entries - " 2>/dev/null | jq -cs 'add // {}' 2>/dev/null) || resource_paths='{}' - - local env_vars - env_vars=$(printf '%s' "$resources" | jq -c " - map({ key: (\"WORKBENCH_\" + (.id | gsub(\"-\"; \"_\"))), value: ( $_resource_path_expr ) }) - | map(select(.value != null)) | from_entries - " 2>/dev/null | jq -cs 'add // {}' 2>/dev/null) || env_vars='{}' - - resource_paths="${resource_paths:-{}}" - env_vars="${env_vars:-{}}" - jq -n \ - --argjson resource_paths "$resource_paths" \ - --argjson env_vars "$env_vars" \ - '{ - "resourcePaths": $resource_paths, - "envVars": $env_vars - }' + + # Build both maps in a single jq invocation so no intermediate bash variables + # are passed via --argjson (which is sensitive to embedded newlines and encoding + # edge cases on some jq versions). A jq `def` avoids repeating the path expression. + # `(if type == "array" then . else [] end)` guards against non-array input. 
+ local result + result=$(printf '%s' "${resources:-[]}" | jq -c ' + def cloud_path: + if .resourceType == "GCS_BUCKET" then "gs://\(.bucketName)" + elif .resourceType == "AWS_S3_STORAGE_FOLDER" then "s3://\(.bucketName // "unknown")/\(.prefix // "")" + elif .resourceType == "AWS_AURORA_DATABASE" then "\(.rwEndpoint // "unknown"):\(.port // "5432")/\(.databaseName // "")" + elif .resourceType == "BQ_DATASET" then "\(.projectId).\(.datasetId)" + elif .resourceType == "BQ_TABLE" then "\(.projectId).\(.datasetId).\(.tableId // "")" + elif .resourceType == "GIT_REPO" then .gitRepoUrl + elif .resourceType == "GCS_OBJECT" then "gs://\(.bucketName)/\(.objectName // "")" + else null end; + (if type == "array" then . else [] end) | + { + "resourcePaths": (map({key: .id, value: cloud_path}) | map(select(.value != null)) | from_entries), + "envVars": (map({key: ("WORKBENCH_" + (.id | gsub("-";"_"))), value: cloud_path}) | map(select(.value != null)) | from_entries) + } + ' 2>/dev/null | head -1) + [[ -n "$result" ]] || result='{"resourcePaths":{},"envVars":{}}' + printf '%s\n' "$result" } # Generate bucket list for data persistence section From 674d97ca68d6a87e5f547a21a3de0c060f472939 Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Tue, 5 May 2026 17:28:12 -0400 Subject: [PATCH 63/86] docs(aws): add IAM auth and SSL instructions for Aurora database access MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Aurora in Workbench requires IAM database authentication — static passwords are rejected with PAM authentication failed or no encryption errors.
Updates AWS DASHBOARD_BUILDER skill only (GCP skill unchanged): - Template 2 rewritten with the correct 4-step IAM auth flow: wb resource credentials → boto3 IAM token → psycopg2 with sslmode=require - Aurora troubleshooting section expanded with symptoms, step-by-step fix, and AWS CLI alternative using generate-db-auth-token - Checklist and quick reference table include Aurora IAM and SSL items Updates AWS data_preview_query_section in CLAUDE.md generation (AWS only): - Aurora bash preview uses generate-db-auth-token plus PGSSLMODE=require - Aurora Python example uses full wb credentials → boto3 → psycopg2 flow Co-authored-by: Cursor --- features/src/llm-context/generate-context.sh | 183 +++++++++++++++---- 1 file changed, 149 insertions(+), 34 deletions(-) diff --git a/features/src/llm-context/generate-context.sh b/features/src/llm-context/generate-context.sh index 5da449232..4ba033d7c 100755 --- a/features/src/llm-context/generate-context.sh +++ b/features/src/llm-context/generate-context.sh @@ -1883,36 +1883,68 @@ if __name__ == '__main__': ### Template 2: Aurora PostgreSQL Dashboard +Aurora in Workbench uses **IAM database authentication** — you cannot connect with a static +password. The correct flow is: + +1. Get temporary AWS credentials via `wb resource credentials` +2. Generate an IAM auth token via boto3 (token is valid for 15 minutes) +3. Connect with `sslmode='require'` — **SSL is mandatory; connections are rejected without it** + ```python -import psycopg2 -import pandas as pd -import os +import json, subprocess, boto3, psycopg2, pandas as pd, os + +def get_aurora_connection(resource_id: str, username: str): + """ + Returns an open psycopg2 connection to a Workbench-managed Aurora database. + resource_id: the Workbench resource ID (e.g. 
'test-db-1') + username: the IAM database user (check with your workspace admin) + """ + # Step 1 — get temporary AWS credentials from Workbench + result = subprocess.run( + ['wb', 'resource', 'credentials', + f'--id={resource_id}', '--scope=WRITE_READ', '--format=json'], + capture_output=True, text=True, check=True + ) + creds = json.loads(result.stdout) + + # Step 2 — parse connection details from WORKBENCH_* env var + # Format: "host:port/dbname" e.g. "abc.cluster.us-west-2.rds.amazonaws.com:5432/mydb" + conn_str = os.environ.get(f'WORKBENCH_{resource_id.replace("-", "_")}', '') + host_part, _, dbname = conn_str.partition('/') + host, _, port = host_part.partition(':') + port = int(port) if port else 5432 + + # Step 3 — generate IAM auth token (valid 15 min) + session = boto3.Session( + aws_access_key_id=creds['AccessKeyId'], + aws_secret_access_key=creds['SecretAccessKey'], + aws_session_token=creds['SessionToken'], + region_name='us-west-2' + ) + auth_token = session.client('rds').generate_db_auth_token( + DBHostname=host, Port=port, DBUsername=username, Region='us-west-2' + ) + + # Step 4 — connect with SSL (REQUIRED — Aurora rejects unencrypted connections) + return psycopg2.connect( + host=host, port=port, database=dbname, + user=username, password=auth_token, + sslmode='require' # mandatory — omitting this causes "PAM authentication failed" + ) def get_data_from_aurora(): global _data_cache if _data_cache is not None: return _data_cache - - # WORKBENCH_ contains "host:port/dbname" — use wb to get credentials: - # wb resource resolve aurora-database --id= - # Or hard-code connection details after running the above command once. 
- conn_ref = os.environ.get('WORKBENCH_my_aurora_db', '').split('/') - host_port = conn_ref[0].split(':') if conn_ref[0] else ['your-aurora-endpoint', '5432'] - host = host_port[0] - port = host_port[1] if len(host_port) > 1 else '5432' - dbname = conn_ref[1] if len(conn_ref) > 1 else 'your-db-name' - - conn = psycopg2.connect( - host=host, port=port, dbname=dbname, - user='your-user', password='your-password' - ) + conn = get_aurora_connection('test-db-1', 'your-iam-username') df = pd.read_sql('SELECT * FROM your_table LIMIT 1000', conn) conn.close() _data_cache = df.to_dict(orient='records') return _data_cache ``` -> **Tip:** Use `wb resource resolve aurora-database --id=` to get the connection string, or check the `WORKBENCH_*` env vars populated by Workbench context generation. +> **Why IAM auth?** Workbench-managed Aurora databases are configured for IAM authentication only. +> Static passwords will fail with "PAM authentication failed" or "pg_hba.conf rejects connection". ### Alternative: Embed Data in HTML (For Static Dashboards) @@ -1975,12 +2007,64 @@ env | grep WORKBENCH ### Aurora connection errors +Aurora requires IAM authentication + SSL. Plain password connections are rejected. + +**Symptoms and causes:** +- `"PAM authentication failed"` → not using IAM auth token as password +- `"pg_hba.conf rejects connection... no encryption"` → missing `sslmode='require'` +- `"SSL connection is required"` → same SSL issue + +**Step-by-step fix:** + +```bash +# 1. Get temporary credentials from Workbench (scoped to this resource) +wb resource credentials --id= --scope=WRITE_READ --format=json +# Returns: {"AccessKeyId":"...","SecretAccessKey":"...","SessionToken":"..."} +``` + +```python +import boto3, psycopg2, json, subprocess + +# 2. 
Generate IAM auth token +result = subprocess.run( + ['wb', 'resource', 'credentials', '--id=', '--scope=WRITE_READ', '--format=json'], + capture_output=True, text=True, check=True +) +creds = json.loads(result.stdout) + +session = boto3.Session( + aws_access_key_id=creds['AccessKeyId'], + aws_secret_access_key=creds['SecretAccessKey'], + aws_session_token=creds['SessionToken'], + region_name='us-west-2' +) +auth_token = session.client('rds').generate_db_auth_token( + DBHostname='', Port=5432, + DBUsername='', Region='us-west-2' +) + +# 3. Connect with SSL enabled (mandatory) +conn = psycopg2.connect( + host='', port=5432, database='', + user='', password=auth_token, + sslmode='require' # CRITICAL — without this, connection is rejected +) +``` + +**AWS CLI alternative (to verify the token works):** ```bash -# Get connection string from wb CLI -wb resource resolve aurora-database --id= +# Export the credentials first +export AWS_ACCESS_KEY_ID="..." +export AWS_SECRET_ACCESS_KEY="..." +export AWS_SESSION_TOKEN="..." + +# Generate auth token +TOKEN=$(aws rds generate-db-auth-token \ + --hostname --port 5432 \ + --region us-west-2 --username ) -# Test connectivity -psql "host= port=5432 dbname= user=" +# Connect (psql requires SSL flag) +PGSSLMODE=require psql "host= port=5432 dbname= user= password=$TOKEN" ``` ### Server not accessible through proxy @@ -2003,6 +2087,8 @@ app.run(host='0.0.0.0', port=8080) - [ ] **Data cached** - Avoid repeated S3 reads - [ ] **Error handling** - API returns errors as JSON, not crashes - [ ] **CORS enabled** - `CORS(app)` added +- [ ] **Aurora: IAM auth** - Using `wb resource credentials` + boto3 token, not a static password +- [ ] **Aurora: SSL enabled** - `sslmode='require'` in psycopg2.connect() --- @@ -2012,11 +2098,13 @@ app.run(host='0.0.0.0', port=8080) |-------|-------|-----| | 404 on API | Path format | Remove leading `/` from fetch | | CORS error | CORS setup | Add `CORS(app)` | -| Blank page | Server running? 
| `ps aux | grep python` | +| Blank page | Server running? | `ps aux \| grep python` | | S3 error | AWS credentials | `aws sts get-caller-identity` | | Wrong port | URL vs code | Match port in URL to `app.run()` | | Works locally, fails via URL | Host binding | Change `localhost` to `0.0.0.0` | | Gateway timeout | Server/UUID | Check server running + correct UUID | +| Aurora: PAM auth failed | IAM auth | Use `wb resource credentials` + boto3 token | +| Aurora: no encryption | SSL missing | Add `sslmode='require'` to psycopg2.connect() | --- @@ -2178,7 +2266,7 @@ generate_claude_md() { No direct AWS CLI MCP wrapper — use `aws` CLI commands in the terminal: - **S3**: `aws s3 ls s3:///`, `aws s3 cp ` - **Batch**: `aws batch list-jobs --job-queue --job-status FAILED` -- **Aurora**: `psql "host= port=5432 dbname= user="`' +- **Aurora**: requires IAM auth token — see Aurora connection instructions in DASHBOARD_BUILDER skill' cloud_path_hint='# Look for: bucketName+prefix (S3), rwEndpoint+port+databaseName (Aurora), gitRepoUrl' @@ -2191,32 +2279,59 @@ aws s3 ls s3://// aws s3 cp s3:////file.csv - | head -20 ``` -**Aurora PostgreSQL:** +**Aurora PostgreSQL** (requires IAM auth + SSL — plain passwords are rejected): ```bash -# Get endpoint from wb CLI -wb resource describe --format=json | jq .rwEndpoint -# Connect -psql "host= port= dbname= user=" +# Step 1: get temporary credentials from Workbench +wb resource credentials --id= --scope=WRITE_READ --format=json +# Returns: {"AccessKeyId":"...","SecretAccessKey":"...","SessionToken":"..."} + +# Step 2: export credentials, generate auth token, connect +export AWS_ACCESS_KEY_ID="..." AWS_SECRET_ACCESS_KEY="..." AWS_SESSION_TOKEN="..." 
+TOKEN=$(aws rds generate-db-auth-token --hostname --port 5432 --region us-west-2 --username ) +PGSSLMODE=require psql "host= port=5432 dbname= user= password=$TOKEN" # \dt → list tables; SELECT * FROM table_name LIMIT 10; ``` ### Query Data -**Python:** +**Python (S3):** ```python import boto3, pandas as pd -# Read CSV from S3 s3 = boto3.client("s3") obj = s3.get_object(Bucket="", Key="/file.csv") df = pd.read_csv(obj["Body"]) # Read Parquet directly (requires s3fs) df = pd.read_parquet("s3:////file.parquet") +``` -# Aurora PostgreSQL -import psycopg2 -conn = psycopg2.connect(host="", port=, dbname="", user="", password="") +**Python (Aurora — IAM auth required):** +```python +import json, subprocess, boto3, psycopg2 + +# Get temporary credentials from Workbench +creds = json.loads(subprocess.run( + ["wb", "resource", "credentials", "--id=", "--scope=WRITE_READ", "--format=json"], + capture_output=True, text=True, check=True +).stdout) + +# Generate IAM auth token +session = boto3.Session( + aws_access_key_id=creds["AccessKeyId"], + aws_secret_access_key=creds["SecretAccessKey"], + aws_session_token=creds["SessionToken"], + region_name="us-west-2" +) +auth_token = session.client("rds").generate_db_auth_token( + DBHostname="", Port=5432, DBUsername="", Region="us-west-2" +) + +# Connect — sslmode="require" is mandatory +conn = psycopg2.connect( + host="", port=5432, database="", + user="", password=auth_token, sslmode="require" +) df = pd.read_sql("SELECT * FROM table_name LIMIT 100", conn) conn.close() ```' From 6d6f694893af675582fbae9b8e9602fedbbf5ebf Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Mon, 11 May 2026 14:15:59 -0400 Subject: [PATCH 64/86] feat(mcp): add platform_list_data_collections tool for cross-workspace data discovery New tool searches all data collections accessible to the user platform-wide, not just those attached to the active workspace. 
Uses the same /api/workspaces/v2/filtered endpoint as workspace_list_all but pre-filters for terra-type=data-collection workspaces. Features: - Optional keyword filter (case-insensitive match on name and description) - Returns id, name, description, underlayName, and workspace properties - Includes scope label and attach command in response so Claude always communicates context to the user - Consistent map-based response style with rest of codebase Co-authored-by: Cursor --- features/src/wb-mcp-server/main.go | 138 +++++++++++++++++++++++++++++ 1 file changed, 138 insertions(+) diff --git a/features/src/wb-mcp-server/main.go b/features/src/wb-mcp-server/main.go index 5ebc5b16a..15e9c9807 100644 --- a/features/src/wb-mcp-server/main.go +++ b/features/src/wb-mcp-server/main.go @@ -397,6 +397,41 @@ Returns a structured list of data collections with their resources, types, and c }, }, + { + Name: "platform_list_data_collections", + Description: `Search and list all data collections accessible to the current user across all of Workbench — not just those attached to the active workspace. + +Use this when a user asks: +- "What data collections do I have access to?" +- "Find data collections related to " +- "Search for datasets across all of Workbench" +- "What datasets could I add to my workspace?" +- "Show me all accessible genomics / proteomics / clinical data" +- "Are there any data collections I haven't attached yet?" + +This tool searches PLATFORM-WIDE. It returns all data collections the user has READ access to, regardless of whether they are attached to the active workspace. + +Always tell the user upfront that this is a broader platform-wide search (not just their workspace). 
+ +To attach a found data collection to the current workspace: + wb data-collection add-to-workspace --id= + +Returns data collection names, IDs, descriptions, and underlay (data model) names where available.`, + InputSchema: InputSchema{ + Type: "object", + Properties: map[string]interface{}{ + "query": map[string]interface{}{ + "type": "string", + "description": "Optional keyword to filter data collections by name or description (case-insensitive substring match)", + }, + "limit": map[string]interface{}{ + "type": "number", + "description": "Maximum number of results to return (default: 100)", + }, + }, + }, + }, + { Name: "group_create", Description: "Create a user group. Use this when managing multiple users with same access needs. Groups simplify permission management - grant access to group instead of individual users.", @@ -1688,6 +1723,109 @@ func handleCallTool(params CallToolParams) CallToolResult { output = string(respBody) } + case "platform_list_data_collections": + // Fetch all data collections accessible to the user across all workspaces. + // Data collections are workspaces with the property terra-type=data-collection. 
+ limit := 100 + if l, ok := params.Arguments["limit"].(float64); ok { + limit = int(l) + } + query := "" + if q, ok := params.Arguments["query"].(string); ok { + query = strings.ToLower(strings.TrimSpace(q)) + } + + body := map[string]interface{}{ + "limit": limit, + "offset": 0, + "properties": []map[string]string{ + {"key": "terra-type", "value": "data-collection"}, + }, + } + respBody, apiErr := makeAPIRequest("POST", workspaceBaseURL+"/api/workspaces/v2/filtered", body) + if apiErr != nil { + err = apiErr + break + } + + var wsData map[string]interface{} + if jsonErr := json.Unmarshal(respBody, &wsData); jsonErr != nil { + err = fmt.Errorf("failed to parse response: %w", jsonErr) + break + } + + workspaces, _ := wsData["workspaces"].([]interface{}) + if workspaces == nil { + workspaces = []interface{}{} + } + + var collections []map[string]interface{} + for _, w := range workspaces { + ws, ok := w.(map[string]interface{}) + if !ok { + continue + } + + id, _ := ws["id"].(string) + name, _ := ws["displayName"].(string) + if name == "" { + name, _ = ws["userFacingId"].(string) + } + desc, _ := ws["description"].(string) + + // Apply optional keyword filter against name and description + if query != "" { + nameMatch := strings.Contains(strings.ToLower(name), query) + descMatch := strings.Contains(strings.ToLower(desc), query) + if !nameMatch && !descMatch { + continue + } + } + + dc := map[string]interface{}{ + "id": id, + "name": name, + } + if desc != "" { + dc["description"] = desc + } + + // Extract workspace properties — pick out underlay name if present + if propsArray, ok := ws["properties"].([]interface{}); ok { + props := make(map[string]string) + for _, p := range propsArray { + if prop, ok := p.(map[string]interface{}); ok { + k, _ := prop["key"].(string) + v, _ := prop["value"].(string) + props[k] = v + } + } + if underlayName, ok := props["terra-dx-underlay-name"]; ok && underlayName != "" { + dc["underlayName"] = underlayName + } + dc["properties"] = 
props + } + + collections = append(collections, dc) + } + + if collections == nil { + collections = []map[string]interface{}{} + } + + result := map[string]interface{}{ + "dataCollections": collections, + "total": len(collections), + "scope": "platform-wide (all data collections you have READ access to)", + "attachCommand": "wb data-collection add-to-workspace --id=", + } + resultBytes, marshalErr := json.MarshalIndent(result, "", " ") + if marshalErr != nil { + err = fmt.Errorf("failed to marshal result: %w", marshalErr) + } else { + output = string(resultBytes) + } + case "workspace_get": workspaceId, ok := params.Arguments["workspaceId"].(string) if !ok { From d22009e718f8a4650b5a2b44647ac32ac2a0bf01 Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Mon, 11 May 2026 17:45:53 -0400 Subject: [PATCH 65/86] feat(mcp): enrich platform_list_data_collections with full terra property metadata MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All data collection metadata is stored as terra-* workspace properties — no additional API calls or workspace context switches needed. Extracts and returns: - shortDescription, organization, availability, isFree, isInstantlyAccessible - patientCount, timeFrame, geographicCoverage, dataModel - dataModalityTags, therapeuticTags - underlayName, dataDictionary, usageExamples (incl. sample SQL queries) - accessGroupName, supportEmail - dataPublished, metadataLastUpdated, externalDocumentation Broadens keyword search to match against modality tags, therapeutic tags, and data model type in addition to name and description. 
Co-authored-by: Cursor --- features/src/wb-mcp-server/main.go | 139 ++++++++++++++++++++++++----- 1 file changed, 115 insertions(+), 24 deletions(-) diff --git a/features/src/wb-mcp-server/main.go b/features/src/wb-mcp-server/main.go index 15e9c9807..fd3643060 100644 --- a/features/src/wb-mcp-server/main.go +++ b/features/src/wb-mcp-server/main.go @@ -406,17 +406,31 @@ Use this when a user asks: - "Find data collections related to " - "Search for datasets across all of Workbench" - "What datasets could I add to my workspace?" -- "Show me all accessible genomics / proteomics / clinical data" +- "Show me all accessible genomics / proteomics / clinical / imaging data" - "Are there any data collections I haven't attached yet?" +- "Find me datasets about " -This tool searches PLATFORM-WIDE. It returns all data collections the user has READ access to, regardless of whether they are attached to the active workspace. +This tool searches PLATFORM-WIDE. It returns all data collections the user has READ access to, +regardless of whether they are attached to the active workspace. Always tell the user upfront that this is a broader platform-wide search (not just their workspace). -To attach a found data collection to the current workspace: - wb data-collection add-to-workspace --id= +The keyword search matches against: name, description, data modality tags, therapeutic area tags, +and data model type — so queries like "imaging", "genomics", "oncology" will match relevant collections. -Returns data collection names, IDs, descriptions, and underlay (data model) names where available.`, +Each result includes rich metadata sourced directly from the data collection: +- shortDescription, description, organization, availability, isFree, isInstantlyAccessible +- patientCount, timeFrame, geographicCoverage, dataModel +- dataModalityTags (e.g. imaging, lab-results, ecrf), therapeuticTags (e.g. 
oncology, general-health) +- underlayName (the data model identifier for schema exploration) +- dataDictionary (links to schema documentation) +- usageExamples (sample use cases and SQL queries) +- accessGroupName, supportEmail +- dataPublished, metadataLastUpdated, externalDocumentation + +Present results in a human-readable summary grouped by relevance. For each matching collection, +highlight the most relevant fields for the user's query (e.g. patient count and modality for +clinical searches, underlay name for schema exploration).`, InputSchema: InputSchema{ Type: "object", Properties: map[string]interface{}{ @@ -1773,37 +1787,114 @@ func handleCallTool(params CallToolParams) CallToolResult { } desc, _ := ws["description"].(string) - // Apply optional keyword filter against name and description + // Extract all terra-* workspace properties into a flat map + props := make(map[string]string) + if propsArray, ok := ws["properties"].([]interface{}); ok { + for _, p := range propsArray { + if prop, ok := p.(map[string]interface{}); ok { + k, _ := prop["key"].(string) + v, _ := prop["value"].(string) + props[k] = v + } + } + } + + // Apply optional keyword filter across name, description, short description, + // modality tags, and therapeutic tags so searches like "genomics" or "imaging" work if query != "" { - nameMatch := strings.Contains(strings.ToLower(name), query) - descMatch := strings.Contains(strings.ToLower(desc), query) - if !nameMatch && !descMatch { + searchTargets := strings.Join([]string{ + strings.ToLower(name), + strings.ToLower(desc), + strings.ToLower(props["terra-workspace-short-description"]), + strings.ToLower(props["terra-data-modality-tags"]), + strings.ToLower(props["terra-therapeutic-tags"]), + strings.ToLower(props["terra-dc-data-model"]), + }, " ") + if !strings.Contains(searchTargets, query) { continue } } + // Build structured result with all meaningful metadata fields dc := map[string]interface{}{ "id": id, "name": name, } + + // 
Overview + if v := props["terra-workspace-short-description"]; v != "" { + dc["shortDescription"] = v + } if desc != "" { dc["description"] = desc } + if v := props["terra-organization-name"]; v != "" { + dc["organization"] = v + } + if v := props["terra-dc-availability"]; v != "" { + dc["availability"] = v + } + if v := props["terra-dc-is-free"]; v != "" { + dc["isFree"] = v == "true" + } + if v := props["terra-is-instantly-accessible"]; v != "" { + dc["isInstantlyAccessible"] = v == "true" + } - // Extract workspace properties — pick out underlay name if present - if propsArray, ok := ws["properties"].([]interface{}); ok { - props := make(map[string]string) - for _, p := range propsArray { - if prop, ok := p.(map[string]interface{}); ok { - k, _ := prop["key"].(string) - v, _ := prop["value"].(string) - props[k] = v - } - } - if underlayName, ok := props["terra-dx-underlay-name"]; ok && underlayName != "" { - dc["underlayName"] = underlayName - } - dc["properties"] = props + // Data characteristics + if v := props["terra-dc-patient-count"]; v != "" { + dc["patientCount"] = v + } + if v := props["terra-dc-time-frame"]; v != "" { + dc["timeFrame"] = v + } + if v := props["terra-dc-geographic-coverage"]; v != "" { + dc["geographicCoverage"] = v + } + if v := props["terra-dc-data-model"]; v != "" { + dc["dataModel"] = v + } + if v := props["terra-data-modality-tags"]; v != "" { + dc["dataModalityTags"] = v + } + if v := props["terra-therapeutic-tags"]; v != "" { + dc["therapeuticTags"] = v + } + + // Schema / underlay + if v := props["terra-dx-underlay-name"]; v != "" { + dc["underlayName"] = v + } + + // Data dictionary + if v := props["terra-dc-data-dictionary"]; v != "" { + dc["dataDictionary"] = v + } + + // Usage examples (includes sample queries) + if v := props["terra-dc-usage-examples-sample-use-cases"]; v != "" { + dc["usageExamples"] = v + } + + // Access + if v := props["terra-access-group-name"]; v != "" { + dc["accessGroupName"] = v + } + if v := 
props["terra-support-email"]; v != "" { + dc["supportEmail"] = v + } + + // Publication / freshness + if v := props["terra-dc-data-published"]; v != "" { + dc["dataPublished"] = v + } + if v := props["terra-dc-metadata-last-updated"]; v != "" { + dc["metadataLastUpdated"] = v + } + + // External documentation + if v := props["terra-dc-external-documentation"]; v != "" { + dc["externalDocumentation"] = v } collections = append(collections, dc) @@ -1817,7 +1908,7 @@ func handleCallTool(params CallToolParams) CallToolResult { "dataCollections": collections, "total": len(collections), "scope": "platform-wide (all data collections you have READ access to)", - "attachCommand": "wb data-collection add-to-workspace --id=", + "attachCommand": "wb workspace clone --id= # or ask your workspace admin to attach the collection", } resultBytes, marshalErr := json.MarshalIndent(result, "", " ") if marshalErr != nil { From cc2362ea255a9bcfba59fe09997fb52effa20076 Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Mon, 11 May 2026 17:57:16 -0400 Subject: [PATCH 66/86] feat: add DATA_DISCOVERY skill and enrich platform_list_data_collections URLs MCP tool (main.go): - Extract userFacingId and construct workbenchUrl per collection (https://workbench.verily.com/data-collections/) - Return uuid separately for API use vs id for UI links Skill (DATA_DISCOVERY.md): - Triggers on workspace-scoped and platform-wide data discovery prompts - Step 0: always asks user to clarify search scope before proceeding - Step 1: clarifies search criteria (modality, disease, population, access) - Step 2: uses platform_list_data_collections MCP tool first, CLI fallback - Step 3: presents results with all rich metadata fields, offers refinement - Step 4: provides workbenchUrl and instructions to add via Workbench UI generate-context.sh: - Copies DATA_DISCOVERY.md into skills directory at context generation time - Registers skill in CLAUDE.md skills table and trigger guide Co-authored-by: Cursor --- 
features/src/llm-context/generate-context.sh | 16 ++ .../src/llm-context/skills/DATA_DISCOVERY.md | 154 ++++++++++++++++++ features/src/wb-mcp-server/main.go | 19 ++- 3 files changed, 184 insertions(+), 5 deletions(-) create mode 100644 features/src/llm-context/skills/DATA_DISCOVERY.md diff --git a/features/src/llm-context/generate-context.sh b/features/src/llm-context/generate-context.sh index 4ba033d7c..d8210ee70 100755 --- a/features/src/llm-context/generate-context.sh +++ b/features/src/llm-context/generate-context.sh @@ -110,6 +110,14 @@ install_skills() { local cloud_platform="${1:-GCP}" log_info "Installing skill files..." + # Copy DATA_DISCOVERY.md skill from the feature source directory + log_info "Creating DATA_DISCOVERY.md skill..." + if [ -f "$(dirname "$0")/skills/DATA_DISCOVERY.md" ]; then + cp "$(dirname "$0")/skills/DATA_DISCOVERY.md" "${SKILLS_DIR}/DATA_DISCOVERY.md" + else + log_info "DATA_DISCOVERY.md not found in feature source, skipping" + fi + # Create CUSTOM_APP.md skill (full version, embedded) log_info "Creating CUSTOM_APP.md skill..." 
cat > "${SKILLS_DIR}/CUSTOM_APP.md" << 'SKILL_EOF' @@ -2696,6 +2704,7 @@ Read these directly — no index needed: | Topic | Skill File | When to Use | |-------|------------|-------------| +| **🔍 Data discovery** | \`DATA_DISCOVERY.md\` | Find data collections inside or across all of Workbench | | **🚨 Dashboards, Web UIs** | \`DASHBOARD_BUILDER.md\` | Dashboard, Flask, Streamlit, web UI, plots on a port | | Building custom apps | \`CUSTOM_APP.md\` | Deployable Workbench apps | | App templates | \`APP_TEMPLATES.md\` | Pre-built templates for dashboards, APIs, file processors | @@ -2715,6 +2724,13 @@ Read these directly — no index needed: ### ⚡ Skill Trigger Guide +**Read \`DATA_DISCOVERY.md\` when:** +- "find data collections" / "search for data" / "what data do I have access to" +- "find me a dataset" / "search across workbench" / "what datasets are available" +- "genomics data" / "clinical data" / "imaging data" / "proteomics data" +- "data I can add to my workspace" / "data collections I haven't attached" +- "find data related to [disease / topic]" + **ALWAYS read \`DASHBOARD_BUILDER.md\` FIRST when user says ANY of these:** - "create a dashboard" - "visualize data" / "show me a chart" / "display data" diff --git a/features/src/llm-context/skills/DATA_DISCOVERY.md b/features/src/llm-context/skills/DATA_DISCOVERY.md new file mode 100644 index 000000000..5e2bea5ef --- /dev/null +++ b/features/src/llm-context/skills/DATA_DISCOVERY.md @@ -0,0 +1,154 @@ +# Data Collection Discovery + +--- + +## When to Use This Skill + +Read this skill whenever the user asks about finding, searching, or exploring data collections — whether inside their active workspace or across all of Workbench. + +**Trigger phrases — workspace-scoped search:** +- "What data is in my workspace?" +- "What data collections are attached to my workspace?" +- "Show me the resources in my workspace" +- "What datasets do I have access to here?" 
+ +**Trigger phrases — platform-wide search:** +- "Find data collections across Workbench" +- "What data do I have access to?" +- "Search for [disease / modality / population] datasets" +- "Are there any genomics / clinical / imaging datasets I can use?" +- "Find me a dataset related to [topic]" +- "What data collections exist that I haven't added yet?" +- "Show me all accessible data" +- "Is there a [cancer / diabetes / cardiovascular] dataset available?" + +--- + +## Step 0 — Clarify the Search Scope + +**Always ask the user first:** + +> "Would you like me to search only within your active workspace, or search across all data collections you have access to in Workbench (platform-wide)?" + +- **Workspace-only**: Use `workspace_list_data_collections` — fast, shows only what's already attached +- **Platform-wide**: Use `platform_list_data_collections` — broader, searches all accessible collections + +Do not assume scope. If the user's intent is ambiguous, ask. + +--- + +## Step 1 — Clarify Search Criteria + +Before searching, confirm what the user is looking for: + +- **Topic / disease area** (e.g., oncology, cardiovascular, diabetes, general health) +- **Data modality** (e.g., genomics, imaging, lab results, patient-reported outcomes, EHR/EHR-derived) +- **Population** (e.g., age range, geography, study size) +- **Access type** (free vs. controlled access, instantly accessible vs. requires approval) +- **Data model** (e.g., standard underlay like AoU, custom schema) + +If the user has already provided enough context, proceed directly to Step 2. 
+ +--- + +## Step 2 — Search + +### Platform-wide search (primary) + +Use the MCP tool first: + +``` +mcp__wb__platform_list_data_collections(query="") +``` + +- Pass the user's topic, modality, or disease area as `query` +- The tool searches across: name, description, modality tags, therapeutic tags, data model +- If no `query` is provided, it returns all accessible data collections + +If the MCP tool is unavailable, fall back to: +```bash +wb workspace list --format=json | jq '[.[] | select(.properties[]? | select(.key=="terra-type" and .value=="data-collection"))]' +``` + +### Workspace-scoped search + +``` +mcp__wb__workspace_list_data_collections() +``` + +### Search across all returned metadata + +For each result, the tool returns the following fields — use ALL of them when evaluating relevance: + +| Field | What it tells you | +|---|---| +| `name` | Collection name | +| `shortDescription` | One-line summary | +| `description` | Full overview including provenance and methodology | +| `organization` | Who owns the data | +| `availability` | Public open access / Public controlled access / Private | +| `isFree` | Whether access is free | +| `isInstantlyAccessible` | Whether access is immediate or requires approval | +| `patientCount` | Study size | +| `timeFrame` | Date range of data collection | +| `geographicCoverage` | Countries / regions | +| `dataModel` | Schema type (e.g., standard underlay, Non-standard custom) | +| `dataModalityTags` | Types of data (imaging, lab-results, ecrf, genomics, etc.) | +| `therapeuticTags` | Disease/health areas (oncology, general-health, etc.) 
| +| `underlayName` | Data model identifier — use with `underlay_list_entities` for schema exploration | +| `dataDictionary` | Links to schema documentation | +| `usageExamples` | Sample use cases and SQL queries | +| `accessGroupName` | Access group required | +| `supportEmail` | Who to contact | +| `workbenchUrl` | Direct link to the collection in the Workbench UI | + +--- + +## Step 3 — Present Results and Offer to Refine + +Present matching collections in a clear summary. For each result, highlight the fields most relevant to the user's query. Example format: + +--- +**[Collection Name]** +- **Summary**: [shortDescription] +- **Data types**: [dataModalityTags] +- **Patients**: [patientCount] | **Time frame**: [timeFrame] | **Geography**: [geographicCoverage] +- **Access**: [availability] | Free: [isFree] | Instant: [isInstantlyAccessible] +- **View in Workbench**: [workbenchUrl] +--- + +After presenting results, ask: + +> "Do any of these match what you're looking for? Would you like to refine the search — for example, filter by data type, study size, or access level?" + +If the user wants deeper detail on a specific collection: +- Use `underlayName` with `mcp__wb__underlay_list_entities` to explore the data schema +- Reference `usageExamples` for sample queries +- Reference `dataDictionary` for table/field documentation + +--- + +## Step 4 — Add to Workspace + +If the user wants to use a data collection: + +1. Provide the direct link to the collection: + > "You can view and request access to **[Collection Name]** here: [workbenchUrl]" + +2. Instruct them to click **"Add to Workspace"** or **"Get Access"** in the Workbench UI. The button label depends on whether the collection is instantly accessible or requires approval. + +3. If the collection is instantly accessible (`isInstantlyAccessible: true`), tell them: + > "This collection is instantly accessible — once you click 'Add to Workspace', the resources will be available in your workspace immediately." + +4. 
If it requires approval (`isInstantlyAccessible: false`): + > "This collection requires access approval. After you submit the request at [workbenchUrl], access is typically granted after review." + +5. After the user confirms they've added the collection, use `workspace_list_data_collections` to confirm the resources are now visible in their workspace. + +--- + +## Notes + +- The platform-wide search may not return every data collection in Workbench — some collections use different workspace tags. If the user knows a specific collection exists but it's not in the results, direct them to browse the full catalog at: `https://workbench.verily.com/data-collections` +- `workspace_list_data_collections` only shows collections already attached to the active workspace +- `platform_list_data_collections` searches platform-wide but requires the user to have at least READ access to the collection workspace diff --git a/features/src/wb-mcp-server/main.go b/features/src/wb-mcp-server/main.go index fd3643060..b9c5fcb64 100644 --- a/features/src/wb-mcp-server/main.go +++ b/features/src/wb-mcp-server/main.go @@ -1780,13 +1780,19 @@ func handleCallTool(params CallToolParams) CallToolResult { continue } - id, _ := ws["id"].(string) + uuid, _ := ws["id"].(string) + userFacingId, _ := ws["userFacingId"].(string) name, _ := ws["displayName"].(string) if name == "" { - name, _ = ws["userFacingId"].(string) + name = userFacingId } desc, _ := ws["description"].(string) + // Derive the Workbench UI URL for this data collection + // workspaceBaseURL is e.g. 
https://workbench.verily.com/api/wsm + workbenchBaseUI := strings.Replace(workspaceBaseURL, "/api/wsm", "", 1) + collectionURL := fmt.Sprintf("%s/data-collections/%s", workbenchBaseUI, userFacingId) + // Extract all terra-* workspace properties into a flat map props := make(map[string]string) if propsArray, ok := ws["properties"].([]interface{}); ok { @@ -1800,7 +1806,8 @@ func handleCallTool(params CallToolParams) CallToolResult { } // Apply optional keyword filter across name, description, short description, - // modality tags, and therapeutic tags so searches like "genomics" or "imaging" work + // modality tags, and therapeutic tags so searches like "genomics" or "imaging" work. + // Props are extracted before the filter so tags are available for matching. if query != "" { searchTargets := strings.Join([]string{ strings.ToLower(name), @@ -1817,8 +1824,10 @@ func handleCallTool(params CallToolParams) CallToolResult { // Build structured result with all meaningful metadata fields dc := map[string]interface{}{ - "id": id, - "name": name, + "id": userFacingId, + "uuid": uuid, + "name": name, + "workbenchUrl": collectionURL, } // Overview From c9ccc5a6aeea742f8379de9581304b3f8b9aba93 Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Mon, 11 May 2026 18:05:09 -0400 Subject: [PATCH 67/86] fix: tighten DATA_DISCOVERY skill triggers to platform-wide only Removes workspace-scoped trigger phrases to avoid conflicting with workspace_list_data_collections. Skill now only activates for cross-workspace / platform-wide discovery. Step 0 explicitly short-circuits to workspace_list_data_collections if user is asking about their active workspace. 
Co-authored-by: Cursor --- features/src/llm-context/generate-context.sh | 11 ++++++----- .../src/llm-context/skills/DATA_DISCOVERY.md | 16 +++++----------- 2 files changed, 11 insertions(+), 16 deletions(-) diff --git a/features/src/llm-context/generate-context.sh b/features/src/llm-context/generate-context.sh index d8210ee70..904027abb 100755 --- a/features/src/llm-context/generate-context.sh +++ b/features/src/llm-context/generate-context.sh @@ -2724,12 +2724,13 @@ Read these directly — no index needed: ### ⚡ Skill Trigger Guide -**Read \`DATA_DISCOVERY.md\` when:** -- "find data collections" / "search for data" / "what data do I have access to" -- "find me a dataset" / "search across workbench" / "what datasets are available" -- "genomics data" / "clinical data" / "imaging data" / "proteomics data" -- "data I can add to my workspace" / "data collections I haven't attached" +**Read \`DATA_DISCOVERY.md\` when the user wants to search ACROSS Workbench (not just their active workspace):** +- "find data collections" / "search for data across workbench" +- "find me a dataset" / "what datasets are available" +- "genomics data" / "clinical data" / "imaging data" / "proteomics data" (without a specific workspace context) +- "data I can add to my workspace" / "data collections I haven't attached yet" - "find data related to [disease / topic]" +- Do NOT use this skill for "what's in my workspace" — call \`workspace_list_data_collections\` directly instead **ALWAYS read \`DASHBOARD_BUILDER.md\` FIRST when user says ANY of these:** - "create a dashboard" diff --git a/features/src/llm-context/skills/DATA_DISCOVERY.md b/features/src/llm-context/skills/DATA_DISCOVERY.md index 5e2bea5ef..da1e79f83 100644 --- a/features/src/llm-context/skills/DATA_DISCOVERY.md +++ b/features/src/llm-context/skills/DATA_DISCOVERY.md @@ -6,13 +6,7 @@ Read this skill whenever the user asks about finding, searching, or exploring data collections — whether inside their active workspace or across all 
of Workbench. -**Trigger phrases — workspace-scoped search:** -- "What data is in my workspace?" -- "What data collections are attached to my workspace?" -- "Show me the resources in my workspace" -- "What datasets do I have access to here?" - -**Trigger phrases — platform-wide search:** +**Trigger phrases — platform-wide search only (read this skill):** - "Find data collections across Workbench" - "What data do I have access to?" - "Search for [disease / modality / population] datasets" @@ -26,14 +20,14 @@ Read this skill whenever the user asks about finding, searching, or exploring da ## Step 0 — Clarify the Search Scope -**Always ask the user first:** +**If the user's intent is ambiguous** (e.g., they said "find me data" without specifying where), ask: > "Would you like me to search only within your active workspace, or search across all data collections you have access to in Workbench (platform-wide)?" -- **Workspace-only**: Use `workspace_list_data_collections` — fast, shows only what's already attached -- **Platform-wide**: Use `platform_list_data_collections` — broader, searches all accessible collections +- **Workspace-only**: Call `workspace_list_data_collections` directly — no need to continue with this skill +- **Platform-wide**: Continue with Steps 1–4 below -Do not assume scope. If the user's intent is ambiguous, ask. +If the user clearly said "in my workspace" or asked about attached resources, skip this skill entirely and call `workspace_list_data_collections` directly. 
--- From 9a2fbbf77e9329ab6eedc50fa9c402aa1e677ee6 Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Mon, 11 May 2026 18:09:05 -0400 Subject: [PATCH 68/86] fix: narrow DATA_DISCOVERY triggers to explicit platform-wide search only Co-authored-by: Cursor --- features/src/llm-context/generate-context.sh | 13 +++++---- .../src/llm-context/skills/DATA_DISCOVERY.md | 27 ++++++++++++------- 2 files changed, 24 insertions(+), 16 deletions(-) diff --git a/features/src/llm-context/generate-context.sh b/features/src/llm-context/generate-context.sh index 904027abb..3c807a67a 100755 --- a/features/src/llm-context/generate-context.sh +++ b/features/src/llm-context/generate-context.sh @@ -2724,13 +2724,12 @@ Read these directly — no index needed: ### ⚡ Skill Trigger Guide -**Read \`DATA_DISCOVERY.md\` when the user wants to search ACROSS Workbench (not just their active workspace):** -- "find data collections" / "search for data across workbench" -- "find me a dataset" / "what datasets are available" -- "genomics data" / "clinical data" / "imaging data" / "proteomics data" (without a specific workspace context) -- "data I can add to my workspace" / "data collections I haven't attached yet" -- "find data related to [disease / topic]" -- Do NOT use this skill for "what's in my workspace" — call \`workspace_list_data_collections\` directly instead +**Read \`DATA_DISCOVERY.md\` ONLY when the user is searching for data collections they don't yet have, platform-wide:** +- "search all data collections I have access to" / "find data collections across Workbench" +- "what data collections can I add to my workspace?" / "data collections I haven't added yet" +- "find a data collection related to [topic / disease / modality]" +- "search across all Workbench data collections" / "what data collections are available on the platform?" 
+- Do NOT use this skill for workspace-scoped questions — call \`workspace_list_data_collections\` directly instead **ALWAYS read \`DASHBOARD_BUILDER.md\` FIRST when user says ANY of these:** - "create a dashboard" diff --git a/features/src/llm-context/skills/DATA_DISCOVERY.md b/features/src/llm-context/skills/DATA_DISCOVERY.md index da1e79f83..5691bc11c 100644 --- a/features/src/llm-context/skills/DATA_DISCOVERY.md +++ b/features/src/llm-context/skills/DATA_DISCOVERY.md @@ -4,17 +4,26 @@ ## When to Use This Skill -Read this skill whenever the user asks about finding, searching, or exploring data collections — whether inside their active workspace or across all of Workbench. +**Only read this skill when the user is explicitly searching for data collections they do not yet have in their workspace — across all of Workbench.** -**Trigger phrases — platform-wide search only (read this skill):** +Do NOT read this skill if the user is asking about data already in their workspace. In that case, call `workspace_list_data_collections` or `workspace_list_resources` directly. + +**Read this skill ONLY when the user says something like:** +- "Search all data collections I have access to" - "Find data collections across Workbench" -- "What data do I have access to?" -- "Search for [disease / modality / population] datasets" -- "Are there any genomics / clinical / imaging datasets I can use?" -- "Find me a dataset related to [topic]" -- "What data collections exist that I haven't added yet?" -- "Show me all accessible data" -- "Is there a [cancer / diabetes / cardiovascular] dataset available?" +- "What data collections can I add to my workspace?" +- "Are there any data collections I haven't added yet?" +- "Find a data collection related to [topic / disease / modality]" +- "Search across all Workbench data collections for [keyword]" +- "What data collections are available on the platform?" 
+- "Browse all accessible data collections" + +**Listing data collections in my workspace** — do NOT read this skill, call `workspace_list_data_collections` directly: +- "What data collections are in my workspace?" +- "What data is attached to my workspace?" +- "List the data collections I have" +- "What datasets do I have in this workspace?" +- "Show me the data collections in my workspace" --- From de7139eb213b01633baf19e89f4cb9f977e3fc42 Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Mon, 11 May 2026 18:10:03 -0400 Subject: [PATCH 69/86] fix: remove overly broad note from DATA_DISCOVERY skill Co-authored-by: Cursor --- features/src/llm-context/skills/DATA_DISCOVERY.md | 1 - 1 file changed, 1 deletion(-) diff --git a/features/src/llm-context/skills/DATA_DISCOVERY.md b/features/src/llm-context/skills/DATA_DISCOVERY.md index 5691bc11c..d60fecea6 100644 --- a/features/src/llm-context/skills/DATA_DISCOVERY.md +++ b/features/src/llm-context/skills/DATA_DISCOVERY.md @@ -152,6 +152,5 @@ If the user wants to use a data collection: ## Notes -- The platform-wide search may not return every data collection in Workbench — some collections use different workspace tags. 
If the user knows a specific collection exists but it's not in the results, direct them to browse the full catalog at: `https://workbench.verily.com/data-collections` - `workspace_list_data_collections` only shows collections already attached to the active workspace - `platform_list_data_collections` searches platform-wide but requires the user to have at least READ access to the collection workspace From db9eace707f0c5c5d6ec7e67c92a53acf51aeebe Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Mon, 11 May 2026 20:26:41 -0400 Subject: [PATCH 70/86] fix: embed DATA_DISCOVERY.md as heredoc in generate-context.sh MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cp approach relied on feature source files being present at runtime, which they are not inside the devcontainer. All other skills use embedded heredocs — align DATA_DISCOVERY.md with the same pattern so it is always written correctly at app startup. Co-authored-by: Cursor --- features/src/llm-context/generate-context.sh | 165 ++++++++++++++++++- 1 file changed, 159 insertions(+), 6 deletions(-) diff --git a/features/src/llm-context/generate-context.sh b/features/src/llm-context/generate-context.sh index 3c807a67a..2b3b4d0d2 100755 --- a/features/src/llm-context/generate-context.sh +++ b/features/src/llm-context/generate-context.sh @@ -110,13 +110,166 @@ install_skills() { local cloud_platform="${1:-GCP}" log_info "Installing skill files..." - # Copy DATA_DISCOVERY.md skill from the feature source directory + # Create DATA_DISCOVERY.md skill (full version, embedded) log_info "Creating DATA_DISCOVERY.md skill..." 
- if [ -f "$(dirname "$0")/skills/DATA_DISCOVERY.md" ]; then - cp "$(dirname "$0")/skills/DATA_DISCOVERY.md" "${SKILLS_DIR}/DATA_DISCOVERY.md" - else - log_info "DATA_DISCOVERY.md not found in feature source, skipping" - fi + cat > "${SKILLS_DIR}/DATA_DISCOVERY.md" << 'DATA_DISCOVERY_EOF' +# Data Collection Discovery + +--- + +## When to Use This Skill + +**Only read this skill when the user is explicitly searching for data collections they do not yet have in their workspace — across all of Workbench.** + +Do NOT read this skill if the user is asking about data already in their workspace. In that case, call `workspace_list_data_collections` or `workspace_list_resources` directly. + +**Read this skill ONLY when the user says something like:** +- "Search all data collections I have access to" +- "Find data collections across Workbench" +- "What data collections can I add to my workspace?" +- "Are there any data collections I haven't added yet?" +- "Find a data collection related to [topic / disease / modality]" +- "Search across all Workbench data collections for [keyword]" +- "What data collections are available on the platform?" +- "Browse all accessible data collections" + +**Listing data collections in my workspace** — do NOT read this skill, call `workspace_list_data_collections` directly: +- "What data collections are in my workspace?" +- "What data is attached to my workspace?" +- "List the data collections I have" +- "What datasets do I have in this workspace?" +- "Show me the data collections in my workspace" + +--- + +## Step 0 — Clarify the Search Scope + +**If the user's intent is ambiguous** (e.g., they said "find me data" without specifying where), ask: + +> "Would you like me to search only within your active workspace, or search across all data collections you have access to in Workbench (platform-wide)?" 
+ +- **Workspace-only**: Call `workspace_list_data_collections` directly — no need to continue with this skill +- **Platform-wide**: Continue with Steps 1–4 below + +If the user clearly said "in my workspace" or asked about attached resources, skip this skill entirely and call `workspace_list_data_collections` directly. + +--- + +## Step 1 — Clarify Search Criteria + +Before searching, confirm what the user is looking for: + +- **Topic / disease area** (e.g., oncology, cardiovascular, diabetes, general health) +- **Data modality** (e.g., genomics, imaging, lab results, patient-reported outcomes, EHR/EHR-derived) +- **Population** (e.g., age range, geography, study size) +- **Access type** (free vs. controlled access, instantly accessible vs. requires approval) +- **Data model** (e.g., standard underlay like AoU, custom schema) + +If the user has already provided enough context, proceed directly to Step 2. + +--- + +## Step 2 — Search + +### Platform-wide search (primary) + +Use the MCP tool first: + +``` +mcp__wb__platform_list_data_collections(query="") +``` + +- Pass the user's topic, modality, or disease area as `query` +- The tool searches across: name, description, modality tags, therapeutic tags, data model +- If no `query` is provided, it returns all accessible data collections + +If the MCP tool is unavailable, fall back to: +```bash +wb workspace list --format=json | jq '[.[] | select(.properties[]? 
| select(.key=="terra-type" and .value=="data-collection"))]' +``` + +### Workspace-scoped search + +``` +mcp__wb__workspace_list_data_collections() +``` + +### Search across all returned metadata + +For each result, the tool returns the following fields — use ALL of them when evaluating relevance: + +| Field | What it tells you | +|---|---| +| `name` | Collection name | +| `shortDescription` | One-line summary | +| `description` | Full overview including provenance and methodology | +| `organization` | Who owns the data | +| `availability` | Public open access / Public controlled access / Private | +| `isFree` | Whether access is free | +| `isInstantlyAccessible` | Whether access is immediate or requires approval | +| `patientCount` | Study size | +| `timeFrame` | Date range of data collection | +| `geographicCoverage` | Countries / regions | +| `dataModel` | Schema type (e.g., standard underlay, Non-standard custom) | +| `dataModalityTags` | Types of data (imaging, lab-results, ecrf, genomics, etc.) | +| `therapeuticTags` | Disease/health areas (oncology, general-health, etc.) | +| `underlayName` | Data model identifier — use with `underlay_list_entities` for schema exploration | +| `dataDictionary` | Links to schema documentation | +| `usageExamples` | Sample use cases and SQL queries | +| `accessGroupName` | Access group required | +| `supportEmail` | Who to contact | +| `workbenchUrl` | Direct link to the collection in the Workbench UI | + +--- + +## Step 3 — Present Results and Offer to Refine + +Present matching collections in a clear summary. For each result, highlight the fields most relevant to the user's query. 
Example format: + +--- +**[Collection Name]** +- **Summary**: [shortDescription] +- **Data types**: [dataModalityTags] +- **Patients**: [patientCount] | **Time frame**: [timeFrame] | **Geography**: [geographicCoverage] +- **Access**: [availability] | Free: [isFree] | Instant: [isInstantlyAccessible] +- **View in Workbench**: [workbenchUrl] +--- + +After presenting results, ask: + +> "Do any of these match what you're looking for? Would you like to refine the search — for example, filter by data type, study size, or access level?" + +If the user wants deeper detail on a specific collection: +- Use `underlayName` with `mcp__wb__underlay_list_entities` to explore the data schema +- Reference `usageExamples` for sample queries +- Reference `dataDictionary` for table/field documentation + +--- + +## Step 4 — Add to Workspace + +If the user wants to use a data collection: + +1. Provide the direct link to the collection: + > "You can view and request access to **[Collection Name]** here: [workbenchUrl]" + +2. Instruct them to click **"Add to Workspace"** or **"Get Access"** in the Workbench UI. The button label depends on whether the collection is instantly accessible or requires approval. + +3. If the collection is instantly accessible (`isInstantlyAccessible: true`), tell them: + > "This collection is instantly accessible — once you click 'Add to Workspace', the resources will be available in your workspace immediately." + +4. If it requires approval (`isInstantlyAccessible: false`): + > "This collection requires access approval. After you submit the request at [workbenchUrl], access is typically granted after review." + +5. After the user confirms they've added the collection, use `workspace_list_data_collections` to confirm the resources are now visible in their workspace. 
+ +--- + +## Notes + +- `workspace_list_data_collections` only shows collections already attached to the active workspace +- `platform_list_data_collections` searches platform-wide but requires the user to have at least READ access to the collection workspace +DATA_DISCOVERY_EOF # Create CUSTOM_APP.md skill (full version, embedded) log_info "Creating CUSTOM_APP.md skill..." From d0c2ba9ad8fdbfd54320a8d9833e9be794ff3bfc Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Tue, 12 May 2026 10:39:10 -0400 Subject: [PATCH 71/86] feat: add relevance ranking to DATA_DISCOVERY skill and strengthen trigger - Step 3 now ranks each result 1-5 with a one-sentence justification, sorted highest first; all score labels are positively framed - CLAUDE.md trigger updated to ALWAYS read skill before calling platform_list_data_collections, with broader phrase coverage - Skill header reinforces that the MCP tool should not be called directly Co-authored-by: Cursor --- features/src/llm-context/generate-context.sh | 37 +++++++++++++------ .../src/llm-context/skills/DATA_DISCOVERY.md | 24 +++++++++--- 2 files changed, 44 insertions(+), 17 deletions(-) diff --git a/features/src/llm-context/generate-context.sh b/features/src/llm-context/generate-context.sh index 2b3b4d0d2..b955331a6 100755 --- a/features/src/llm-context/generate-context.sh +++ b/features/src/llm-context/generate-context.sh @@ -119,9 +119,9 @@ install_skills() { ## When to Use This Skill -**Only read this skill when the user is explicitly searching for data collections they do not yet have in their workspace — across all of Workbench.** +**Always read this skill before calling `platform_list_data_collections`.** This skill controls the full discovery flow — do not call the MCP tool directly without following these steps first. -Do NOT read this skill if the user is asking about data already in their workspace. In that case, call `workspace_list_data_collections` or `workspace_list_resources` directly. 
+Do NOT read this skill if the user is asking about data already in their workspace. In that case, call `workspace_list_data_collections` directly. **Read this skill ONLY when the user says something like:** - "Search all data collections I have access to" @@ -222,12 +222,24 @@ For each result, the tool returns the following fields — use ALL of them when --- -## Step 3 — Present Results and Offer to Refine +## Step 3 — Rank, Present Results, and Offer to Refine -Present matching collections in a clear summary. For each result, highlight the fields most relevant to the user's query. Example format: +For every result returned, assign a **relevance score from 1–5** based on how well the collection's metadata matches the user's query. Use ALL available metadata fields when scoring — name, description, shortDescription, dataModalityTags, therapeuticTags, dataModel, usageExamples, dataDictionary, patientCount, geographicCoverage. + +**Scoring guide:** +| Score | Meaning | +|---|---| +| ⭐⭐⭐⭐⭐ 5 | Exact match — directly contains the data type, gene, disease, or topic the user asked about | +| ⭐⭐⭐⭐ 4 | Strong match — highly relevant to the query and covers the right domain or modality | +| ⭐⭐⭐ 3 | Good match — related to the query's domain; may not be specific to the exact topic but offers valuable context | +| ⭐⭐ 2 | Potential match — shares topical overlap with the query and is worth exploring further | +| ⭐ 1 | Broad match — loosely connected to the query; included for completeness and may surface unexpected value | + +Present results **sorted by score (highest first)**. For each result, include a one-sentence justification for the score that explains concretely why it ranked that way. Example format: --- -**[Collection Name]** +**[Collection Name]** — ⭐⭐⭐⭐⭐ 5/5 +- **Why**: [One concrete sentence explaining what in the metadata drove this score — e.g. 
"Contains whole-genome sequencing data with BRCA1/BRCA2 variant calls across 10,000 patients."] - **Summary**: [shortDescription] - **Data types**: [dataModalityTags] - **Patients**: [patientCount] | **Time frame**: [timeFrame] | **Geography**: [geographicCoverage] @@ -237,7 +249,7 @@ Present matching collections in a clear summary. For each result, highlight the After presenting results, ask: -> "Do any of these match what you're looking for? Would you like to refine the search — for example, filter by data type, study size, or access level?" +> "Do any of these look useful? Would you like to refine the search or explore a specific collection in more detail?" If the user wants deeper detail on a specific collection: - Use `underlayName` with `mcp__wb__underlay_list_entities` to explore the data schema @@ -2877,11 +2889,14 @@ Read these directly — no index needed: ### ⚡ Skill Trigger Guide -**Read \`DATA_DISCOVERY.md\` ONLY when the user is searching for data collections they don't yet have, platform-wide:** -- "search all data collections I have access to" / "find data collections across Workbench" -- "what data collections can I add to my workspace?" / "data collections I haven't added yet" -- "find a data collection related to [topic / disease / modality]" -- "search across all Workbench data collections" / "what data collections are available on the platform?" +**ALWAYS read \`DATA_DISCOVERY.md\` BEFORE calling \`platform_list_data_collections\`.** The skill controls the full discovery flow including scope clarification, result presentation, and how to add a collection to the workspace. + +Trigger \`DATA_DISCOVERY.md\` whenever the user is searching for data collections platform-wide: +- "find data collections" / "search for data collections" / "find data collections with [keyword]" +- "find data collections across Workbench" / "search all data collections I have access to" +- "what data collections can I add?" 
/ "data collections I haven't added yet" +- "find a data collection related to [topic / disease / gene / modality]" +- "are there data collections about [topic]?" / "find data collections that have [keyword]" - Do NOT use this skill for workspace-scoped questions — call \`workspace_list_data_collections\` directly instead **ALWAYS read \`DASHBOARD_BUILDER.md\` FIRST when user says ANY of these:** diff --git a/features/src/llm-context/skills/DATA_DISCOVERY.md b/features/src/llm-context/skills/DATA_DISCOVERY.md index d60fecea6..aae6f1cbe 100644 --- a/features/src/llm-context/skills/DATA_DISCOVERY.md +++ b/features/src/llm-context/skills/DATA_DISCOVERY.md @@ -4,9 +4,9 @@ ## When to Use This Skill -**Only read this skill when the user is explicitly searching for data collections they do not yet have in their workspace — across all of Workbench.** +**Always read this skill before calling `platform_list_data_collections`.** This skill controls the full discovery flow — do not call the MCP tool directly without following these steps first. -Do NOT read this skill if the user is asking about data already in their workspace. In that case, call `workspace_list_data_collections` or `workspace_list_resources` directly. +Do NOT read this skill if the user is asking about data already in their workspace. In that case, call `workspace_list_data_collections` directly. **Read this skill ONLY when the user says something like:** - "Search all data collections I have access to" @@ -107,12 +107,24 @@ For each result, the tool returns the following fields — use ALL of them when --- -## Step 3 — Present Results and Offer to Refine +## Step 3 — Rank, Present Results, and Offer to Refine -Present matching collections in a clear summary. For each result, highlight the fields most relevant to the user's query. Example format: +For every result returned, assign a **relevance score from 1–5** based on how well the collection's metadata matches the user's query. 
Use ALL available metadata fields when scoring — name, description, shortDescription, dataModalityTags, therapeuticTags, dataModel, usageExamples, dataDictionary, patientCount, geographicCoverage. + +**Scoring guide:** +| Score | Meaning | +|---|---| +| ⭐⭐⭐⭐⭐ 5 | Exact match — directly contains the data type, gene, disease, or topic the user asked about | +| ⭐⭐⭐⭐ 4 | Strong match — highly relevant to the query and covers the right domain or modality | +| ⭐⭐⭐ 3 | Good match — related to the query's domain; may not be specific to the exact topic but offers valuable context | +| ⭐⭐ 2 | Potential match — shares topical overlap with the query and is worth exploring further | +| ⭐ 1 | Broad match — loosely connected to the query; included for completeness and may surface unexpected value | + +Present results **sorted by score (highest first)**. For each result, include a one-sentence justification for the score that explains concretely why it ranked that way. Example format: --- -**[Collection Name]** +**[Collection Name]** — ⭐⭐⭐⭐⭐ 5/5 +- **Why**: [One concrete sentence explaining what in the metadata drove this score — e.g. "Contains whole-genome sequencing data with BRCA1/BRCA2 variant calls across 10,000 patients."] - **Summary**: [shortDescription] - **Data types**: [dataModalityTags] - **Patients**: [patientCount] | **Time frame**: [timeFrame] | **Geography**: [geographicCoverage] @@ -122,7 +134,7 @@ Present matching collections in a clear summary. For each result, highlight the After presenting results, ask: -> "Do any of these match what you're looking for? Would you like to refine the search — for example, filter by data type, study size, or access level?" +> "Do any of these look useful? Would you like to refine the search or explore a specific collection in more detail?" 
If the user wants deeper detail on a specific collection: - Use `underlayName` with `mcp__wb__underlay_list_entities` to explore the data schema From 5f9e6191ff28c80ca2af01025c223a0732cdab76 Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Wed, 13 May 2026 11:56:36 -0400 Subject: [PATCH 72/86] fix(mcp): replace N+1 API calls with single batch lookup in workspace_list_data_collections The tool was making one sequential API call per data collection to resolve its display name, causing timeouts on workspaces with 5+ collections. Fix: one batch POST to /api/workspaces/v2/filtered builds a uuid->name map upfront. Resources are then grouped by display name in memory. Falls back to UUID as group key if the batch call fails. Co-authored-by: Cursor --- features/src/wb-mcp-server/main.go | 85 +++++++++++++----------------- 1 file changed, 37 insertions(+), 48 deletions(-) diff --git a/features/src/wb-mcp-server/main.go b/features/src/wb-mcp-server/main.go index b9c5fcb64..2eb712d88 100644 --- a/features/src/wb-mcp-server/main.go +++ b/features/src/wb-mcp-server/main.go @@ -2726,54 +2726,40 @@ func handleCallTool(params CallToolParams) CallToolResult { resourcesList = []interface{}{} } - // Extract sourceWorkspaceIds from resourceLineage (which is an ARRAY inside metadata) - sourceWorkspaceIds := make(map[string]bool) - for _, r := range resourcesList { - resource, ok := r.(map[string]interface{}) - if !ok { - continue - } - metadata, ok := resource["metadata"].(map[string]interface{}) - if !ok { - continue - } - // resourceLineage is an array inside metadata - if lineageArray, ok := metadata["resourceLineage"].([]interface{}); ok && len(lineageArray) > 0 { - if firstLineage, ok := lineageArray[0].(map[string]interface{}); ok { - if sourceId, ok := firstLineage["sourceWorkspaceId"].(string); ok && sourceId != "" { - sourceWorkspaceIds[sourceId] = true - } - } - } + // Build a UUID → display name map with a single batch API call. 
+ // This avoids the N+1 sequential lookups (one per collection) that caused timeouts. + collectionNames := make(map[string]string) // uuid → display name + batchBody := map[string]interface{}{ + "limit": 1000, + "offset": 0, + "properties": []map[string]string{ + {"key": "terra-type", "value": "data-collection"}, + }, } - - // Look up each source workspace to get the data collection name - dataCollectionNames := make(map[string]string) // sourceWorkspaceId -> display name - for sourceId := range sourceWorkspaceIds { - // Use API to get workspace details - wsUrl := fmt.Sprintf("%s/api/workspaces/v1/%s", workspaceBaseURL, sourceId) - wsResp, wsErr := makeAPIRequest("GET", wsUrl, nil) - if wsErr == nil { - var wsInfo map[string]interface{} - if json.Unmarshal(wsResp, &wsInfo) == nil { - // Try to get display name, fall back to id - if displayName, ok := wsInfo["displayName"].(string); ok && displayName != "" { - dataCollectionNames[sourceId] = displayName - } else if userFacingId, ok := wsInfo["userFacingId"].(string); ok && userFacingId != "" { - dataCollectionNames[sourceId] = userFacingId - } else { - dataCollectionNames[sourceId] = sourceId + if batchResp, batchErr := makeAPIRequest("POST", workspaceBaseURL+"/api/workspaces/v2/filtered", batchBody); batchErr == nil { + var batchData map[string]interface{} + if json.Unmarshal(batchResp, &batchData) == nil { + if wsList, ok := batchData["workspaces"].([]interface{}); ok { + for _, w := range wsList { + ws, ok := w.(map[string]interface{}) + if !ok { + continue + } + uuid, _ := ws["id"].(string) + displayName, _ := ws["displayName"].(string) + if displayName == "" { + displayName, _ = ws["userFacingId"].(string) + } + if uuid != "" && displayName != "" { + collectionNames[uuid] = displayName + } } - } else { - dataCollectionNames[sourceId] = sourceId } - } else { - // If we can't access the source workspace, use the ID - dataCollectionNames[sourceId] = sourceId + " (inaccessible)" } } + // Fall back gracefully: if 
the batch call fails, groups will be keyed by UUID - // Group resources by data collection (using resourceLineage array inside metadata) + // Group resources by data collection, using display name where available dataCollections := make(map[string]map[string]interface{}) localResources := []map[string]interface{}{} @@ -2819,17 +2805,20 @@ func handleCallTool(params CallToolParams) CallToolResult { } } - // Group by data collection or mark as local + // Group by display name (falling back to UUID if name not resolved) if sourceId != "" { - collectionName := dataCollectionNames[sourceId] - if dataCollections[collectionName] == nil { - dataCollections[collectionName] = map[string]interface{}{ + groupKey := collectionNames[sourceId] + if groupKey == "" { + groupKey = sourceId + } + if dataCollections[groupKey] == nil { + dataCollections[groupKey] = map[string]interface{}{ "sourceWorkspaceId": sourceId, "resources": []map[string]interface{}{}, } } - resList := dataCollections[collectionName]["resources"].([]map[string]interface{}) - dataCollections[collectionName]["resources"] = append(resList, resourceInfo) + resList := dataCollections[groupKey]["resources"].([]map[string]interface{}) + dataCollections[groupKey]["resources"] = append(resList, resourceInfo) } else { localResources = append(localResources, resourceInfo) } From ff1a300d88979403ed72d77e0f0a3d5f6312908b Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Wed, 13 May 2026 21:25:08 -0400 Subject: [PATCH 73/86] fix(mcp): cache workspace UUID at startup to eliminate runtime resolveWorkspaceId calls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit workspace_list_data_collections was calling wb status (CLI subprocess) and resolveWorkspaceId (fetches all 5000 workspaces) on every invocation, adding 2-3 expensive operations before any real work. Fix: cache the workspace UUID once in initializeConfig() from the wb status output already fetched at startup. 
workspace_list_data_collections now starts directly with the resources API call — 2 total calls instead of 4+. Co-authored-by: Cursor --- features/src/wb-mcp-server/main.go | 67 +++++++++++------------------- 1 file changed, 25 insertions(+), 42 deletions(-) diff --git a/features/src/wb-mcp-server/main.go b/features/src/wb-mcp-server/main.go index 2eb712d88..00763a420 100644 --- a/features/src/wb-mcp-server/main.go +++ b/features/src/wb-mcp-server/main.go @@ -90,9 +90,10 @@ type ContentItem struct { // Global variables var ( - workspaceBaseURL string - dataExplorerURL string - httpClient = &http.Client{Timeout: 60 * time.Second} + workspaceBaseURL string + dataExplorerURL string + cachedWorkspaceUUID string // populated once at startup from wb status + httpClient = &http.Client{Timeout: 60 * time.Second} ) // Tool definitions @@ -1584,16 +1585,23 @@ func initializeConfig() error { var status map[string]interface{} if err := json.Unmarshal(output, &status); err != nil { fmt.Fprintf(os.Stderr, "Warning: failed to parse wb status JSON, using default URLs: %v\n", err) - } else if server, ok := status["server"].(map[string]interface{}); ok { - // Get workspaceManagerUri from wb status output - if wsURL, ok := server["workspaceManagerUri"].(string); ok && wsURL != "" { - workspaceBaseURL = wsURL - // Derive dataExplorerUri from workspaceManagerUri - // Pattern: replace /api/wsm with /api/de - dataExplorerURL = strings.Replace(wsURL, "/api/wsm", "/api/de", 1) - } } else { - fmt.Fprintf(os.Stderr, "Warning: server info not found in wb status, using default URLs\n") + // Extract server URLs + if server, ok := status["server"].(map[string]interface{}); ok { + if wsURL, ok := server["workspaceManagerUri"].(string); ok && wsURL != "" { + workspaceBaseURL = wsURL + dataExplorerURL = strings.Replace(wsURL, "/api/wsm", "/api/de", 1) + } + } else { + fmt.Fprintf(os.Stderr, "Warning: server info not found in wb status, using default URLs\n") + } + // Cache the current workspace 
UUID to avoid resolveWorkspaceId calls at runtime + if ws, ok := status["workspace"].(map[string]interface{}); ok { + if uuid, ok := ws["id"].(string); ok && uuid != "" { + cachedWorkspaceUUID = uuid + fmt.Fprintf(os.Stderr, "Cached workspace UUID: %s\n", uuid) + } + } } } @@ -2674,38 +2682,13 @@ func handleCallTool(params CallToolParams) CallToolResult { output, err = executeWbCommand([]string{"folder", "tree"}) case "workspace_list_data_collections": - // Get current workspace from wb status - statusOutput, statusErr := executeWbCommand([]string{"status", "--format=json"}) - if statusErr != nil { - err = fmt.Errorf("failed to get workspace status: %w", statusErr) - break - } - var statusData map[string]interface{} - if jsonErr := json.Unmarshal([]byte(statusOutput), &statusData); jsonErr != nil { - err = fmt.Errorf("failed to parse status: %w", jsonErr) - break - } - workspace, ok := statusData["workspace"].(map[string]interface{}) - if !ok { - err = fmt.Errorf("no workspace set - run 'wb workspace set ' first") - break - } - // Get either userFacingId or id from the workspace status - workspaceId := "" - if ufid, ok := workspace["userFacingId"].(string); ok && ufid != "" { - workspaceId = ufid - } else if id, ok := workspace["id"].(string); ok { - workspaceId = id - } else { - err = fmt.Errorf("could not get workspace ID from status") - break - } - // Resolve to UUID using the same method as other working tools - workspaceUuid, resolveErr := resolveWorkspaceId(workspaceId) - if resolveErr != nil { - err = fmt.Errorf("could not resolve workspace ID: %w", resolveErr) + // Use the workspace UUID cached at startup — avoids calling wb status and + // resolveWorkspaceId (which fetches all 5000 workspaces) on every tool call. 
+ if cachedWorkspaceUUID == "" { + err = fmt.Errorf("workspace UUID not available — MCP server may not have a workspace set at startup") break } + workspaceUuid := cachedWorkspaceUUID // List all resources (same API call as workspace_list_resources which works) resourcesUrl := fmt.Sprintf("%s/api/workspaces/v1/%s/resources?offset=0&limit=1000", workspaceBaseURL, workspaceUuid) From b64d5c02522913a24ab104e37eab5e76971dd40d Mon Sep 17 00:00:00 2001 From: David Shen Date: Wed, 13 May 2026 14:27:03 -0400 Subject: [PATCH 74/86] Update devcontainer to match jupyter app --- .../.devcontainer.json | 29 ++++++++++--------- src/workbench-jupyter-with-llm/Dockerfile | 2 +- .../devcontainer-template.json | 6 ++-- 3 files changed, 19 insertions(+), 18 deletions(-) diff --git a/src/workbench-jupyter-with-llm/.devcontainer.json b/src/workbench-jupyter-with-llm/.devcontainer.json index 2aafedcb3..66e653a28 100644 --- a/src/workbench-jupyter-with-llm/.devcontainer.json +++ b/src/workbench-jupyter-with-llm/.devcontainer.json @@ -1,33 +1,34 @@ { - "name": "Navid Workbench Jupyter", + "name": "Workbench Jupyter with LLM tools", "dockerComposeFile": ["docker-compose.yaml", "../jupyter-common/jupyter-common-compose.yaml"], "service": "app", "runServices": ["app"], "shutdownAction": "none", "workspaceFolder": "/workspace", "postCreateCommand": [ - "bash", - "-c", - "./startupscript/post-startup.sh jupyter /home/jupyter \"${templateOption:cloud}\" \"${templateOption:login}\" && echo '' >> /home/jupyter/.bashrc && echo '# Unset GOOGLE_CLOUD_PROJECT for Gemini CLI compatibility' >> /home/jupyter/.bashrc && echo 'unset GOOGLE_CLOUD_PROJECT' >> /home/jupyter/.bashrc" + "./startupscript/post-startup.sh", + "jupyter", + "/home/jupyter", + "${templateOption:cloud}", + "${templateOption:login}" ], // re-mount bucket files on container start up, then generate LLM context - "postStartCommand": [ - "bash", - "-c", - "./startupscript/remount-on-restart.sh jupyter /home/jupyter 
\"${templateOption:cloud}\" \"${templateOption:login}\"; /opt/llm-context/generate-context.sh /home/jupyter || true" - ], + "postStartCommand": "./startupscript/remount-on-restart.sh jupyter /home/jupyter \"${templateOption:cloud}\" \"${templateOption:login}\"; /opt/llm-context/generate-context.sh /home/jupyter || true", "features": { + "ghcr.io/devcontainers/features/node@sha256:8c0de46939b61958041700ee89e3493f3b2e4131a06dc46b4d9423427d06e5f6": { + "version": "24.11.0" + }, + "ghcr.io/anthropics/devcontainer-features/claude-code@sha256:cfc2e7d3e9fd3b9b01f8d5cb158508a884c8c0ede2e23ed10f32dea5d4ffe69a": {}, + "./.devcontainer/features/gemini-cli": { "username": "jupyter" }, "./.devcontainer/features/workbench-tools": { + "libEnv": "/opt/conda/envs/jupyter", // Use the jupyter conda environment "cloud": "${templateOption:cloud}", "username": "jupyter", "userHomeDir": "/home/jupyter" }, - "./.devcontainer/features/gemini": { - "version": "latest", - "username": "jupyter", - "userHomeDir": "/home/jupyter" + "./.devcontainer/features/postgres-client": { + "version": "16" }, - "ghcr.io/anthropics/devcontainer-features/claude-code:1.0": {}, "./.devcontainer/features/wb-mcp-server": { "username": "jupyter", "userHomeDir": "/home/jupyter" diff --git a/src/workbench-jupyter-with-llm/Dockerfile b/src/workbench-jupyter-with-llm/Dockerfile index bc736fac9..3db1ef60b 100644 --- a/src/workbench-jupyter-with-llm/Dockerfile +++ b/src/workbench-jupyter-with-llm/Dockerfile @@ -1,4 +1,4 @@ -FROM us-west2-docker.pkg.dev/shared-pub-buckets-94mvrf/workbench-artifacts/app-workbench-jupyter@sha256:8261521e5433b6997c4b323c4b391b02ea3fc3f059e33ccedc36af2554ef70f9 +FROM us-west2-docker.pkg.dev/shared-pub-buckets-94mvrf/workbench-artifacts/app-workbench-jupyter@sha256:62089d6cef2015e08361928c6bb6ae003afd0800a3e682a536171b3bcb0765b1 # Install jupyter extensions RUN --mount=type=bind,from=jupyter-extension-builder,source=/dist,target=/tmp/extensions \ diff --git 
a/src/workbench-jupyter-with-llm/devcontainer-template.json b/src/workbench-jupyter-with-llm/devcontainer-template.json index cfe1e7864..02ee914fe 100644 --- a/src/workbench-jupyter-with-llm/devcontainer-template.json +++ b/src/workbench-jupyter-with-llm/devcontainer-template.json @@ -1,9 +1,9 @@ { - "id": "navid-workbench-jupyter", + "id": "workbench-jupyter-with-llm", "description": "Workbench JupyterLab with Gemini, Claude CLI, and MCP server integration", "version": "0.0.1", - "name": "Navid Workbench Jupyter", - "documentationURL": "https://github.com/verily-src/workbench-app-devcontainers/tree/master/src/navid-workbench-jupyter", + "name": "Workbench Jupyter with LLM tools", + "documentationURL": "https://github.com/verily-src/workbench-app-devcontainers/tree/master/src/workbench-jupyter-with-llm", "licenseURL": "https://github.com/verily-src/workbench-app-devcontainers/blob/master/LICENSE", "options": { "cloud": { From c8937c9edb031f1525c18d2feb65b742b0cb7c23 Mon Sep 17 00:00:00 2001 From: David Shen Date: Wed, 13 May 2026 14:36:31 -0400 Subject: [PATCH 75/86] shellcheck --- features/src/llm-context/generate-context.sh | 44 +++++++++++--------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/features/src/llm-context/generate-context.sh b/features/src/llm-context/generate-context.sh index b955331a6..4a5c87edb 100755 --- a/features/src/llm-context/generate-context.sh +++ b/features/src/llm-context/generate-context.sh @@ -1,4 +1,5 @@ #!/bin/bash +# shellcheck disable=SC2016 # Single-quoted strings with $ and backticks are intentional template text # # Workbench LLM Context Generator # @@ -2353,8 +2354,10 @@ generate_bucket_list() { local cloud_platform="${2:-GCP}" if [ "$cloud_platform" = "AWS" ]; then - local buckets=$(echo "$resources" | jq '[.[] | select(.resourceType == "AWS_S3_STORAGE_FOLDER")]' 2>/dev/null || echo "[]") - local count=$(echo "$buckets" | jq 'length' 2>/dev/null || echo "0") + local buckets + buckets=$(echo "$resources" | jq 
'[.[] | select(.resourceType == "AWS_S3_STORAGE_FOLDER")]' 2>/dev/null || echo "[]") + local count + count=$(echo "$buckets" | jq 'length' 2>/dev/null || echo "0") if [ "$count" -eq 0 ] || [ "$count" = "0" ]; then echo "*No S3 buckets in this workspace.* Create one with:" @@ -2369,8 +2372,10 @@ generate_bucket_list() { echo "$buckets" | jq -r '.[] | "| `s3://\(.bucketName // "unknown")/\(.prefix // "")` | `\(.id // "—")` | \(.description // "—" | if . == "" then "—" else . end) |"' 2>/dev/null || true else # GCP - local buckets=$(echo "$resources" | jq '[.[] | select(.resourceType == "GCS_BUCKET")]' 2>/dev/null || echo "[]") - local count=$(echo "$buckets" | jq 'length' 2>/dev/null || echo "0") + local buckets + buckets=$(echo "$resources" | jq '[.[] | select(.resourceType == "GCS_BUCKET")]' 2>/dev/null || echo "[]") + local count + count=$(echo "$buckets" | jq 'length' 2>/dev/null || echo "0") if [ "$count" -eq 0 ] || [ "$count" = "0" ]; then echo "*No GCS buckets in this workspace.* Create one with:" @@ -2392,20 +2397,20 @@ generate_claude_md() { local workspace="$1" local resources="$2" - local workflows="$3" - local apps="$4" - + # $3 (workflows) and $4 (apps) reserved for future use + # Extract workspace values - field names match UFWorkspaceLight.java - local ws_name=$(echo "$workspace" | jq -r '.name // "Unnamed Workspace"') - local ws_id=$(echo "$workspace" | jq -r '.id // "unknown"') - local ws_desc=$(echo "$workspace" | jq -r '.description // "No description"') - local ws_cloud=$(echo "$workspace" | jq -r '.cloudPlatform // "GCP"') - local ws_gcp_project=$(echo "$workspace" | jq -r '.googleProjectId // ""') - local ws_aws_account=$(echo "$workspace" | jq -r '.awsAccountId // ""') - local ws_role=$(echo "$workspace" | jq -r '.highestRole // "READER"') - local ws_user=$(echo "$workspace" | jq -r '.userEmail // "unknown"') - local ws_org=$(echo "$workspace" | jq -r '.orgId // ""') - local ws_server=$(echo "$workspace" | jq -r '.serverName // ""') + local 
ws_name ws_id ws_desc ws_cloud ws_gcp_project ws_aws_account ws_role ws_user ws_org ws_server + ws_name=$(echo "$workspace" | jq -r '.name // "Unnamed Workspace"') + ws_id=$(echo "$workspace" | jq -r '.id // "unknown"') + ws_desc=$(echo "$workspace" | jq -r '.description // "No description"') + ws_cloud=$(echo "$workspace" | jq -r '.cloudPlatform // "GCP"') + ws_gcp_project=$(echo "$workspace" | jq -r '.googleProjectId // ""') + ws_aws_account=$(echo "$workspace" | jq -r '.awsAccountId // ""') + ws_role=$(echo "$workspace" | jq -r '.highestRole // "READER"') + ws_user=$(echo "$workspace" | jq -r '.userEmail // "unknown"') + ws_org=$(echo "$workspace" | jq -r '.orgId // ""') + ws_server=$(echo "$workspace" | jq -r '.serverName // ""') # Determine project display local project_display="$ws_gcp_project" @@ -2596,8 +2601,9 @@ wb resource add-ref gcs-bucket --name external-data --bucket-name existing-bucke fi # Generate dynamic sections - local embedded_json=$(generate_embedded_json "$resources") - local bucket_list=$(generate_bucket_list "$resources" "$ws_cloud") + local embedded_json bucket_list + embedded_json=$(generate_embedded_json "$resources") + bucket_list=$(generate_bucket_list "$resources" "$ws_cloud") # Write the file cat > "${CLAUDE_FILE}" << EOF From 0047eda944f363602ac558440069e9c3898baa72 Mon Sep 17 00:00:00 2001 From: David Shen Date: Wed, 13 May 2026 14:39:14 -0400 Subject: [PATCH 76/86] Tests --- .github/workflows/test-pr.yaml | 9 +++++++++ tests/common/llm-tools.bats | 23 +++++++++++++++++++++++ tests/workbench-jupyter-with-llm.sh | 8 ++++++++ 3 files changed, 40 insertions(+) create mode 100644 tests/common/llm-tools.bats create mode 100755 tests/workbench-jupyter-with-llm.sh diff --git a/.github/workflows/test-pr.yaml b/.github/workflows/test-pr.yaml index 7790bcc65..72a9a7a89 100644 --- a/.github/workflows/test-pr.yaml +++ b/.github/workflows/test-pr.yaml @@ -75,6 +75,15 @@ jobs: - 'features/src/postgres-client/**' - 'src/aou-common/**' - 
'src/nemo_jupyter/**' + workbench-jupyter-with-llm: + maximize_build_space: true + filters: + - 'features/src/workbench-tools/**' + - 'features/src/postgres-client/**' + - 'features/src/llm-context/**' + - 'features/src/wb-mcp-server/**' + - 'features/src/gemini-cli/**' + - 'src/jupyter-common/**' workbench-jupyter-parabricks: maximize_build_space: true filters: diff --git a/tests/common/llm-tools.bats b/tests/common/llm-tools.bats new file mode 100644 index 000000000..7f170cd4b --- /dev/null +++ b/tests/common/llm-tools.bats @@ -0,0 +1,23 @@ +setup_file() { + echo "# Running ${BATS_TEST_FILENAME##*/}" >&3 +} + +setup() { + load common +} + +@test "node" { + run_in_container node --version +} + +@test "npm" { + run_in_container npm --version +} + +@test "claude" { + run_in_container claude --version +} + +@test "gemini" { + run_in_container gemini --version +} diff --git a/tests/workbench-jupyter-with-llm.sh b/tests/workbench-jupyter-with-llm.sh new file mode 100755 index 000000000..0e8d7e6b5 --- /dev/null +++ b/tests/workbench-jupyter-with-llm.sh @@ -0,0 +1,8 @@ +#!/bin/bash +set -o errexit +export TEST_USER="jupyter" + +bats tests/common/base.bats +bats tests/common/workbench-tools.bats +bats tests/common/postgres-client.bats +bats tests/common/llm-tools.bats From e191024d1a1a66c7437457a920966dd1188a6a7c Mon Sep 17 00:00:00 2001 From: David Shen Date: Wed, 13 May 2026 15:30:47 -0400 Subject: [PATCH 77/86] fix: harden MCP server, deduplicate skills, fix docs and install scripts - Add requireString/requireStrings helpers to MCP server to prevent panics from unchecked type assertions on missing tool arguments - Extract ~2,150 lines of embedded skill heredocs from generate-context.sh into standalone files copied at install time, establishing skills/ as the single source of truth - Create standalone AWS skill variants (WORKFLOW_TROUBLESHOOT, DASHBOARD_BUILDER) previously only available as heredocs - Merge heredoc-unique content into standalone skills (Quick Start in 
CUSTOM_APP, "Be Proactive" behavior in WORKFLOW_TROUBLESHOOT) - Replace personal repo reference in APP_TEMPLATES with org repo and add fork guidance - Detect architecture dynamically in wb-mcp-server install (amd64/arm64) - Bump Go from 1.21 to 1.25 to match rest of repo - Fix path references in llm-context README (~/.workbench -> ~/.claude) - Fix app README listing non-existent template options - Add .bashrc idempotency guards to both install scripts - Fix cp -r reinstall nesting issue in llm-context install Co-Authored-By: Claude Opus 4.6 (1M context) --- features/src/llm-context/README.md | 6 +- features/src/llm-context/generate-context.sh | 2205 +---------------- features/src/llm-context/install.sh | 35 +- .../src/llm-context/skills/APP_TEMPLATES.md | 41 +- features/src/llm-context/skills/CUSTOM_APP.md | 31 + .../skills/WORKFLOW_TROUBLESHOOT.md | 15 +- .../skills/aws/DASHBOARD_BUILDER.md | 393 +++ .../skills/aws/WORKFLOW_TROUBLESHOOT.md | 300 +++ features/src/wb-mcp-server/README.md | 2 +- features/src/wb-mcp-server/go.mod | 2 +- features/src/wb-mcp-server/install.sh | 29 +- features/src/wb-mcp-server/main.go | 380 ++- src/workbench-jupyter-with-llm/README.md | 2 - 13 files changed, 1100 insertions(+), 2341 deletions(-) create mode 100644 features/src/llm-context/skills/aws/DASHBOARD_BUILDER.md create mode 100644 features/src/llm-context/skills/aws/WORKFLOW_TROUBLESHOOT.md diff --git a/features/src/llm-context/README.md b/features/src/llm-context/README.md index 2c04a695b..84d4bac9c 100644 --- a/features/src/llm-context/README.md +++ b/features/src/llm-context/README.md @@ -37,7 +37,7 @@ When installed, this feature: 1. **Generates `~/CLAUDE.md`** - Claude Code auto-discovers this file on startup 2. **Provides workspace context** - Name, ID, role, resources, cloud paths -3. **Includes skill files** - Detailed guides (e.g., custom app creation) in `~/.workbench/skills/` +3. 
**Includes skill files** - Detailed guides (e.g., custom app creation) in `~/.claude/skills/` 4. **Sets up aliases** - `generate-llm-context`, `refresh-context` ## What's in `~/CLAUDE.md` @@ -116,9 +116,9 @@ head ~/CLAUDE.md |------|---------| | `/opt/llm-context/generate-context.sh` | Main generation script | | `/opt/llm-context/run-context-generator.sh` | Auto-run wrapper | -| `~/.workbench/CLAUDE.md` | Generated context (primary) | +| `~/.claude/CLAUDE.md` | Generated context (primary) | | `~/CLAUDE.md` | Symlink for auto-discovery | -| `~/.workbench/skills/` | Skill files (e.g., CUSTOM_APP.md) | +| `~/.claude/skills/` | Skill files (e.g., CUSTOM_APP.md) | ## Notes diff --git a/features/src/llm-context/generate-context.sh b/features/src/llm-context/generate-context.sh index 4a5c87edb..d2b69619d 100755 --- a/features/src/llm-context/generate-context.sh +++ b/features/src/llm-context/generate-context.sh @@ -105,2194 +105,41 @@ setup_directories() { mkdir -p "${SKILLS_DIR}" } -# Install skill files (embedded - no network needed) +# Install skill files from /opt/llm-context/skills/ (copied at install time) # $1: cloud_platform — "GCP" (default) or "AWS" install_skills() { local cloud_platform="${1:-GCP}" + local source_skills="/opt/llm-context/skills" log_info "Installing skill files..." - - # Create DATA_DISCOVERY.md skill (full version, embedded) - log_info "Creating DATA_DISCOVERY.md skill..." - cat > "${SKILLS_DIR}/DATA_DISCOVERY.md" << 'DATA_DISCOVERY_EOF' -# Data Collection Discovery - ---- - -## When to Use This Skill - -**Always read this skill before calling `platform_list_data_collections`.** This skill controls the full discovery flow — do not call the MCP tool directly without following these steps first. - -Do NOT read this skill if the user is asking about data already in their workspace. In that case, call `workspace_list_data_collections` directly. 
- -**Read this skill ONLY when the user says something like:** -- "Search all data collections I have access to" -- "Find data collections across Workbench" -- "What data collections can I add to my workspace?" -- "Are there any data collections I haven't added yet?" -- "Find a data collection related to [topic / disease / modality]" -- "Search across all Workbench data collections for [keyword]" -- "What data collections are available on the platform?" -- "Browse all accessible data collections" - -**Listing data collections in my workspace** — do NOT read this skill, call `workspace_list_data_collections` directly: -- "What data collections are in my workspace?" -- "What data is attached to my workspace?" -- "List the data collections I have" -- "What datasets do I have in this workspace?" -- "Show me the data collections in my workspace" - ---- - -## Step 0 — Clarify the Search Scope - -**If the user's intent is ambiguous** (e.g., they said "find me data" without specifying where), ask: - -> "Would you like me to search only within your active workspace, or search across all data collections you have access to in Workbench (platform-wide)?" - -- **Workspace-only**: Call `workspace_list_data_collections` directly — no need to continue with this skill -- **Platform-wide**: Continue with Steps 1–4 below - -If the user clearly said "in my workspace" or asked about attached resources, skip this skill entirely and call `workspace_list_data_collections` directly. - ---- - -## Step 1 — Clarify Search Criteria - -Before searching, confirm what the user is looking for: - -- **Topic / disease area** (e.g., oncology, cardiovascular, diabetes, general health) -- **Data modality** (e.g., genomics, imaging, lab results, patient-reported outcomes, EHR/EHR-derived) -- **Population** (e.g., age range, geography, study size) -- **Access type** (free vs. controlled access, instantly accessible vs. 
requires approval) -- **Data model** (e.g., standard underlay like AoU, custom schema) - -If the user has already provided enough context, proceed directly to Step 2. - ---- - -## Step 2 — Search - -### Platform-wide search (primary) - -Use the MCP tool first: - -``` -mcp__wb__platform_list_data_collections(query="") -``` - -- Pass the user's topic, modality, or disease area as `query` -- The tool searches across: name, description, modality tags, therapeutic tags, data model -- If no `query` is provided, it returns all accessible data collections - -If the MCP tool is unavailable, fall back to: -```bash -wb workspace list --format=json | jq '[.[] | select(.properties[]? | select(.key=="terra-type" and .value=="data-collection"))]' -``` - -### Workspace-scoped search - -``` -mcp__wb__workspace_list_data_collections() -``` - -### Search across all returned metadata - -For each result, the tool returns the following fields — use ALL of them when evaluating relevance: - -| Field | What it tells you | -|---|---| -| `name` | Collection name | -| `shortDescription` | One-line summary | -| `description` | Full overview including provenance and methodology | -| `organization` | Who owns the data | -| `availability` | Public open access / Public controlled access / Private | -| `isFree` | Whether access is free | -| `isInstantlyAccessible` | Whether access is immediate or requires approval | -| `patientCount` | Study size | -| `timeFrame` | Date range of data collection | -| `geographicCoverage` | Countries / regions | -| `dataModel` | Schema type (e.g., standard underlay, Non-standard custom) | -| `dataModalityTags` | Types of data (imaging, lab-results, ecrf, genomics, etc.) | -| `therapeuticTags` | Disease/health areas (oncology, general-health, etc.) 
| -| `underlayName` | Data model identifier — use with `underlay_list_entities` for schema exploration | -| `dataDictionary` | Links to schema documentation | -| `usageExamples` | Sample use cases and SQL queries | -| `accessGroupName` | Access group required | -| `supportEmail` | Who to contact | -| `workbenchUrl` | Direct link to the collection in the Workbench UI | - ---- - -## Step 3 — Rank, Present Results, and Offer to Refine - -For every result returned, assign a **relevance score from 1–5** based on how well the collection's metadata matches the user's query. Use ALL available metadata fields when scoring — name, description, shortDescription, dataModalityTags, therapeuticTags, dataModel, usageExamples, dataDictionary, patientCount, geographicCoverage. - -**Scoring guide:** -| Score | Meaning | -|---|---| -| ⭐⭐⭐⭐⭐ 5 | Exact match — directly contains the data type, gene, disease, or topic the user asked about | -| ⭐⭐⭐⭐ 4 | Strong match — highly relevant to the query and covers the right domain or modality | -| ⭐⭐⭐ 3 | Good match — related to the query's domain; may not be specific to the exact topic but offers valuable context | -| ⭐⭐ 2 | Potential match — shares topical overlap with the query and is worth exploring further | -| ⭐ 1 | Broad match — loosely connected to the query; included for completeness and may surface unexpected value | - -Present results **sorted by score (highest first)**. For each result, include a one-sentence justification for the score that explains concretely why it ranked that way. Example format: - ---- -**[Collection Name]** — ⭐⭐⭐⭐⭐ 5/5 -- **Why**: [One concrete sentence explaining what in the metadata drove this score — e.g. 
"Contains whole-genome sequencing data with BRCA1/BRCA2 variant calls across 10,000 patients."] -- **Summary**: [shortDescription] -- **Data types**: [dataModalityTags] -- **Patients**: [patientCount] | **Time frame**: [timeFrame] | **Geography**: [geographicCoverage] -- **Access**: [availability] | Free: [isFree] | Instant: [isInstantlyAccessible] -- **View in Workbench**: [workbenchUrl] ---- - -After presenting results, ask: - -> "Do any of these look useful? Would you like to refine the search or explore a specific collection in more detail?" - -If the user wants deeper detail on a specific collection: -- Use `underlayName` with `mcp__wb__underlay_list_entities` to explore the data schema -- Reference `usageExamples` for sample queries -- Reference `dataDictionary` for table/field documentation - ---- - -## Step 4 — Add to Workspace - -If the user wants to use a data collection: - -1. Provide the direct link to the collection: - > "You can view and request access to **[Collection Name]** here: [workbenchUrl]" - -2. Instruct them to click **"Add to Workspace"** or **"Get Access"** in the Workbench UI. The button label depends on whether the collection is instantly accessible or requires approval. - -3. If the collection is instantly accessible (`isInstantlyAccessible: true`), tell them: - > "This collection is instantly accessible — once you click 'Add to Workspace', the resources will be available in your workspace immediately." - -4. If it requires approval (`isInstantlyAccessible: false`): - > "This collection requires access approval. After you submit the request at [workbenchUrl], access is typically granted after review." - -5. After the user confirms they've added the collection, use `workspace_list_data_collections` to confirm the resources are now visible in their workspace. 
- ---- - -## Notes - -- `workspace_list_data_collections` only shows collections already attached to the active workspace -- `platform_list_data_collections` searches platform-wide but requires the user to have at least READ access to the collection workspace -DATA_DISCOVERY_EOF - - # Create CUSTOM_APP.md skill (full version, embedded) - log_info "Creating CUSTOM_APP.md skill..." - cat > "${SKILLS_DIR}/CUSTOM_APP.md" << 'SKILL_EOF' -# Creating Custom Workbench Apps - -**Practical guide for creating simple, reliable Workbench apps.** - -> **Official Reference:** https://github.com/verily-src/workbench-app-devcontainers -> -> **Quick Start Script:** Use \`./scripts/create-custom-app.sh\` for auto-generated app structure! - ---- - -## 🚀 Quick Start (Recommended) - -The official repo has a script that generates a complete app structure: - -\`\`\`bash -# Clone the official repo -git clone https://github.com/verily-src/workbench-app-devcontainers.git -cd workbench-app-devcontainers - -# Run the quick start script -./scripts/create-custom-app.sh my-app quay.io/jupyter/base-notebook 8888 jovyan /home/jovyan -\`\`\` - -This generates all required files in \`src/my-app/\` with correct structure. - ---- - -## ⚠️ Critical Requirements - -### 1. File Structure (MUST follow this exactly) - -\`\`\` -your-repo/ -├── .devcontainer.json ← MUST be at repo ROOT (not in a folder!) -├── docker-compose.yaml -├── Dockerfile -├── devcontainer-template.json -└── app/ - └── your_app.py -\`\`\` - -**⚠️ CRITICAL:** Workbench expects \`.devcontainer.json\` at the **repo ROOT**, NOT inside a \`.devcontainer/\` folder! - -### 2. Container Requirements - -Workbench custom apps need exactly **three things**: -1. Container named \`application-server\` -2. Connected to \`app-network\` (external Docker network) -3. 
HTTP server on a port - ---- - -## The Working Pattern (Copy This) - -### File 1: \`.devcontainer.json\` - -**Location:** Repo ROOT (same level as docker-compose.yaml) - -\`\`\`json -{ - "name": "Your App Name", - "dockerComposeFile": "docker-compose.yaml", - "service": "app", - "shutdownAction": "none", - "workspaceFolder": "/app", - "remoteUser": "root" -} -\`\`\` - -**⚠️ CRITICAL settings:** -- \`"dockerComposeFile": "docker-compose.yaml"\` - Same directory (both at root) -- \`"workspaceFolder": "/app"\` - Should match WORKDIR in Dockerfile -- File MUST be named \`.devcontainer.json\` at repo root - -### File 2: \`docker-compose.yaml\` - -**Location:** Repository root - -\`\`\`yaml -services: - app: - container_name: "application-server" - build: - context: . - dockerfile: Dockerfile - restart: always - ports: - - "8080:8080" - volumes: - - .:/app:cached - networks: - - app-network - -networks: - app-network: - external: true -\`\`\` - -**⚠️ CRITICAL settings:** -- \`container_name: "application-server"\` - Workbench looks for this exact name -- \`networks: app-network\` with \`external: true\` - Required for Workbench connectivity -- \`volumes: - .:/app:cached\` - Mounts code for live updates - -### File 3: \`Dockerfile\` - -\`\`\`dockerfile -FROM python:3.11-slim - -WORKDIR /app - -COPY requirements.txt . -RUN pip install --no-cache-dir -r requirements.txt - -COPY . . - -EXPOSE 8080 - -# CRITICAL: Must bind to 0.0.0.0 for Workbench proxy -CMD ["python", "app.py"] -\`\`\` - -### File 4: \`devcontainer-template.json\` - -\`\`\`json -{ - "id": "your-app-name", - "description": "Your app description", - "version": "1.0.0", - "name": "Your App Name", - "options": {}, - "platforms": ["Any"] -} -\`\`\` - ---- - -## Common Mistakes Checklist - -Before deploying, verify: - -- [ ] \`.devcontainer.json\` is at repo ROOT (NOT in a folder!) 
-- [ ] \`dockerComposeFile\` is \`"docker-compose.yaml"\` (same directory) -- [ ] \`container_name\` is exactly \`"application-server"\` -- [ ] Network is \`app-network\` with \`external: true\` -- [ ] Flask/server binds to \`0.0.0.0\` (not \`localhost\`) -- [ ] Volume mount included for code updates - ---- - -## ⚠️ Workbench App URLs (CRITICAL) - -**When accessing your app, you MUST use this format:** - -\`\`\` -https://workbench.verily.com/app/[APP_UUID]/proxy/[PORT]/[PATH] -\`\`\` - -### Get App UUID: -\`\`\`bash -wb app list --format=json | jq -r '.[] | select(.status == "RUNNING") | .id' | head -1 -\`\`\` - -### ❌ WRONG Formats (Will fail) -\`\`\` -https://abc123-def456.workbench-app.verily.com/ ← WRONG -http://localhost:8080/ ← WRONG -\`\`\` - ---- - -## Flask App Example - -\`\`\`python -from flask import Flask -from flask_cors import CORS - -app = Flask(__name__) -CORS(app) - -@app.route('/') -def index(): - return '

Hello Workbench!

' - -if __name__ == '__main__': - # CRITICAL: host='0.0.0.0' required for Workbench proxy - app.run(host='0.0.0.0', port=8080, debug=False, threaded=True) -\`\`\` - ---- - -## Common Errors and Fixes - -| Error | Cause | Fix | -|-------|-------|-----| -| App fails to create / No container | \`devcontainer.json\` in wrong location | Move to repo ROOT as \`.devcontainer.json\` | -| App fails to create | \`devcontainer.json\` in \`.devcontainer/\` folder | Workbench needs it at ROOT, not in folder | -| "Bad Request" error | Wrong URL format | Use \`workbench.verily.com/app/UUID/proxy/PORT/\` | -| Server not accessible | Bound to \`localhost\` | Change to \`host='0.0.0.0'\` | -| Container restart loop | Process exits immediately | Ensure server runs continuously | - ---- - -## Deployment - -In Workbench UI, create custom app with: -- **Repository:** \`https://github.com/YOUR-ORG/YOUR-REPO.git\` -- **Branch:** \`main\` -- **Folder:** \`.\` (root) or \`src/YOUR-APP-NAME\` if in monorepo - ---- - -## Local Testing - -\`\`\`bash -# Create required network -docker network create app-network - -# Build and run -docker compose build -docker compose up - -# Access at http://localhost:8080 -\`\`\` - ---- - -## Reference Implementations - -All examples: https://github.com/verily-src/workbench-app-devcontainers/tree/master/src - -| App | Description | Port | -|-----|-------------|------| -| \`playground/\` | Simple multi-service example | 8080 | -| \`vscode/\` | VS Code Server | 8443 | -| \`r-analysis/\` | RStudio | 8787 | -| \`workbench-jupyter/\` | JupyterLab with tools | 8888 | - ---- - -## When to Use Features - -Sometimes you need the full-featured approach: - -| Need | Solution | -|------|----------| -| Workbench CLI (\`wb\`) | Use \`workbench-tools\` feature | -| LLM/MCP integration | Use \`wb-mcp-server\` feature | -| Pre-authenticated gcloud | Use \`workbench-tools\` feature | - -**If you need these, use the full \`workbench-app-devcontainers\` repo as your base.** 
-SKILL_EOF - - # Create APP_TEMPLATES.md skill (full version, embedded) - log_info "Creating APP_TEMPLATES.md skill..." - cat > "${SKILLS_DIR}/APP_TEMPLATES.md" << 'TEMPLATES_SKILL_EOF' -# App Templates for Workbench - -**Pre-built, ready-to-deploy application templates with workspace resource integration.** - -> **When to use this:** User wants an app that visualizes data, serves an API, processes files, or creates dashboards using their workspace resources. - ---- - -## Available Templates - -| Template | Best For | Port | Key Features | -|----------|----------|------|--------------| -| **flask-api** | REST APIs, backend services, data processing | 8080 | JSON endpoints, file upload, BQ queries | -| **streamlit-dashboard** | Data visualization, interactive exploration | 8501 | Charts, file browser, BigQuery explorer | -| **rshiny-dashboard** | R statistical analysis, R-based visualizations | 3838 | Shiny UI, plotly, ggplot2, tidyverse | -| **file-processor** | File upload, validation, transformation | 8080 | Drag-drop UI, auto-save to GCS, schema validation | - ---- - -## Template Selection Guide - -### Quick Decision Matrix - -| User Says... | Recommend | -|--------------|-----------| -| "dashboard", "visualize", "charts", "explore data" | `streamlit-dashboard` | -| "API", "endpoint", "backend", "REST", "service" | `flask-api` | -| "R", "statistical", "ggplot", "tidyverse" | `rshiny-dashboard` | -| "upload", "process files", "validate", "CSV" | `file-processor` | -| "something custom", "from scratch" | → Use `CUSTOM_APP.md` skill | - ---- - -## How to Use a Template - -### Copy and Customize -1. Copy the template folder to user's repo -2. Modify application code in `app/` -3. Update `devcontainer-template.json` with new name/description -4. 
Push to GitHub and deploy - ---- - -## Template Summaries - -### flask-api (Port 8080) -- REST API with Flask -- Pre-built endpoints: `/health`, `/resources`, `/buckets//files`, `/bigquery/query` -- Easy to add custom endpoints - -### streamlit-dashboard (Port 8501) -- Interactive dashboard with tabs -- GCS file browser, BigQuery explorer, visualization -- Easy to add new tabs/charts - -### rshiny-dashboard (Port 3838) -- R-based Shiny dashboard -- Includes: shiny, shinydashboard, plotly, ggplot2, dplyr, tidyr -- bigrquery and googleCloudStorageR for data access - -### file-processor (Port 8080) -- Drag-drop file upload UI -- Processes CSV, JSON, Excel -- Auto-save to GCS buckets -- Schema validation - ---- - -## Workspace Resource Integration - -All templates auto-detect workspace resources via environment variables: - -```python -# Python -import os -bucket = os.environ.get("WORKBENCH_my_bucket") -``` - -```r -# R -bucket <- Sys.getenv("WORKBENCH_my_bucket") -``` - ---- - -## When Templates Don't Fit - -If no template matches: -1. Check if a template can be extended (usually yes) -2. If truly custom, read `~/.claude/skills/CUSTOM_APP.md` -TEMPLATES_SKILL_EOF - - # Create DASHBOARD_BUILDER.md skill (full version, embedded) - log_info "Creating DASHBOARD_BUILDER.md skill..." 
- cat > "${SKILLS_DIR}/DASHBOARD_BUILDER.md" << 'DASHBOARD_SKILL_EOF' -# Web Apps & Dashboards Skill - -**Build interactive web apps, dashboards, and visualizations that run on a port in Workbench.** - -> **Triggers:** -> - "Create a dashboard", "visualize data", "build charts" -> - "Run a Flask/Streamlit/FastAPI app" -> - "Display data in the browser", "interactive UI" -> - Any web app that serves content on a port - ---- - -## 🌐 Workbench Proxy & Web Apps Best Practices - -### Proxy URL Format - -The proxy URL is the **only valid way** to access web apps in Workbench: -\`\`\` -https://workbench.verily.com/app/[APP_UUID]/proxy/[PORT]/[PATH] -\`\`\` - -Retrieve the App UUID automatically: -\`\`\`bash -wb app list --format=json | jq -r '.[] | select(.status == "RUNNING") | .id' | head -1 -\`\`\` - -### ✅ Correct URL Examples -\`\`\` -https://workbench.verily.com/app/abc123-def456-789/proxy/8080/ -https://workbench.verily.com/app/abc123-def456-789/proxy/8501/index.html -https://workbench.verily.com/app/abc123-def456-789/proxy/8000/dashboard.html -\`\`\` - -### ❌ WRONG URL Formats (These WILL fail) -\`\`\` -https://abc123-def456.workbench-app.verily.com/ ← WRONG: "Bad Request" error -https://workbench-app.verily.com/abc123-def456/ ← WRONG: Invalid domain -http://localhost:8080/ ← WRONG: Not accessible externally -https://abc123-def456/workbench.verily.com/ ← WRONG: Reversed format -file:///home/jupyter/dashboard.html ← WRONG: JavaScript blocked -\`\`\` - -### ⚠️ Common Issue: JavaScript API Calls Failing - -**Problem:** JavaScript using absolute paths fails through Workbench proxy - -**Symptoms:** -- Dashboard loads but shows no data -- Charts remain empty with "-" placeholders -- Browser console shows 404 errors for API calls -- Flask/server logs show requests for \`/\` but NOT \`/api/*\` endpoints - -### ✅ Solution: Use Relative Paths (TESTED & CONFIRMED) - -**Always use relative paths (no leading \`/\`) for fetch/AJAX calls:** - -\`\`\`javascript -// ✅ CORRECT - 
relative paths work through proxy -fetch('api/metadata') -fetch('api/data?filter=value') - -// ❌ WRONG - absolute paths fail -fetch('/api/metadata') -fetch('/api/data?filter=value') -\`\`\` - -### Alternative: Embed Data in HTML (For Static Dashboards) - -If you don't need dynamic filtering, embed data directly in the template: - -**Python (Flask):** -\`\`\`python -@app.route('/') -def index(): - data = get_data_from_bigquery() - return render_template('dashboard.html', data_json=json.dumps(data)) -\`\`\` - -**HTML Template:** -\`\`\`html - -\`\`\` - -**When to use:** Static dashboards, large datasets that don't change, or when filters can be client-side only. - -### Testing Checklist - -Before deploying: -- [ ] All \`fetch()\` calls use relative paths (\`'api/...'\` not \`'/api/...'\`) -- [ ] Test locally: \`curl http://localhost:PORT/api/endpoint\` -- [ ] Server logs show API requests arriving -- [ ] App UUID obtained (not using placeholder \`[APP_UUID]\`) - ---- - -## Workflow - -### Step 1: Understand Requirements - -Ask the user: -1. **Data source?** BigQuery table, CSV in bucket, or local file? -2. **Visualizations?** Charts (bar, line, scatter), tables, filters? -3. **Interactivity?** Static display or dynamic filtering? - -### Step 2: Auto-Detect Environment - -**Always run these commands first:** - -\`\`\`bash -# Get app UUID (REQUIRED for final URL) -APP_UUID=\$(wb app list --format=json | jq -r '.[] | select(.status == "RUNNING") | .id' | head -1) -echo "App UUID: \$APP_UUID" - -# Verify Python -python3 --version - -# Check working directory -pwd -\`\`\` - -### Step 3: Install Dependencies - -\`\`\`bash -pip install flask flask-cors pandas plotly google-cloud-bigquery db-dtypes -\`\`\` - -> **Note:** \`db-dtypes\` is required for BigQuery to properly convert data types for pandas. 
- -### Step 4: Create Dashboard Structure - -\`\`\` -dashboard/ -├── app.py # Flask server -├── templates/ -│ └── index.html # Dashboard HTML -└── static/ - └── style.css # Optional styling -\`\`\` - ---- - -## Working Template: BigQuery Dashboard - -**app.py:** -\`\`\`python -from flask import Flask, render_template, jsonify -from flask_cors import CORS -from google.cloud import bigquery - -app = Flask(__name__) -CORS(app) - -_data_cache = None - -def get_bigquery_data(): - global _data_cache - if _data_cache is not None: - return _data_cache - - client = bigquery.Client() - query = """ - SELECT * - FROM \\\`YOUR_PROJECT.YOUR_DATASET.YOUR_TABLE\\\` - LIMIT 1000 - """ - df = client.query(query).to_dataframe() - _data_cache = df.to_dict(orient='records') - return _data_cache - -@app.route('/') -def index(): - return render_template('index.html') - -@app.route('api/data') # NO leading slash! -def get_data(): - try: - data = get_bigquery_data() - return jsonify(data) - except Exception as e: - return jsonify({"error": str(e)}), 500 - -@app.route('api/metadata') -def get_metadata(): - try: - data = get_bigquery_data() - return jsonify({ - "columns": list(data[0].keys()) if data else [], - "row_count": len(data) - }) - except Exception as e: - return jsonify({"error": str(e)}), 500 - -if __name__ == '__main__': - # CRITICAL: host='0.0.0.0' required for Workbench proxy access - app.run(host='0.0.0.0', port=8080, debug=False, threaded=True) -\`\`\` - -**templates/index.html:** -\`\`\`html - - - - Data Dashboard - - - - -
-

📊 Data Dashboard

-
Loading...
-
Loading...
-
- - - -\`\`\` - ---- - -## Step 5: Test & Launch - -\`\`\`bash -# Get app UUID -APP_UUID=\$(wb app list --format=json | jq -r '.[] | select(.status == "RUNNING") | .id' | head -1) - -# Start server -cd dashboard -nohup python3 app.py > server.log 2>&1 & - -# Test locally -curl -s http://localhost:8080/api/metadata | jq . - -echo "Dashboard at: https://workbench.verily.com/app/\${APP_UUID}/proxy/8080/" -\`\`\` - ---- - -## ⚠️ Critical Flask Configuration - -\`\`\`python -# ❌ WRONG - proxy cannot reach your app -app.run(host='localhost', port=8080) - -# ✅ CORRECT - accessible through Workbench proxy -app.run(host='0.0.0.0', port=8080, debug=False, threaded=True) -\`\`\` - -**Required settings:** -- \`host='0.0.0.0'\` - Allows external connections (not just localhost) -- \`threaded=True\` - Handles concurrent users -- \`debug=False\` - Security (don't expose debug info) - -**Restart after code changes:** -\`\`\`bash -pkill -f "python3 app.py" -python3 app.py & -\`\`\` - -**Browser not showing changes?** Hard refresh: \`Ctrl+Shift+R\` or \`Cmd+Shift+R\` - ---- - -## Troubleshooting Checklist - -| Issue | Check | Fix | -|-------|-------|-----| -| Data doesn't load | Path format | Change \`fetch('/api/...')\` to \`fetch('api/...')\` | -| 404 errors | Server running? 
| \`ps aux | grep python\` | -| CORS error | CORS setup | Ensure \`CORS(app)\` is added | -| BQ error | Auth | Check \`gcloud auth list\` | -| Blank page | Console errors | Check browser DevTools | -| Works locally, fails via URL | Host binding | Change \`localhost\` to \`0.0.0.0\` | -| Gateway timeout | Server/UUID | Check server running + correct UUID | -| Address in use | Port conflict | \`kill \$(lsof -t -i :8080)\` | -| Changes not showing | Cache/restart | Hard refresh + restart server | - ---- - -## Common Pitfalls - -- ❌ \`fetch('/api/data')\` — **Use** \`fetch('api/data')\` (no leading slash) -- ❌ \`host='localhost'\` — **Use** \`host='0.0.0.0'\` (allows proxy access) -- ❌ Placeholder \`[APP_UUID]\` — **Always get real UUID** with \`wb app list\` -- ❌ Forgetting to restart server after code changes -- ❌ Not checking server logs when debugging -DASHBOARD_SKILL_EOF - - # Create WORKFLOW_TROUBLESHOOT.md skill (full version, embedded) - log_info "Creating WORKFLOW_TROUBLESHOOT.md skill..." - cat > "${SKILLS_DIR}/WORKFLOW_TROUBLESHOOT.md" << 'WORKFLOW_SKILL_EOF' -# WDL Workflow Troubleshooting Skill - -**Trigger:** User asks to troubleshoot, debug, or fix a failed workflow. - -## ⚡ LLM Behavior: Be Proactive! - -**Once the user confirms which job to investigate, DO NOT ask which diagnostic steps to run.** Instead: -1. **Run all diagnostic commands automatically** (Steps 2-4 at minimum) -2. **Analyze the results** and identify the root cause -3. **Report your diagnosis** with evidence (error messages, exit codes, log snippets) -4. **Propose a fix** with specific changes -5. **THEN ask** if they want you to apply the fix or investigate further - -❌ Don't say: "Would you like me to check the logs?" -✅ Do say: "I checked the logs and found an OOM error. The task requested 8GB but needed more. I recommend increasing memory to 16GB in the runtime block." - ---- - -## Quick Diagnosis (Start Here) - -\`\`\`bash -# 1. 
Find failed jobs -wb workflow job list --format=json | jq -r '.[] | select(.status=="FAILED") | "\(.id)\t\(.workflowName)\t\(.startTime)"' - -# 2. Get error message (replace JOB_ID) -wb workflow job describe --job= --format=json | jq -r '.failureMessage // "No message"' - -# 3. Find failed task -wb workflow job task list --job= --format=json | jq -r '.[] | select(.status=="FAILED") | .name' - -# 4. Get task error + logs -wb workflow job task describe --job= --task= --format=json | jq '{stderr, stdout, exitCode, failureMessage}' -\`\`\` - -**After running these 4 commands, you'll know:** which job failed, why, which task, and where logs are. - ---- - -## Step-by-Step Guide - -### Step 1: Identify Failed Job - -\`\`\`bash -# List all failed jobs -wb workflow job list --format=json | jq '.[] | select(.status == "FAILED") | {id, workflowName, status, startTime, endTime}' -\`\`\` - -**For batch jobs:** -\`\`\`bash -# List failed sub-jobs within a batch -wb workflow job batch list --job= --format=json | jq '.[] | select(.status == "FAILED") | {id, status}' -\`\`\` - -**Ask user:** Confirm which job ID to investigate (if multiple failed jobs). 
- ---- - -### Step 2: Get Job Details & Inputs - -\`\`\`bash -wb workflow job describe --job= --format=json | jq '{failureMessage, inputs, outputs}' -\`\`\` - ---- - -### Step 3: Find Failed Task & Get Logs - -\`\`\`bash -# List all tasks with status -wb workflow job task list --job= --format=json | jq '.[] | {name, status, exitCode}' - -# Get failed task details -wb workflow job task describe --job= --task= --format=json -\`\`\` - -**Extract log URLs:** -\`\`\`bash -# Get stderr and stdout URLs -TASK_INFO=\$(wb workflow job task describe --job= --task= --format=json) -STDERR_URL=\$(echo \$TASK_INFO | jq -r '.stderr') -STDOUT_URL=\$(echo \$TASK_INFO | jq -r '.stdout') - -echo "stderr: \$STDERR_URL" -echo "stdout: \$STDOUT_URL" -\`\`\` - ---- - -### Step 4: Pull and Analyze Task Logs - -#### Read Log Contents - -\`\`\`bash -# Read stderr (usually contains errors) -gsutil cat "\$STDERR_URL" 2>/dev/null | tail -100 - -# Read stdout -gsutil cat "\$STDOUT_URL" 2>/dev/null | tail -100 - -# Search for common error patterns -gsutil cat "\$STDERR_URL" 2>/dev/null | grep -i -E "error|exception|failed|denied|killed|oom|memory|disk|timeout" | head -30 -\`\`\` - -#### Common Log File Patterns - -Cromwell execution logs are typically at: -\`\`\` -gs://///execution/ -├── stdout # Task standard output -├── stderr # Task standard error -├── script # The actual command that ran -├── rc # Return code (exit code) -└── script.submit # Submission script -\`\`\` - -**One-liner to read all execution files:** -\`\`\`bash -# Find execution directory from task describe, then: -EXEC_DIR=\$(echo \$TASK_INFO | jq -r '.executionDirectory // empty') -if [ -n "\$EXEC_DIR" ]; then - echo "=== script ===" && gsutil cat "\$EXEC_DIR/script" 2>/dev/null - echo "=== rc ===" && gsutil cat "\$EXEC_DIR/rc" 2>/dev/null - echo "=== stderr (last 50 lines) ===" && gsutil cat "\$EXEC_DIR/stderr" 2>/dev/null | tail -50 -fi -\`\`\` - ---- - -### Step 5: Check Resource Allocation & Usage - -#### What Was Requested 
(from WDL runtime) - -\`\`\`bash -# Get workflow definition to see runtime requirements -wb workflow describe --workflow= --format=json | jq '.sourceUrl' - -# Read WDL file -gsutil cat gs:////workflow.wdl | grep -A10 "runtime {" -\`\`\` - -#### Check Actual Resource Usage (GCP Batch) - -\`\`\`bash -# For GCP Cromwell jobs, get batch job details -gcloud batch jobs list --filter="status.state=FAILED" --format="table(name,status.state,createTime)" - -# Describe specific batch job -gcloud batch jobs describe --format=json | jq '{ - status: .status.state, - statusEvents: .status.statusEvents, - taskGroups: .taskGroups[0].taskSpec.computeResource -}' -\`\`\` - -#### Memory-Specific Checks - -\`\`\`bash -# Check if OOM (Out of Memory) killed the task -gsutil cat "\$STDERR_URL" 2>/dev/null | grep -i -E "oom|out of memory|killed|cannot allocate|memory" - -# Check what memory was requested in batch job -gcloud batch jobs describe --format=json | jq '.taskGroups[0].taskSpec.computeResource.memoryMib' - -# Check dmesg/syslog for OOM events (if available in logs) -gsutil cat "\$STDERR_URL" 2>/dev/null | grep -i "killed process" -\`\`\` - ---- - -### Step 6: Diagnose by Error Type - -#### Memory Issues (OOM) - -**Symptoms:** -- Exit code 137 (SIGKILL) or 143 -- "Killed" in stderr -- "Cannot allocate memory" -- Task succeeded locally but fails at scale - -**Diagnosis:** -\`\`\`bash -# Check requested memory -gcloud batch jobs describe --format=json | jq '.taskGroups[0].taskSpec.computeResource' - -# Look for memory errors in logs -gsutil cat "\$STDERR_URL" 2>/dev/null | grep -i -E "memory|oom|killed|malloc" -\`\`\` - -**Fix:** Increase \`memory\` in WDL runtime block: -\`\`\`wdl -runtime { - memory: "32G" # Increase from previous value -} -\`\`\` - -#### Disk Issues - -**Symptoms:** -- "No space left on device" -- "Disk quota exceeded" - -**Diagnosis:** -\`\`\`bash -gsutil cat "\$STDERR_URL" 2>/dev/null | grep -i -E "space|disk|quota" -\`\`\` - -**Fix:** Increase disk in WDL 
runtime: -\`\`\`wdl -runtime { - disks: "local-disk 200 SSD" # Increase size -} -\`\`\` - -#### Input File Issues - -**Symptoms:** -- "FileNotFoundException" -- "Localization failed" -- File not found errors - -**Diagnosis:** -\`\`\`bash -# Check if input files exist -wb workflow job describe --job= --format=json | jq -r '.inputs | to_entries[] | .value' | while read path; do - if [[ \$path == gs://* ]]; then - echo -n "\$path: " && gsutil ls "\$path" 2>&1 | head -1 - fi -done -\`\`\` - -#### Permission Issues - -**Symptoms:** -- "Permission denied" -- "Access denied" -- 403 errors - -**Diagnosis:** -\`\`\`bash -# Check service account permissions -gcloud batch jobs describe --format=json | jq '.taskGroups[0].taskSpec.serviceAccount' - -# Test bucket access -gsutil ls gs:/// 2>&1 | head -5 -\`\`\` - ---- - -### Step 7: Propose Solution - -Based on diagnosis, recommend one of: - -| Issue | Solution Template | -|-------|-------------------| -| **OOM** | "Increase memory from X to Y in the runtime block" | -| **Disk full** | "Increase disk size from X to Y GB" | -| **Missing input** | "Input file doesn't exist. Verify path: \`gsutil ls \`" | -| **Permission** | "Service account lacks access. Grant \`roles/storage.objectViewer\` on bucket" | -| **Timeout** | "Task exceeded time limit. Increase \`maxRetries\` or optimize task" | -| **Docker** | "Image pull failed. 
Verify image exists and is accessible" | - -**Re-run after fixing:** -\`\`\`bash -wb workflow job run --workflow= --inputs= -\`\`\` - ---- - -## Quick Reference - -### Error → Cause → Fix - -| Exit Code | Meaning | Common Fix | -|-----------|---------|------------| -| 1 | General error | Check stderr for details | -| 2 | Misuse of command | Check script syntax | -| 126 | Permission problem | Check file permissions | -| 127 | Command not found | Check PATH, container image | -| 137 | SIGKILL (OOM) | **Increase memory** | -| 139 | Segfault | Check input data, memory | -| 143 | SIGTERM | Task timeout or preemption | - ---- - -## Workbench-Specific Notes - -- **Log retention:** Cromwell logs persist in workspace execution bucket -- **Batch jobs:** Each sub-job has independent logs; troubleshoot specific failed sub-job -- **VPC-SC:** Run \`gcloud batch\` commands from within workspace app -- **Preemption:** If using spot VMs, set \`preemptible: 0\` for reliability -WORKFLOW_SKILL_EOF - - # Create scientific skills directory and index - log_info "Creating scientific skills..." - mkdir -p "${SKILLS_DIR}/scientific" - - # Create SCIENTIFIC_SKILLS_INDEX.md - cat > "${SKILLS_DIR}/SCIENTIFIC_SKILLS_INDEX.md" << 'SCIENTIFIC_SKILLS_EOF' -# Scientific Skills Index - -**This file routes Claude to domain-specific scientific skills.** -Workbench skills (workflows, dashboards, custom apps) are handled directly by `CLAUDE.md`. - ---- - -## ⚡ Quick Navigation - -| User Says... 
| Read This Skill | -|--------------|-----------------| -| "single-cell" / "RNA-seq" / "scanpy" / "differential expression" | `scientific/BIOINFORMATICS.md` | -| "molecule" / "SMILES" / "drug" / "RDKit" / "ChEMBL" / "target" | `scientific/DRUG_DISCOVERY.md` | -| "gene" / "protein" / "variant" / "UniProt" / "Ensembl" / "PDB" | `scientific/GENOMICS_DATABASES.md` | -| "machine learning" / "sklearn" / "statistics" / "plot" | `scientific/DATA_ANALYSIS.md` | -| "clinical trial" / "PubMed" / "survival analysis" | `scientific/CLINICAL.md` | - ---- - -## Domain Skills - -### 🧬 Bioinformatics (`scientific/BIOINFORMATICS.md`) -Single-cell analysis, differential expression, sequence analysis, RNA velocity. -**Packages:** scanpy, anndata, biopython, pydeseq2, scvelo - -### 💊 Drug Discovery (`scientific/DRUG_DISCOVERY.md`) -Cheminformatics, molecular ML, bioactivity databases, target identification. -**Packages/APIs:** rdkit, deepchem, chembl, drugbank, opentargets - -### 🔬 Genomics Databases (`scientific/GENOMICS_DATABASES.md`) -Gene annotations, protein data, variant interpretation, 3D structures. -**APIs:** ensembl, uniprot, clinvar, pdb - -### 📊 Data Analysis (`scientific/DATA_ANALYSIS.md`) -Machine learning, statistics, visualization. -**Packages:** scikit-learn, statsmodels, plotly, seaborn - -### 🏥 Clinical (`scientific/CLINICAL.md`) -Clinical trials, literature search, survival analysis. -**APIs:** clinicaltrials.gov, pubmed - ---- - -## Adding New Skills - -To add skills from [claude-scientific-skills](https://github.com/K-Dense-AI/claude-scientific-skills): - -1. Copy the `SKILL.md` file to `scientific/.md` -2. Add a row to the Quick Navigation table above -3. Add a domain section below -SCIENTIFIC_SKILLS_EOF - - # Create BIOINFORMATICS.md - cat > "${SKILLS_DIR}/scientific/BIOINFORMATICS.md" << 'BIOINFO_EOF' -# Bioinformatics Skills - -**Trigger:** Single-cell, RNA-seq, sequences, differential expression, trajectory. 
- -## Quick Reference -| Task | Package | Import | -|------|---------|--------| -| Single-cell workflow | scanpy | `import scanpy as sc` | -| Differential expression | pydeseq2 | `from pydeseq2 import DeseqDataSet` | -| Sequence analysis | biopython | `from Bio import SeqIO` | -| RNA velocity | scvelo | `import scvelo as scv` | - -## Scanpy Workflow -```python -import scanpy as sc -adata = sc.read_h5ad('data.h5ad') -sc.pp.calculate_qc_metrics(adata, inplace=True) -sc.pp.normalize_total(adata, target_sum=1e4) -sc.pp.log1p(adata) -sc.pp.highly_variable_genes(adata, n_top_genes=2000) -sc.tl.pca(adata) -sc.pp.neighbors(adata) -sc.tl.umap(adata) -sc.tl.leiden(adata) -sc.tl.rank_genes_groups(adata, 'leiden') -sc.pl.umap(adata, color='leiden') -``` - -## PyDESeq2 (Differential Expression) -```python -from pydeseq2.dds import DeseqDataSet -from pydeseq2.ds import DeseqStats -dds = DeseqDataSet(counts=counts.T, metadata=metadata, design_factors='condition') -dds.deseq2() -stat_res = DeseqStats(dds, contrast=['condition', 'treated', 'control']) -results = stat_res.results_df -sig = results[(results['padj'] < 0.05) & (abs(results['log2FoldChange']) > 1)] -``` - -## Biopython -```python -from Bio import SeqIO, Entrez -Entrez.email = "email@example.com" -# Parse FASTA -for record in SeqIO.parse('seq.fasta', 'fasta'): - print(record.id, len(record.seq)) -# NCBI fetch -handle = Entrez.efetch(db="nucleotide", id="NM_001301717", rettype="fasta") -``` - -Install: `pip install scanpy anndata pydeseq2 biopython scvelo` -BIOINFO_EOF - - # Create DRUG_DISCOVERY.md - cat > "${SKILLS_DIR}/scientific/DRUG_DISCOVERY.md" << 'DRUGDISC_EOF' -# Drug Discovery Skills - -**Trigger:** Molecules, SMILES, drugs, fingerprints, ADMET, targets, bioactivity. 
- -## Quick Reference -| Task | Tool | Access | -|------|------|--------| -| Molecular properties | rdkit | `from rdkit import Chem` | -| ADMET prediction | deepchem | `import deepchem as dc` | -| Bioactivity (IC50, Ki) | ChEMBL | REST API | -| Drug info | DrugBank | REST API | -| Target-disease | Open Targets | GraphQL | - -## RDKit -```python -from rdkit import Chem -from rdkit.Chem import Descriptors, AllChem, DataStructs - -mol = Chem.MolFromSmiles('CC(=O)OC1=CC=CC=C1C(=O)O') # Aspirin -mw = Descriptors.MolWt(mol) -logp = Descriptors.MolLogP(mol) -hbd = Descriptors.NumHDonors(mol) -hba = Descriptors.NumHAcceptors(mol) - -# Fingerprint similarity -fp1 = AllChem.GetMorganFingerprintAsBitVect(mol1, radius=2) -fp2 = AllChem.GetMorganFingerprintAsBitVect(mol2, radius=2) -similarity = DataStructs.TanimotoSimilarity(fp1, fp2) -``` - -## ChEMBL API -```python -from chembl_webresource_client.new_client import new_client -molecule = new_client.molecule -activity = new_client.activity -# Search compound -aspirin = molecule.filter(pref_name__iexact='aspirin')[0] -# Get activities for target -acts = activity.filter(target_chembl_id='CHEMBL230', pchembl_value__gte=6) -``` - -## Open Targets API -```python -import requests -query = '''query { target(ensemblId: "ENSG00000157764") { - approvedSymbol - associatedDiseases { rows { disease { name } score } } -}}''' -r = requests.post("https://api.platform.opentargets.org/api/v4/graphql", json={'query': query}) -``` - -Install: `pip install rdkit deepchem chembl_webresource_client` -DRUGDISC_EOF - - # Create GENOMICS_DATABASES.md - cat > "${SKILLS_DIR}/scientific/GENOMICS_DATABASES.md" << 'GENOMICS_EOF' -# Genomics Databases Skills - -**Trigger:** Genes, proteins, variants, structures, Ensembl, UniProt, ClinVar, PDB. 
- -## Quick Reference -| Need | Database | API | -|------|----------|-----| -| Gene annotations | Ensembl | REST | -| Protein data | UniProt | REST | -| Variant pathogenicity | ClinVar | E-utilities | -| 3D structures | PDB | REST | - -## Ensembl -```python -import requests -SERVER = "https://rest.ensembl.org" -# Gene lookup -gene = requests.get(f"{SERVER}/lookup/symbol/homo_sapiens/BRCA1", - headers={"Content-Type": "application/json"}).json() -# Sequence -seq = requests.get(f"{SERVER}/sequence/id/{gene['id']}").json() -``` - -## UniProt -```python -import requests -# Search protein -r = requests.get("https://rest.uniprot.org/uniprotkb/search", - params={"query": "gene:TP53 AND organism_id:9606", "format": "json"}) -# Get by ID -protein = requests.get("https://rest.uniprot.org/uniprotkb/P04637.json").json() -``` - -## ClinVar -```python -from Bio import Entrez -Entrez.email = "email@example.com" -handle = Entrez.esearch(db="clinvar", term="BRCA1[gene] AND pathogenic[clinsig]") -record = Entrez.read(handle) -``` - -## PDB -```python -import requests -# Get structure -structure = requests.get("https://data.rcsb.org/rest/v1/core/entry/1TUP").json() -# Download PDB file -pdb = requests.get("https://files.rcsb.org/download/1TUP.pdb").text -``` - -Install: `pip install biopython requests` -GENOMICS_EOF - - # Create DATA_ANALYSIS.md - cat > "${SKILLS_DIR}/scientific/DATA_ANALYSIS.md" << 'DATAANALYSIS_EOF' -# Data Analysis Skills - -**Trigger:** ML, statistics, visualization, sklearn, regression, clustering, plots. 
- -## Quick Reference -| Task | Package | Import | -|------|---------|--------| -| ML models | scikit-learn | `from sklearn.ensemble import RandomForestClassifier` | -| Statistics | statsmodels | `import statsmodels.api as sm` | -| Interactive plots | plotly | `import plotly.express as px` | -| Statistical plots | seaborn | `import seaborn as sns` | - -## Scikit-learn -```python -from sklearn.model_selection import train_test_split, cross_val_score -from sklearn.ensemble import RandomForestClassifier -from sklearn.metrics import classification_report - -X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) -model = RandomForestClassifier(n_estimators=100) -model.fit(X_train, y_train) -print(classification_report(y_test, model.predict(X_test))) -cv_scores = cross_val_score(model, X, y, cv=5) -``` - -## Statsmodels -```python -import statsmodels.api as sm -X_const = sm.add_constant(X) -model = sm.OLS(y, X_const).fit() -print(model.summary()) # Full regression output with p-values -``` - -## Plotly -```python -import plotly.express as px -fig = px.scatter(df, x='x', y='y', color='category', hover_data=['name']) -fig.show() -fig = px.histogram(df, x='value', color='group') -fig = px.box(df, x='category', y='value') -``` - -## Seaborn -```python -import seaborn as sns -import matplotlib.pyplot as plt -sns.boxplot(data=df, x='category', y='value', hue='group') -sns.heatmap(df.corr(), annot=True, cmap='coolwarm') -sns.pairplot(df, hue='category') -plt.savefig('plot.png', dpi=300) -``` - -Install: `pip install scikit-learn statsmodels plotly seaborn` -DATAANALYSIS_EOF - - # Create CLINICAL.md - cat > "${SKILLS_DIR}/scientific/CLINICAL.md" << 'CLINICAL_EOF' -# Clinical Skills - -**Trigger:** Clinical trials, PubMed, literature, survival analysis. 
- -## Quick Reference -| Task | Source | Access | -|------|--------|--------| -| Clinical trials | ClinicalTrials.gov | REST API | -| Literature | PubMed | E-utilities | -| Survival analysis | lifelines | Python | - -## ClinicalTrials.gov API -```python -import requests -BASE = "https://clinicaltrials.gov/api/v2" -# Search trials -r = requests.get(f"{BASE}/studies", params={ - "query.cond": "breast cancer", - "query.intr": "pembrolizumab", - "filter.overallStatus": "RECRUITING" -}) -for study in r.json()['studies']: - info = study['protocolSection']['identificationModule'] - print(f"{info['nctId']}: {info['briefTitle']}") -``` - -## PubMed -```python -from Bio import Entrez -Entrez.email = "email@example.com" -handle = Entrez.esearch(db="pubmed", term="CRISPR cancer[Title/Abstract]", retmax=20) -pmids = Entrez.read(handle)['IdList'] -handle = Entrez.efetch(db="pubmed", id=pmids, rettype="abstract") -print(handle.read()) -``` - -## Survival Analysis (lifelines) -```python -from lifelines import KaplanMeierFitter, CoxPHFitter -from lifelines.statistics import logrank_test - -kmf = KaplanMeierFitter() -kmf.fit(durations, events, label='Survival') -kmf.plot_survival_function() - -# Compare groups -results = logrank_test(dur1, dur2, ev1, ev2) -print(f"p-value: {results.p_value:.4f}") - -# Cox regression -cph = CoxPHFitter() -cph.fit(df, duration_col='time', event_col='event') -cph.print_summary() -``` - -Install: `pip install biopython requests lifelines` -CLINICAL_EOF - - # AWS-specific skill overrides — overwrite only the platform-sensitive skills. - # GCP skills written above are left untouched for GCP workspaces. - if [ "$cloud_platform" = "AWS" ]; then - log_info "Applying AWS skill variants for WORKFLOW_TROUBLESHOOT and DASHBOARD_BUILDER..." - - cat > "${SKILLS_DIR}/WORKFLOW_TROUBLESHOOT.md" << 'AWS_WORKFLOW_SKILL_EOF' -# WDL Workflow Troubleshooting Skill (AWS) - -**Trigger:** User asks to troubleshoot, debug, or fix a failed workflow. 
- -## Behavior - -Once the target job is identified: -1. Run all diagnostic commands (Steps 2–4) without waiting for further instruction -2. Collect error message, failed task name, logs, and exit code -3. Identify the root cause from the evidence -4. Present the diagnosis with supporting log snippets or error output -5. Propose a specific fix - ---- - -## Quick Diagnosis (Start Here) - -```bash -# 1. Find failed jobs -wb workflow job list --format=json | jq -r '.[] | select(.status=="FAILED") | "\(.id)\t\(.workflowName)\t\(.startTime)"' - -# 2. Get error message (replace JOB_ID) -wb workflow job describe --job= --format=json | jq -r '.failureMessage // "No message"' - -# 3. Find failed task -wb workflow job task list --job= --format=json | jq -r '.[] | select(.status=="FAILED") | .name' -# 4. Get task error + logs -wb workflow job task describe --job= --task= --format=json | jq '{stderr, stdout, exitCode, failureMessage}' -``` - -**After running these 4 commands, you'll know:** which job failed, why, which task, and where logs are. - ---- - -## Step-by-Step Guide - -### Step 1: Identify Failed Job - -```bash -wb workflow job list --format=json | jq '.[] | select(.status == "FAILED") | {id, workflowName, status, startTime, endTime}' -``` - -**For batch jobs:** -```bash -wb workflow job batch list --job= --format=json | jq '.[] | select(.status == "FAILED") | {id, status}' -``` - -**Ask user:** Confirm which job ID to investigate (if multiple failed jobs). 
- ---- - -### Step 2: Get Job Details & Inputs - -```bash -wb workflow job describe --job= --format=json -``` - -**Key fields to extract:** -```bash -wb workflow job describe --job= --format=json | jq -r '.failureMessage' -wb workflow job describe --job= --format=json | jq '.inputs' -wb workflow job describe --job= --format=json | jq '.outputs' -``` - ---- - -### Step 3: Find Failed Task & Get Logs - -```bash -wb workflow job task list --job= --format=json | jq '.[] | {name, status, exitCode}' -wb workflow job task describe --job= --task= --format=json -``` - -**Extract log URLs:** -```bash -TASK_INFO=$(wb workflow job task describe --job= --task= --format=json) -STDERR_URL=$(echo $TASK_INFO | jq -r '.stderr') -STDOUT_URL=$(echo $TASK_INFO | jq -r '.stdout') -echo "stderr: $STDERR_URL" -echo "stdout: $STDOUT_URL" -``` - ---- - -### Step 4: Pull and Analyze Task Logs - -#### Read Log Contents - -```bash -# Read stderr (usually contains errors) — logs are in S3 -aws s3 cp "$STDERR_URL" - 2>/dev/null | tail -100 - -# Read stdout -aws s3 cp "$STDOUT_URL" - 2>/dev/null | tail -100 - -# Search for common error patterns -aws s3 cp "$STDERR_URL" - 2>/dev/null | grep -i -E "error|exception|failed|denied|killed|oom|memory|disk|timeout" | head -30 -``` - -#### Common Log File Patterns - -Cromwell execution logs are typically at: -``` -s3://///execution/ -├── stdout # Task standard output -├── stderr # Task standard error -├── script # The actual command that ran -├── rc # Return code (exit code) -└── script.submit # Submission script -``` - -**One-liner to read all execution files:** -```bash -EXEC_DIR=$(echo $TASK_INFO | jq -r '.executionDirectory // empty') -if [ -n "$EXEC_DIR" ]; then - echo "=== script ===" && aws s3 cp "$EXEC_DIR/script" - 2>/dev/null - echo "=== rc ===" && aws s3 cp "$EXEC_DIR/rc" - 2>/dev/null - echo "=== stderr (last 50 lines) ===" && aws s3 cp "$EXEC_DIR/stderr" - 2>/dev/null | tail -50 -fi -``` - ---- - -### Step 5: Check Resource Allocation & Usage 
- -#### What Was Requested (from WDL runtime) - -```bash -wb workflow describe --workflow= --format=json | jq '.sourceUrl' - -# Read WDL file -aws s3 cp s3:////workflow.wdl - | grep -A10 "runtime {" -``` - -#### Check Actual Resource Usage (AWS Batch) - -```bash -# List failed AWS Batch jobs -aws batch list-jobs --job-queue --job-status FAILED \ - --query 'jobSummaryList[*].{id:jobId,name:jobName,status:status}' --output table - -# Describe specific batch job -aws batch describe-jobs --jobs | jq '.jobs[0] | { - status: .status, - statusReason: .statusReason, - container: .container.resourceRequirements -}' -``` - -#### Memory-Specific Checks - -```bash -# Check if OOM killed the task -aws s3 cp "$STDERR_URL" - 2>/dev/null | grep -i -E "oom|out of memory|killed|cannot allocate|memory" - -# Check what memory was requested in the batch job -aws batch describe-jobs --jobs | jq '.jobs[0].container.resourceRequirements[] | select(.type=="MEMORY")' - -# Check for OOM kill signal in stderr -aws s3 cp "$STDERR_URL" - 2>/dev/null | grep -i "killed process" -``` - ---- - -### Step 6: Diagnose by Error Type - -#### Memory Issues (OOM) - -**Symptoms:** -- Exit code 137 (SIGKILL) or 143 -- "Killed" in stderr -- "Cannot allocate memory" -- Task succeeded locally but fails at scale - -**Diagnosis:** -```bash -aws batch describe-jobs --jobs | jq '.jobs[0].container.resourceRequirements' -aws s3 cp "$STDERR_URL" - 2>/dev/null | grep -i -E "memory|oom|killed|malloc" -``` - -**Fix:** Increase `memory` in WDL runtime block: -```wdl -runtime { - memory: "32G" -} -``` - -#### Disk Issues - -**Symptoms:** -- "No space left on device" -- "Disk quota exceeded" - -**Diagnosis:** -```bash -aws s3 cp "$STDERR_URL" - 2>/dev/null | grep -i -E "space|disk|quota" -``` - -**Fix:** Increase disk in WDL runtime: -```wdl -runtime { - disks: "local-disk 200 SSD" -} -``` - -#### Input File Issues - -**Symptoms:** -- "FileNotFoundException" -- "Localization failed" -- File not found errors - 
-**Diagnosis:** -```bash -wb workflow job describe --job= --format=json | jq -r '.inputs | to_entries[] | .value' | while read path; do - if [[ $path == s3://* ]]; then - echo -n "$path: " && aws s3 ls "$path" 2>&1 | head -1 - fi -done -``` - -#### Permission Issues - -**Symptoms:** -- "Permission denied" / "Access denied" / 403 errors - -**Diagnosis:** -```bash -# Check IAM role attached to batch job -aws batch describe-jobs --jobs | jq '.jobs[0].jobDefinition' - -# Test bucket access -aws s3 ls s3:/// 2>&1 | head -5 -``` - ---- - -### Step 7: Propose Solution - -| Issue | Solution Template | -|-------|-------------------| -| **OOM** | "Increase memory from X to Y in the runtime block" | -| **Disk full** | "Increase disk size from X to Y GB" | -| **Missing input** | "Input file doesn't exist. Verify path: `aws s3 ls `" | -| **Permission** | "IAM role lacks S3 access. Grant `s3:GetObject` on the bucket" | -| **Timeout** | "Task exceeded time limit. Increase `maxRetries` or optimize task" | -| **Docker** | "Image pull failed. 
Verify image exists and is accessible" | -| **Other** | Describe the root cause from logs and propose a fix based on the specific error | - -**Re-run after fixing:** -```bash -wb workflow job run --workflow= --inputs= -``` - ---- - -## Quick Reference - -### Essential Commands - -```bash -# Failed jobs -wb workflow job list --format=json | jq '.[] | select(.status=="FAILED") | {id, workflowName}' - -# Job error -wb workflow job describe --job= --format=json | jq '.failureMessage' - -# Failed tasks -wb workflow job task list --job= --format=json | jq '.[] | select(.status=="FAILED") | .name' - -# Task logs (S3) -wb workflow job task describe --job= --task= --format=json | jq -r '.stderr' | xargs -I{} aws s3 cp {} - | tail -50 - -# Memory check (AWS Batch) -aws batch describe-jobs --jobs | jq '.jobs[0].container.resourceRequirements' -``` - -### Error → Cause → Fix - -| Exit Code | Meaning | Common Fix | -|-----------|---------|------------| -| 1 | General error | Check stderr for details | -| 2 | Misuse of command | Check script syntax | -| 126 | Permission problem | Check file permissions | -| 127 | Command not found | Check PATH, container image | -| 137 | SIGKILL (OOM) | **Increase memory** | -| 139 | Segfault | Check input data, memory | -| 143 | SIGTERM | Task timeout or preemption | - ---- - -## Workbench-Specific Notes - -- **Log retention:** Cromwell logs persist in workspace execution bucket (S3) -- **Batch jobs:** Each sub-job has independent logs; troubleshoot specific failed sub-job -- **Preemption:** If using spot instances, set `preemptible: 0` for reliability -AWS_WORKFLOW_SKILL_EOF - - cat > "${SKILLS_DIR}/DASHBOARD_BUILDER.md" << 'AWS_DASHBOARD_SKILL_EOF' -# Web Apps & Dashboards Skill (AWS) - -**Build interactive web apps, dashboards, and visualizations that run on a port in Workbench.** - -> **Triggers:** -> - "Create a dashboard", "visualize data", "build charts" -> - "Run a Flask/Streamlit/FastAPI app" -> - "Display data in the browser", 
"interactive UI" -> - Any web app that serves content on a port - ---- - -## 🌐 Workbench Proxy & Web Apps Best Practices - -### Proxy URL Format - -All web apps in Workbench are accessed via: -``` -https://workbench.verily.com/app/[APP_UUID]/proxy/[PORT]/[PATH] -``` - -### ⚠️ How to Get the App UUID (CRITICAL) - -**You MUST automatically get the app UUID - NEVER ask the user for it.** - -```bash -wb app list --format=json | jq -r '.[] | select(.status == "RUNNING") | .id' | head -1 -``` - -**⚡ LLM INSTRUCTION:** When constructing dashboard/proxy URLs: -1. First run the command above to get the running app UUID -2. Use that actual UUID in the URL you provide -3. Do NOT use placeholders like `[APP_UUID]` in your final response -4. Do NOT ask the user to find/replace the UUID themselves - -### ✅ Correct URL Examples -``` -https://workbench.verily.com/app/abc123-def456-789/proxy/8080/ -https://workbench.verily.com/app/abc123-def456-789/proxy/8501/index.html -``` - -### ❌ WRONG URL Formats (These WILL fail) -``` -https://abc123-def456.workbench-app.verily.com/ ← WRONG -http://localhost:8080/ ← WRONG: Not accessible externally -``` - -### ⚠️ Common Issue: JavaScript API Calls Failing - -**Problem:** JavaScript using absolute paths fails through Workbench proxy - -**Solution: Use Relative Paths (TESTED & CONFIRMED)** - -```javascript -// ✅ CORRECT - relative paths work through proxy -fetch('api/metadata') -fetch('api/data?filter=value') - -// ❌ WRONG - absolute paths fail -fetch('/api/metadata') -fetch('/api/data?filter=value') -``` - ---- - -## Workflow - -### Step 1: Understand Requirements - -Ask the user: -1. **Data source?** S3 file (CSV, Parquet, JSON), Athena query, or local file? -2. **Visualizations?** Charts (bar, line, scatter), tables, filters? -3. **Interactivity?** Static display or dynamic filtering? 
- -### Step 2: Auto-Detect Environment - -```bash -APP_UUID=$(wb app list --format=json | jq -r '.[] | select(.status == "RUNNING") | .id' | head -1) -echo "App UUID: $APP_UUID" -python3 --version -pwd -``` - -### Step 3: Install Dependencies - -```bash -pip install flask flask-cors pandas plotly boto3 psycopg2-binary -``` - -### Step 4: Create Dashboard Structure - -``` -dashboard/ -├── app.py -├── templates/ -│ └── index.html -└── static/ - └── style.css -``` - ---- - -## Working Templates - -### Template 1: S3 Data Dashboard - -**app.py:** -```python -from flask import Flask, render_template, jsonify -from flask_cors import CORS -import pandas as pd -import boto3 -import os - -app = Flask(__name__) -CORS(app) - -_data_cache = None - -def get_data_from_s3(): - global _data_cache - if _data_cache is not None: - return _data_cache - - # Use the WORKBENCH_ env var set by Workbench - bucket = os.environ.get('WORKBENCH_my_bucket', 'your-bucket-name') - s3 = boto3.client('s3') - obj = s3.get_object(Bucket=bucket, Key='path/to/data.csv') - df = pd.read_csv(obj['Body']) - _data_cache = df.to_dict(orient='records') - return _data_cache - -@app.route('/') -def index(): - return render_template('index.html') - -@app.route('api/data') # NO leading slash! 
-def get_data(): - try: - data = get_data_from_s3() - return jsonify(data) - except Exception as e: - return jsonify({"error": str(e)}), 500 - -@app.route('api/metadata') -def get_metadata(): - try: - data = get_data_from_s3() - if data: - return jsonify({"columns": list(data[0].keys()), "row_count": len(data)}) - return jsonify({"columns": [], "row_count": 0}) - except Exception as e: - return jsonify({"error": str(e)}), 500 - -if __name__ == '__main__': - # CRITICAL: host='0.0.0.0' required for Workbench proxy access - app.run(host='0.0.0.0', port=8080, debug=False, threaded=True) -``` - -### Template 2: Aurora PostgreSQL Dashboard - -Aurora in Workbench uses **IAM database authentication** — you cannot connect with a static -password. The correct flow is: - -1. Get temporary AWS credentials via `wb resource credentials` -2. Generate an IAM auth token via boto3 (token is valid for 15 minutes) -3. Connect with `sslmode='require'` — **SSL is mandatory; connections are rejected without it** - -```python -import json, subprocess, boto3, psycopg2, pandas as pd, os - -def get_aurora_connection(resource_id: str, username: str): - """ - Returns an open psycopg2 connection to a Workbench-managed Aurora database. - resource_id: the Workbench resource ID (e.g. 'test-db-1') - username: the IAM database user (check with your workspace admin) - """ - # Step 1 — get temporary AWS credentials from Workbench - result = subprocess.run( - ['wb', 'resource', 'credentials', - f'--id={resource_id}', '--scope=WRITE_READ', '--format=json'], - capture_output=True, text=True, check=True - ) - creds = json.loads(result.stdout) - - # Step 2 — parse connection details from WORKBENCH_* env var - # Format: "host:port/dbname" e.g. 
"abc.cluster.us-west-2.rds.amazonaws.com:5432/mydb" - conn_str = os.environ.get(f'WORKBENCH_{resource_id.replace("-", "_")}', '') - host_part, _, dbname = conn_str.partition('/') - host, _, port = host_part.partition(':') - port = int(port) if port else 5432 - - # Step 3 — generate IAM auth token (valid 15 min) - session = boto3.Session( - aws_access_key_id=creds['AccessKeyId'], - aws_secret_access_key=creds['SecretAccessKey'], - aws_session_token=creds['SessionToken'], - region_name='us-west-2' - ) - auth_token = session.client('rds').generate_db_auth_token( - DBHostname=host, Port=port, DBUsername=username, Region='us-west-2' - ) - - # Step 4 — connect with SSL (REQUIRED — Aurora rejects unencrypted connections) - return psycopg2.connect( - host=host, port=port, database=dbname, - user=username, password=auth_token, - sslmode='require' # mandatory — omitting this causes "PAM authentication failed" - ) - -def get_data_from_aurora(): - global _data_cache - if _data_cache is not None: - return _data_cache - conn = get_aurora_connection('test-db-1', 'your-iam-username') - df = pd.read_sql('SELECT * FROM your_table LIMIT 1000', conn) - conn.close() - _data_cache = df.to_dict(orient='records') - return _data_cache -``` - -> **Why IAM auth?** Workbench-managed Aurora databases are configured for IAM authentication only. -> Static passwords will fail with "PAM authentication failed" or "pg_hba.conf rejects connection". - -### Alternative: Embed Data in HTML (For Static Dashboards) - -```python -import json -@app.route('/') -def index(): - data = get_data_from_s3() - return render_template('dashboard.html', data_json=json.dumps(data)) -``` - -```html - -``` - ---- - -## Troubleshooting - -### No data showing - -**1. Test API directly:** -```bash -curl http://localhost:8080/api/data | python3 -m json.tool | head -20 -``` - -**2. Check S3 access:** -```bash -aws s3 ls s3:///path/to/data.csv -``` - -**3. 
Check server logs:** -```bash -tail -f server.log -``` - -### Server won't start - -```bash -lsof -i :8080 -kill $(lsof -t -i :8080) -python3 app.py -``` - -### S3 / AWS errors - -```bash -# Check AWS credentials -aws sts get-caller-identity - -# Test S3 access -aws s3 ls s3:/// - -# Check env vars set by Workbench -env | grep WORKBENCH -``` - -### Aurora connection errors - -Aurora requires IAM authentication + SSL. Plain password connections are rejected. - -**Symptoms and causes:** -- `"PAM authentication failed"` → not using IAM auth token as password -- `"pg_hba.conf rejects connection... no encryption"` → missing `sslmode='require'` -- `"SSL connection is required"` → same SSL issue - -**Step-by-step fix:** - -```bash -# 1. Get temporary credentials from Workbench (scoped to this resource) -wb resource credentials --id= --scope=WRITE_READ --format=json -# Returns: {"AccessKeyId":"...","SecretAccessKey":"...","SessionToken":"..."} -``` - -```python -import boto3, psycopg2, json, subprocess - -# 2. Generate IAM auth token -result = subprocess.run( - ['wb', 'resource', 'credentials', '--id=', '--scope=WRITE_READ', '--format=json'], - capture_output=True, text=True, check=True -) -creds = json.loads(result.stdout) - -session = boto3.Session( - aws_access_key_id=creds['AccessKeyId'], - aws_secret_access_key=creds['SecretAccessKey'], - aws_session_token=creds['SessionToken'], - region_name='us-west-2' -) -auth_token = session.client('rds').generate_db_auth_token( - DBHostname='', Port=5432, - DBUsername='', Region='us-west-2' -) - -# 3. Connect with SSL enabled (mandatory) -conn = psycopg2.connect( - host='', port=5432, database='', - user='', password=auth_token, - sslmode='require' # CRITICAL — without this, connection is rejected -) -``` - -**AWS CLI alternative (to verify the token works):** -```bash -# Export the credentials first -export AWS_ACCESS_KEY_ID="..." -export AWS_SECRET_ACCESS_KEY="..." -export AWS_SESSION_TOKEN="..." 
- -# Generate auth token -TOKEN=$(aws rds generate-db-auth-token \ - --hostname --port 5432 \ - --region us-west-2 --username ) - -# Connect (psql requires SSL flag) -PGSSLMODE=require psql "host= port=5432 dbname= user= password=$TOKEN" -``` - -### Server not accessible through proxy - -**Fix:** Ensure Flask is bound to `0.0.0.0`, not `localhost`: -```python -app.run(host='0.0.0.0', port=8080) -``` - ---- - -## Common Pitfalls Checklist - -- [ ] **Relative paths** - All `fetch()` calls use `'api/...'` not `'/api/...'` -- [ ] **Host is 0.0.0.0** - Not `localhost` or `127.0.0.1` -- [ ] **threaded=True** - For concurrent users -- [ ] **debug=False** - For security -- [ ] **App UUID obtained** - Not using placeholder `[APP_UUID]` -- [ ] **S3 access verified** - `aws s3 ls s3:///` returns files -- [ ] **Data cached** - Avoid repeated S3 reads -- [ ] **Error handling** - API returns errors as JSON, not crashes -- [ ] **CORS enabled** - `CORS(app)` added -- [ ] **Aurora: IAM auth** - Using `wb resource credentials` + boto3 token, not a static password -- [ ] **Aurora: SSL enabled** - `sslmode='require'` in psycopg2.connect() - ---- - -## Quick Reference - -| Issue | Check | Fix | -|-------|-------|-----| -| 404 on API | Path format | Remove leading `/` from fetch | -| CORS error | CORS setup | Add `CORS(app)` | -| Blank page | Server running? | `ps aux \| grep python` | -| S3 error | AWS credentials | `aws sts get-caller-identity` | -| Wrong port | URL vs code | Match port in URL to `app.run()` | -| Works locally, fails via URL | Host binding | Change `localhost` to `0.0.0.0` | -| Gateway timeout | Server/UUID | Check server running + correct UUID | -| Aurora: PAM auth failed | IAM auth | Use `wb resource credentials` + boto3 token | -| Aurora: no encryption | SSL missing | Add `sslmode='require'` to psycopg2.connect() | - ---- - -## Example Prompts This Skill Handles + if [[ ! 
-d "${source_skills}" ]]; then + log_warn "Skill source directory not found at ${source_skills}, skipping skill installation" + return + fi -- "Create a dashboard showing data from my S3 bucket" -- "Build an interactive chart for analyzing patient demographics" -- "Visualize the CSV files in my bucket" -- "Make a web dashboard with filters for exploring data" -- "Display query results in a browser with charts" -AWS_DASHBOARD_SKILL_EOF + # Copy all base skill files + for skill_file in "${source_skills}"/*.md; do + [[ -f "${skill_file}" ]] && cp "${skill_file}" "${SKILLS_DIR}/" + done + + # Copy scientific skills + if [[ -d "${source_skills}/scientific" ]]; then + mkdir -p "${SKILLS_DIR}/scientific" + for skill_file in "${source_skills}/scientific"/*.md; do + [[ -f "${skill_file}" ]] && cp "${skill_file}" "${SKILLS_DIR}/scientific/" + done + fi + # AWS-specific skill overrides — overwrite only the platform-sensitive skills. + if [ "$cloud_platform" = "AWS" ] && [[ -d "${source_skills}/aws" ]]; then + log_info "Applying AWS skill variants for WORKFLOW_TROUBLESHOOT and DASHBOARD_BUILDER..." + for skill_file in "${source_skills}/aws"/*.md; do + [[ -f "${skill_file}" ]] && cp "${skill_file}" "${SKILLS_DIR}/" + done log_info "AWS skill variants applied." fi + + log_info "Skill files installed." } # Fetch workspace information diff --git a/features/src/llm-context/install.sh b/features/src/llm-context/install.sh index 3fefd2443..5acb30757 100644 --- a/features/src/llm-context/install.sh +++ b/features/src/llm-context/install.sh @@ -81,11 +81,14 @@ else exit 1 fi -# Create user-specific directories -USER_WORKBENCH_DIR="${USER_HOME_DIR}/.workbench" -USER_SKILLS_DIR="${USER_WORKBENCH_DIR}/skills" -mkdir -p "${USER_WORKBENCH_DIR}" -mkdir -p "${USER_SKILLS_DIR}" +# Copy skill files to installation directory +if [[ -d "${FEATURE_DIR}/skills" ]]; then + mkdir -p "${LLM_CONTEXT_DIR}/skills" + cp -r "${FEATURE_DIR}/skills/." 
"${LLM_CONTEXT_DIR}/skills/" + echo "Copied skill files to ${LLM_CONTEXT_DIR}/skills" +else + echo "Warning: skills directory not found in ${FEATURE_DIR}" +fi # Create a wrapper script that runs with proper user context cat > "${LLM_CONTEXT_DIR}/run-context-generator.sh" << WRAPPER_EOF @@ -99,7 +102,7 @@ cat > "${LLM_CONTEXT_DIR}/run-context-generator.sh" << WRAPPER_EOF MAX_RETRIES=8 RETRY_DELAY=10 for i in \$(seq 1 \${MAX_RETRIES}); do - if command -v wb &> /dev/null && wb workspace describe &> /dev/null 2>&1; then + if command -v wb &> /dev/null && wb workspace describe &> /dev/null; then echo "Workspace ready (attempt \${i}). Generating LLM context..." ${GENERATE_SCRIPT} || echo "LLM context generation failed (non-fatal)" exit 0 @@ -117,15 +120,17 @@ chmod +x "${LLM_CONTEXT_DIR}/run-context-generator.sh" chown -R "${USERNAME}:" "${LLM_CONTEXT_DIR}" 2>/dev/null || true chown -R "${USERNAME}:" "${USER_WORKBENCH_DIR}" 2>/dev/null || true -# Add aliases and environment to bashrc -{ - echo "" - echo "# LLM Context Generator" - echo "export LLM_CONTEXT_ENABLED=true" - echo "export LLM_CONTEXT_HOME=\"${USER_HOME_DIR}\"" - echo "alias generate-llm-context='${GENERATE_SCRIPT} ${USER_HOME_DIR}'" - echo "alias refresh-context='${GENERATE_SCRIPT} ${USER_HOME_DIR}'" -} >> "${USER_HOME_DIR}/.bashrc" +# Add aliases and environment to bashrc (idempotent) +if ! 
grep -q "# LLM Context Generator" "${USER_HOME_DIR}/.bashrc" 2>/dev/null; then + { + echo "" + echo "# LLM Context Generator" + echo "export LLM_CONTEXT_ENABLED=true" + echo "export LLM_CONTEXT_HOME=\"${USER_HOME_DIR}\"" + echo "alias generate-llm-context='${GENERATE_SCRIPT} ${USER_HOME_DIR}'" + echo "alias refresh-context='${GENERATE_SCRIPT} ${USER_HOME_DIR}'" + } >> "${USER_HOME_DIR}/.bashrc" +fi # Make sure the login user is the owner of their .bashrc chown "${USERNAME}:" "${USER_HOME_DIR}/.bashrc" 2>/dev/null || true diff --git a/features/src/llm-context/skills/APP_TEMPLATES.md b/features/src/llm-context/skills/APP_TEMPLATES.md index bc7237b95..00de407eb 100644 --- a/features/src/llm-context/skills/APP_TEMPLATES.md +++ b/features/src/llm-context/skills/APP_TEMPLATES.md @@ -47,36 +47,41 @@ ## Template Locations -All templates are in: +The official app repository contains reference implementations and examples: ``` -https://github.com/aculotti-verily/wb-app-mcp-and-context/tree/templates-only/src/templates/ +https://github.com/verily-src/workbench-app-devcontainers/tree/master/src/ ``` -Each template contains: -- `manifest.yaml` - Capabilities and inputs +Good starting points: +- [`example`](https://github.com/verily-src/workbench-app-devcontainers/tree/master/src/example) — Minimal standalone app (ttyd terminal) +- [`workbench-vscode`](https://github.com/verily-src/workbench-app-devcontainers/tree/master/src/workbench-vscode) — Full-featured VS Code Server + +Each app contains: - `.devcontainer.json` - Devcontainer config - `docker-compose.yaml` - Container setup - `Dockerfile` - Build instructions -- `app/` - Application code -- `README.md` - Documentation +- `devcontainer-template.json` - Template metadata +- Application code --- ## How to Use a Template -### Option 1: Deploy Directly -``` -Repository: https://github.com/aculotti-verily/wb-app-mcp-and-context.git -Branch: templates-only -Folder: src/templates/ -``` +### Recommended: Fork and Customize + +The 
official repo (`verily-src/workbench-app-devcontainers`) is a curated collection of common/default apps. **Create a fork** for your custom app rather than submitting a PR to the org repo: + +1. Fork https://github.com/verily-src/workbench-app-devcontainers +2. Copy an existing app folder (e.g., `src/example`) to `src/my-app` +3. Modify application code +4. Update `devcontainer-template.json` with new name/description +5. Push to your fork +6. Deploy from your fork's repo URL -### Option 2: Copy and Customize -1. Copy the template folder to user's repo -2. Modify application code in `app/` -3. Update `devcontainer-template.json` with new name/description -4. Push to GitHub -5. Deploy from user's repo +### Alternative: Standalone Repo +1. Copy the template files to a new repository +2. Ensure `.devcontainer.json` is at the repo root +3. Push to GitHub and deploy from your repo > ⚠️ Volume mounts (`volumes: .:/workspace`) are for local dev only. In production, Workbench builds the image — code must be baked in via `COPY` in the Dockerfile. Do not rely on volume mounts for deployed apps. diff --git a/features/src/llm-context/skills/CUSTOM_APP.md b/features/src/llm-context/skills/CUSTOM_APP.md index 4e592c7d1..7c16e6367 100644 --- a/features/src/llm-context/skills/CUSTOM_APP.md +++ b/features/src/llm-context/skills/CUSTOM_APP.md @@ -5,6 +5,23 @@ --- +## Quick Start (Recommended) + +The official repo has a script that generates a complete app structure: + +```bash +# Clone the official repo +git clone https://github.com/verily-src/workbench-app-devcontainers.git +cd workbench-app-devcontainers + +# Run the quick start script +./scripts/create-custom-app.sh my-app quay.io/jupyter/base-notebook 8888 jovyan /home/jovyan +``` + +This generates all required files in `src/my-app/` with correct structure. 
+ +--- + ## ⚠️ Choose Your Pattern | Pattern | Use When | Example | @@ -348,3 +365,17 @@ ls -la /home/core/devcontainer/ | `Failed to clone devcontainer GitHub repo` | GitHub access issue | Check repo permissions | | `Container exited with code 1` | App crash | Check `docker logs application-server` | | `proxy-agent or application-server is not started` | Container never started | Check earlier logs | + +--- + +## When to Use Features + +Sometimes you need the full-featured approach: + +| Need | Solution | +|------|----------| +| Workbench CLI (`wb`) | Use `workbench-tools` feature | +| LLM/MCP integration | Use `wb-mcp-server` feature | +| Pre-authenticated gcloud | Use `workbench-tools` feature | + +**If you need these, use the full `workbench-app-devcontainers` repo as your base.** diff --git a/features/src/llm-context/skills/WORKFLOW_TROUBLESHOOT.md b/features/src/llm-context/skills/WORKFLOW_TROUBLESHOOT.md index c9bde06f9..93672acc8 100644 --- a/features/src/llm-context/skills/WORKFLOW_TROUBLESHOOT.md +++ b/features/src/llm-context/skills/WORKFLOW_TROUBLESHOOT.md @@ -4,12 +4,15 @@ ## Behavior -Once the target job is identified: -1. Run all diagnostic commands (Steps 2–4) without waiting for further instruction -2. Collect error message, failed task name, logs, and exit code -3. Identify the root cause from the evidence -4. Present the diagnosis with supporting log snippets or error output -5. Propose a specific fix +**Once the user confirms which job to investigate, DO NOT ask which diagnostic steps to run.** Instead: +1. **Run all diagnostic commands automatically** (Steps 2–4 at minimum) +2. **Analyze the results** and identify the root cause +3. **Report your diagnosis** with evidence (error messages, exit codes, log snippets) +4. **Propose a fix** with specific changes +5. **THEN ask** if they want you to apply the fix or investigate further + +Don't say: "Would you like me to check the logs?" +Do say: "I checked the logs and found an OOM error. 
The task requested 8GB but needed more. I recommend increasing memory to 16GB in the runtime block." --- diff --git a/features/src/llm-context/skills/aws/DASHBOARD_BUILDER.md b/features/src/llm-context/skills/aws/DASHBOARD_BUILDER.md new file mode 100644 index 000000000..08062a495 --- /dev/null +++ b/features/src/llm-context/skills/aws/DASHBOARD_BUILDER.md @@ -0,0 +1,393 @@ +# Web Apps & Dashboards Skill (AWS) + +**Build interactive web apps, dashboards, and visualizations that run on a port in Workbench.** + +> **Triggers:** +> - "Create a dashboard", "visualize data", "build charts" +> - "Run a Flask/Streamlit/FastAPI app" +> - "Display data in the browser", "interactive UI" +> - Any web app that serves content on a port + +--- + +## Workbench Proxy & Web Apps Best Practices + +### Proxy URL Format + +All web apps in Workbench are accessed via: +``` +https://workbench.verily.com/app/[APP_UUID]/proxy/[PORT]/[PATH] +``` + +### How to Get the App UUID (CRITICAL) + +**You MUST automatically get the app UUID - NEVER ask the user for it.** + +```bash +wb app list --format=json | jq -r '.[] | select(.status == "RUNNING") | .id' | head -1 +``` + +**LLM INSTRUCTION:** When constructing dashboard/proxy URLs: +1. First run the command above to get the running app UUID +2. Use that actual UUID in the URL you provide +3. Do NOT use placeholders like `[APP_UUID]` in your final response +4. 
Do NOT ask the user to find/replace the UUID themselves + +### Correct URL Examples +``` +https://workbench.verily.com/app/abc123-def456-789/proxy/8080/ +https://workbench.verily.com/app/abc123-def456-789/proxy/8501/index.html +``` + +### WRONG URL Formats (These WILL fail) +``` +https://abc123-def456.workbench-app.verily.com/ <- WRONG +http://localhost:8080/ <- WRONG: Not accessible externally +``` + +### Common Issue: JavaScript API Calls Failing + +**Problem:** JavaScript using absolute paths fails through Workbench proxy + +**Solution: Use Relative Paths (TESTED & CONFIRMED)** + +```javascript +// CORRECT - relative paths work through proxy +fetch('api/metadata') +fetch('api/data?filter=value') + +// WRONG - absolute paths fail +fetch('/api/metadata') +fetch('/api/data?filter=value') +``` + +--- + +## Workflow + +### Step 1: Understand Requirements + +Ask the user: +1. **Data source?** S3 file (CSV, Parquet, JSON), Athena query, or local file? +2. **Visualizations?** Charts (bar, line, scatter), tables, filters? +3. **Interactivity?** Static display or dynamic filtering? 
+ +### Step 2: Auto-Detect Environment + +```bash +APP_UUID=$(wb app list --format=json | jq -r '.[] | select(.status == "RUNNING") | .id' | head -1) +echo "App UUID: $APP_UUID" +python3 --version +pwd +``` + +### Step 3: Install Dependencies + +```bash +pip install flask flask-cors pandas plotly boto3 psycopg2-binary +``` + +### Step 4: Create Dashboard Structure + +``` +dashboard/ +├── app.py +├── templates/ +│ └── index.html +└── static/ + └── style.css +``` + +--- + +## Working Templates + +### Template 1: S3 Data Dashboard + +**app.py:** +```python +from flask import Flask, render_template, jsonify +from flask_cors import CORS +import pandas as pd +import boto3 +import os + +app = Flask(__name__) +CORS(app) + +_data_cache = None + +def get_data_from_s3(): + global _data_cache + if _data_cache is not None: + return _data_cache + + # Use the WORKBENCH_ env var set by Workbench + bucket = os.environ.get('WORKBENCH_my_bucket', 'your-bucket-name') + s3 = boto3.client('s3') + obj = s3.get_object(Bucket=bucket, Key='path/to/data.csv') + df = pd.read_csv(obj['Body']) + _data_cache = df.to_dict(orient='records') + return _data_cache + +@app.route('/') +def index(): + return render_template('index.html') + +@app.route('/api/data') # Flask routes MUST start with '/'; only client-side fetch() paths are relative 
+def get_data(): + try: + data = get_data_from_s3() + return jsonify(data) + except Exception as e: + return jsonify({"error": str(e)}), 500 + +@app.route('/api/metadata') +def get_metadata(): + try: + data = get_data_from_s3() + if data: + return jsonify({"columns": list(data[0].keys()), "row_count": len(data)}) + return jsonify({"columns": [], "row_count": 0}) + except Exception as e: + return jsonify({"error": str(e)}), 500 + +if __name__ == '__main__': + # CRITICAL: host='0.0.0.0' required for Workbench proxy access + app.run(host='0.0.0.0', port=8080, debug=False, threaded=True) +``` + +### Template 2: Aurora PostgreSQL Dashboard + +Aurora in Workbench uses **IAM database authentication** — you cannot connect with a static +password. The correct flow is: + +1. Get temporary AWS credentials via `wb resource credentials` +2. Generate an IAM auth token via boto3 (token is valid for 15 minutes) +3. Connect with `sslmode='require'` — **SSL is mandatory; connections are rejected without it** + +```python +import json, subprocess, boto3, psycopg2, pandas as pd, os + +def get_aurora_connection(resource_id: str, username: str): + """ + Returns an open psycopg2 connection to a Workbench-managed Aurora database. + resource_id: the Workbench resource ID (e.g. 'test-db-1') + username: the IAM database user (check with your workspace admin) + """ + # Step 1 — get temporary AWS credentials from Workbench + result = subprocess.run( + ['wb', 'resource', 'credentials', + f'--id={resource_id}', '--scope=WRITE_READ', '--format=json'], + capture_output=True, text=True, check=True + ) + creds = json.loads(result.stdout) + + # Step 2 — parse connection details from WORKBENCH_* env var + # Format: "host:port/dbname" e.g. 
"abc.cluster.us-west-2.rds.amazonaws.com:5432/mydb" + conn_str = os.environ.get(f'WORKBENCH_{resource_id.replace("-", "_")}', '') + host_part, _, dbname = conn_str.partition('/') + host, _, port = host_part.partition(':') + port = int(port) if port else 5432 + + # Step 3 — generate IAM auth token (valid 15 min) + session = boto3.Session( + aws_access_key_id=creds['AccessKeyId'], + aws_secret_access_key=creds['SecretAccessKey'], + aws_session_token=creds['SessionToken'], + region_name='us-west-2' + ) + auth_token = session.client('rds').generate_db_auth_token( + DBHostname=host, Port=port, DBUsername=username, Region='us-west-2' + ) + + # Step 4 — connect with SSL (REQUIRED — Aurora rejects unencrypted connections) + return psycopg2.connect( + host=host, port=port, database=dbname, + user=username, password=auth_token, + sslmode='require' # mandatory — omitting this causes "pg_hba.conf rejects connection... no encryption" + ) + +def get_data_from_aurora(): + global _data_cache + if _data_cache is not None: + return _data_cache + conn = get_aurora_connection('test-db-1', 'your-iam-username') + df = pd.read_sql('SELECT * FROM your_table LIMIT 1000', conn) + conn.close() + _data_cache = df.to_dict(orient='records') + return _data_cache +``` + +> **Why IAM auth?** Workbench-managed Aurora databases are configured for IAM authentication only. +> Static passwords will fail with "PAM authentication failed" or "pg_hba.conf rejects connection". + +### Alternative: Embed Data in HTML (For Static Dashboards) + +```python +import json +@app.route('/') +def index(): + data = get_data_from_s3() + return render_template('dashboard.html', data_json=json.dumps(data)) +``` + +```html + +``` + +--- + +## Troubleshooting + +### No data showing + +**1. Test API directly:** +```bash +curl http://localhost:8080/api/data | python3 -m json.tool | head -20 +``` + +**2. Check S3 access:** +```bash +aws s3 ls s3:///path/to/data.csv +``` + +**3. 
Check server logs:** +```bash +tail -f server.log +``` + +### Server won't start + +```bash +lsof -i :8080 +kill $(lsof -t -i :8080) +python3 app.py +``` + +### S3 / AWS errors + +```bash +# Check AWS credentials +aws sts get-caller-identity + +# Test S3 access +aws s3 ls s3:/// + +# Check env vars set by Workbench +env | grep WORKBENCH +``` + +### Aurora connection errors + +Aurora requires IAM authentication + SSL. Plain password connections are rejected. + +**Symptoms and causes:** +- `"PAM authentication failed"` -> not using IAM auth token as password +- `"pg_hba.conf rejects connection... no encryption"` -> missing `sslmode='require'` +- `"SSL connection is required"` -> same SSL issue + +**Step-by-step fix:** + +```bash +# 1. Get temporary credentials from Workbench (scoped to this resource) +wb resource credentials --id= --scope=WRITE_READ --format=json +# Returns: {"AccessKeyId":"...","SecretAccessKey":"...","SessionToken":"..."} +``` + +```python +import boto3, psycopg2, json, subprocess + +# 2. Generate IAM auth token +result = subprocess.run( + ['wb', 'resource', 'credentials', '--id=', '--scope=WRITE_READ', '--format=json'], + capture_output=True, text=True, check=True +) +creds = json.loads(result.stdout) + +session = boto3.Session( + aws_access_key_id=creds['AccessKeyId'], + aws_secret_access_key=creds['SecretAccessKey'], + aws_session_token=creds['SessionToken'], + region_name='us-west-2' +) +auth_token = session.client('rds').generate_db_auth_token( + DBHostname='', Port=5432, + DBUsername='', Region='us-west-2' +) + +# 3. Connect with SSL enabled (mandatory) +conn = psycopg2.connect( + host='', port=5432, database='', + user='', password=auth_token, + sslmode='require' # CRITICAL — without this, connection is rejected +) +``` + +**AWS CLI alternative (to verify the token works):** +```bash +# Export the credentials first +export AWS_ACCESS_KEY_ID="..." +export AWS_SECRET_ACCESS_KEY="..." +export AWS_SESSION_TOKEN="..." 
+ +# Generate auth token +TOKEN=$(aws rds generate-db-auth-token \ + --hostname --port 5432 \ + --region us-west-2 --username ) + +# Connect (psql requires SSL flag) +PGSSLMODE=require psql "host= port=5432 dbname= user= password=$TOKEN" +``` + +### Server not accessible through proxy + +**Fix:** Ensure Flask is bound to `0.0.0.0`, not `localhost`: +```python +app.run(host='0.0.0.0', port=8080) +``` + +--- + +## Common Pitfalls Checklist + +- [ ] **Relative paths** - All `fetch()` calls use `'api/...'` not `'/api/...'` +- [ ] **Host is 0.0.0.0** - Not `localhost` or `127.0.0.1` +- [ ] **threaded=True** - For concurrent users +- [ ] **debug=False** - For security +- [ ] **App UUID obtained** - Not using placeholder `[APP_UUID]` +- [ ] **S3 access verified** - `aws s3 ls s3:///` returns files +- [ ] **Data cached** - Avoid repeated S3 reads +- [ ] **Error handling** - API returns errors as JSON, not crashes +- [ ] **CORS enabled** - `CORS(app)` added +- [ ] **Aurora: IAM auth** - Using `wb resource credentials` + boto3 token, not a static password +- [ ] **Aurora: SSL enabled** - `sslmode='require'` in psycopg2.connect() + +--- + +## Quick Reference + +| Issue | Check | Fix | +|-------|-------|-----| +| 404 on API | Path format | Remove leading `/` from fetch | +| CORS error | CORS setup | Add `CORS(app)` | +| Blank page | Server running? 
| `ps aux \| grep python` | +| S3 error | AWS credentials | `aws sts get-caller-identity` | +| Wrong port | URL vs code | Match port in URL to `app.run()` | +| Works locally, fails via URL | Host binding | Change `localhost` to `0.0.0.0` | +| Gateway timeout | Server/UUID | Check server running + correct UUID | +| Aurora: PAM auth failed | IAM auth | Use `wb resource credentials` + boto3 token | +| Aurora: no encryption | SSL missing | Add `sslmode='require'` to psycopg2.connect() | + +--- + +## Example Prompts This Skill Handles + +- "Create a dashboard showing data from my S3 bucket" +- "Build an interactive chart for analyzing patient demographics" +- "Visualize the CSV files in my bucket" +- "Make a web dashboard with filters for exploring data" +- "Display query results in a browser with charts" diff --git a/features/src/llm-context/skills/aws/WORKFLOW_TROUBLESHOOT.md b/features/src/llm-context/skills/aws/WORKFLOW_TROUBLESHOOT.md new file mode 100644 index 000000000..9befb708a --- /dev/null +++ b/features/src/llm-context/skills/aws/WORKFLOW_TROUBLESHOOT.md @@ -0,0 +1,300 @@ +# WDL Workflow Troubleshooting Skill (AWS) + +**Trigger:** User asks to troubleshoot, debug, or fix a failed workflow. + +## Behavior + +**Once the user confirms which job to investigate, DO NOT ask which diagnostic steps to run.** Instead: +1. **Run all diagnostic commands automatically** (Steps 2–4 at minimum) +2. **Analyze the results** and identify the root cause +3. **Report your diagnosis** with evidence (error messages, exit codes, log snippets) +4. **Propose a fix** with specific changes +5. **THEN ask** if they want you to apply the fix or investigate further + +Don't say: "Would you like me to check the logs?" +Do say: "I checked the logs and found an OOM error. The task requested 8GB but needed more. I recommend increasing memory to 16GB in the runtime block." + +--- + +## Quick Diagnosis (Start Here) + +```bash +# 1. 
Find failed jobs +wb workflow job list --format=json | jq -r '.[] | select(.status=="FAILED") | "\(.id)\t\(.workflowName)\t\(.startTime)"' + +# 2. Get error message (replace JOB_ID) +wb workflow job describe --job= --format=json | jq -r '.failureMessage // "No message"' + +# 3. Find failed task +wb workflow job task list --job= --format=json | jq -r '.[] | select(.status=="FAILED") | .name' + +# 4. Get task error + logs +wb workflow job task describe --job= --task= --format=json | jq '{stderr, stdout, exitCode, failureMessage}' +``` + +**After running these 4 commands, you'll know:** which job failed, why, which task, and where logs are. + +--- + +## Step-by-Step Guide + +### Step 1: Identify Failed Job + +```bash +wb workflow job list --format=json | jq '.[] | select(.status == "FAILED") | {id, workflowName, status, startTime, endTime}' +``` + +**For batch jobs:** +```bash +wb workflow job batch list --job= --format=json | jq '.[] | select(.status == "FAILED") | {id, status}' +``` + +**Ask user:** Confirm which job ID to investigate (if multiple failed jobs). 
+ +--- + +### Step 2: Get Job Details & Inputs + +```bash +wb workflow job describe --job= --format=json +``` + +**Key fields to extract:** +```bash +wb workflow job describe --job= --format=json | jq -r '.failureMessage' +wb workflow job describe --job= --format=json | jq '.inputs' +wb workflow job describe --job= --format=json | jq '.outputs' +``` + +--- + +### Step 3: Find Failed Task & Get Logs + +```bash +wb workflow job task list --job= --format=json | jq '.[] | {name, status, exitCode}' +wb workflow job task describe --job= --task= --format=json +``` + +**Extract log URLs:** +```bash +TASK_INFO=$(wb workflow job task describe --job= --task= --format=json) +STDERR_URL=$(echo $TASK_INFO | jq -r '.stderr') +STDOUT_URL=$(echo $TASK_INFO | jq -r '.stdout') +echo "stderr: $STDERR_URL" +echo "stdout: $STDOUT_URL" +``` + +--- + +### Step 4: Pull and Analyze Task Logs + +#### Read Log Contents + +```bash +# Read stderr (usually contains errors) — logs are in S3 +aws s3 cp "$STDERR_URL" - 2>/dev/null | tail -100 + +# Read stdout +aws s3 cp "$STDOUT_URL" - 2>/dev/null | tail -100 + +# Search for common error patterns +aws s3 cp "$STDERR_URL" - 2>/dev/null | grep -i -E "error|exception|failed|denied|killed|oom|memory|disk|timeout" | head -30 +``` + +#### Common Log File Patterns + +Cromwell execution logs are typically at: +``` +s3://///execution/ +├── stdout # Task standard output +├── stderr # Task standard error +├── script # The actual command that ran +├── rc # Return code (exit code) +└── script.submit # Submission script +``` + +**One-liner to read all execution files:** +```bash +EXEC_DIR=$(echo $TASK_INFO | jq -r '.executionDirectory // empty') +if [ -n "$EXEC_DIR" ]; then + echo "=== script ===" && aws s3 cp "$EXEC_DIR/script" - 2>/dev/null + echo "=== rc ===" && aws s3 cp "$EXEC_DIR/rc" - 2>/dev/null + echo "=== stderr (last 50 lines) ===" && aws s3 cp "$EXEC_DIR/stderr" - 2>/dev/null | tail -50 +fi +``` + +--- + +### Step 5: Check Resource Allocation & Usage 
+ +#### What Was Requested (from WDL runtime) + +```bash +wb workflow describe --workflow= --format=json | jq '.sourceUrl' + +# Read WDL file +aws s3 cp s3:////workflow.wdl - | grep -A10 "runtime {" +``` + +#### Check Actual Resource Usage (AWS Batch) + +```bash +# List failed AWS Batch jobs +aws batch list-jobs --job-queue --job-status FAILED \ + --query 'jobSummaryList[*].{id:jobId,name:jobName,status:status}' --output table + +# Describe specific batch job +aws batch describe-jobs --jobs | jq '.jobs[0] | { + status: .status, + statusReason: .statusReason, + container: .container.resourceRequirements +}' +``` + +#### Memory-Specific Checks + +```bash +# Check if OOM killed the task +aws s3 cp "$STDERR_URL" - 2>/dev/null | grep -i -E "oom|out of memory|killed|cannot allocate|memory" + +# Check what memory was requested in the batch job +aws batch describe-jobs --jobs | jq '.jobs[0].container.resourceRequirements[] | select(.type=="MEMORY")' + +# Check for OOM kill signal in stderr +aws s3 cp "$STDERR_URL" - 2>/dev/null | grep -i "killed process" +``` + +--- + +### Step 6: Diagnose by Error Type + +#### Memory Issues (OOM) + +**Symptoms:** +- Exit code 137 (SIGKILL) or 143 +- "Killed" in stderr +- "Cannot allocate memory" +- Task succeeded locally but fails at scale + +**Diagnosis:** +```bash +aws batch describe-jobs --jobs | jq '.jobs[0].container.resourceRequirements' +aws s3 cp "$STDERR_URL" - 2>/dev/null | grep -i -E "memory|oom|killed|malloc" +``` + +**Fix:** Increase `memory` in WDL runtime block: +```wdl +runtime { + memory: "32G" +} +``` + +#### Disk Issues + +**Symptoms:** +- "No space left on device" +- "Disk quota exceeded" + +**Diagnosis:** +```bash +aws s3 cp "$STDERR_URL" - 2>/dev/null | grep -i -E "space|disk|quota" +``` + +**Fix:** Increase disk in WDL runtime: +```wdl +runtime { + disks: "local-disk 200 SSD" +} +``` + +#### Input File Issues + +**Symptoms:** +- "FileNotFoundException" +- "Localization failed" +- File not found errors + 
+**Diagnosis:** +```bash +wb workflow job describe --job= --format=json | jq -r '.inputs | to_entries[] | .value' | while read path; do + if [[ $path == s3://* ]]; then + echo -n "$path: " && aws s3 ls "$path" 2>&1 | head -1 + fi +done +``` + +#### Permission Issues + +**Symptoms:** +- "Permission denied" / "Access denied" / 403 errors + +**Diagnosis:** +```bash +# Check IAM role attached to batch job +aws batch describe-jobs --jobs | jq '.jobs[0].jobDefinition' + +# Test bucket access +aws s3 ls s3:/// 2>&1 | head -5 +``` + +--- + +### Step 7: Propose Solution + +| Issue | Solution Template | +|-------|-------------------| +| **OOM** | "Increase memory from X to Y in the runtime block" | +| **Disk full** | "Increase disk size from X to Y GB" | +| **Missing input** | "Input file doesn't exist. Verify path: `aws s3 ls `" | +| **Permission** | "IAM role lacks S3 access. Grant `s3:GetObject` on the bucket" | +| **Timeout** | "Task exceeded time limit. Increase `maxRetries` or optimize task" | +| **Docker** | "Image pull failed. 
Verify image exists and is accessible" | +| **Other** | Describe the root cause from logs and propose a fix based on the specific error | + +**Re-run after fixing:** +```bash +wb workflow job run --workflow= --inputs= +``` + +--- + +## Quick Reference + +### Essential Commands + +```bash +# Failed jobs +wb workflow job list --format=json | jq '.[] | select(.status=="FAILED") | {id, workflowName}' + +# Job error +wb workflow job describe --job= --format=json | jq '.failureMessage' + +# Failed tasks +wb workflow job task list --job= --format=json | jq '.[] | select(.status=="FAILED") | .name' + +# Task logs (S3) +wb workflow job task describe --job= --task= --format=json | jq -r '.stderr' | xargs -I{} aws s3 cp {} - | tail -50 + +# Memory check (AWS Batch) +aws batch describe-jobs --jobs | jq '.jobs[0].container.resourceRequirements' +``` + +### Error -> Cause -> Fix + +| Exit Code | Meaning | Common Fix | +|-----------|---------|------------| +| 1 | General error | Check stderr for details | +| 2 | Misuse of command | Check script syntax | +| 126 | Permission problem | Check file permissions | +| 127 | Command not found | Check PATH, container image | +| 137 | SIGKILL (OOM) | **Increase memory** | +| 139 | Segfault | Check input data, memory | +| 143 | SIGTERM | Task timeout or preemption | + +--- + +## Workbench-Specific Notes + +- **Log retention:** Cromwell logs persist in workspace execution bucket (S3) +- **Batch jobs:** Each sub-job has independent logs; troubleshoot specific failed sub-job +- **Preemption:** If using spot instances, set `preemptible: 0` for reliability diff --git a/features/src/wb-mcp-server/README.md b/features/src/wb-mcp-server/README.md index 6d464616f..23c7fb403 100644 --- a/features/src/wb-mcp-server/README.md +++ b/features/src/wb-mcp-server/README.md @@ -89,7 +89,7 @@ Uses `filter_build_attribute` and `cohort_create_in_workspace`. 
Uses `filter_build_attribute`, `filter_build_relationship`, `filter_build_boolean_logic`, and `cohort_create_in_workspace`. -## How It Works +## Internals ### Authentication - Auto-fetches bearer token from `wb auth print-access-token` diff --git a/features/src/wb-mcp-server/go.mod b/features/src/wb-mcp-server/go.mod index 62d3c9cbb..c1fe167a8 100644 --- a/features/src/wb-mcp-server/go.mod +++ b/features/src/wb-mcp-server/go.mod @@ -1,3 +1,3 @@ module github.com/verily-src/wb-mcp-server -go 1.21 +go 1.25 diff --git a/features/src/wb-mcp-server/install.sh b/features/src/wb-mcp-server/install.sh index 3a6fc132c..2d29dad3c 100755 --- a/features/src/wb-mcp-server/install.sh +++ b/features/src/wb-mcp-server/install.sh @@ -66,9 +66,14 @@ check_packages \ # Check if Go is installed if ! command -v go &> /dev/null; then - echo "Go is not installed. Installing Go 1.21..." - GOLANG_VERSION="1.21.6" - GOLANG_ARCH="amd64" + echo "Go is not installed. Installing Go 1.25..." + GOLANG_VERSION="1.25.0" + case "$(uname -m)" in + x86_64) GOLANG_ARCH="amd64" ;; + aarch64) GOLANG_ARCH="arm64" ;; + armv7l) GOLANG_ARCH="armv6l" ;; + *) echo "Unsupported architecture: $(uname -m)"; exit 1 ;; + esac cd "${WORKDIR}" curl -fsSL "https://go.dev/dl/go${GOLANG_VERSION}.linux-${GOLANG_ARCH}.tar.gz" -o go.tar.gz @@ -178,14 +183,16 @@ if command -v gemini &> /dev/null; then fi -# Add auto-start to .bashrc -{ - echo "" - echo "# Workbench MCP Server - auto-start" - echo "if ! pgrep -f 'wb-mcp-server -http' > /dev/null 2>&1; then" - echo " /opt/wb-mcp-server/start-server.sh > /dev/null 2>&1" - echo "fi" -} >> "${USER_HOME_DIR}/.bashrc" +# Add auto-start to .bashrc (idempotent) +if ! grep -q "# Workbench MCP Server - auto-start" "${USER_HOME_DIR}/.bashrc" 2>/dev/null; then + { + echo "" + echo "# Workbench MCP Server - auto-start" + echo "if ! 
pgrep -f 'wb-mcp-server -http' > /dev/null 2>&1; then" + echo " /opt/wb-mcp-server/start-server.sh > /dev/null 2>&1" + echo "fi" + } >> "${USER_HOME_DIR}/.bashrc" +fi chown "${USERNAME}:" "${USER_HOME_DIR}/.bashrc" diff --git a/features/src/wb-mcp-server/main.go b/features/src/wb-mcp-server/main.go index 2eb712d88..fc33e812b 100644 --- a/features/src/wb-mcp-server/main.go +++ b/features/src/wb-mcp-server/main.go @@ -1637,8 +1637,10 @@ func resolveWorkspaceId(workspaceId string) (string, error) { if !ok { continue } - if wsMap["userFacingId"].(string) == workspaceId || wsMap["id"].(string) == workspaceId { - return wsMap["id"].(string), nil + userFacingId, _ := wsMap["userFacingId"].(string) + id, _ := wsMap["id"].(string) + if userFacingId == workspaceId || id == workspaceId { + return id, nil } } return "", fmt.Errorf("workspace '%s' not found", workspaceId) @@ -1691,6 +1693,30 @@ func executeWbCommand(args []string) (string, error) { return string(output), err } +func requireString(args map[string]interface{}, key string) (string, error) { + val, ok := args[key] + if !ok || val == nil { + return "", fmt.Errorf("missing required parameter: %s", key) + } + s, ok := val.(string) + if !ok { + return "", fmt.Errorf("parameter %s must be a string, got %T", key, val) + } + return s, nil +} + +func requireStrings(args map[string]interface{}, keys ...string) ([]string, error) { + vals := make([]string, len(keys)) + for i, key := range keys { + v, err := requireString(args, key) + if err != nil { + return nil, err + } + vals[i] = v + } + return vals, nil +} + func handleCallTool(params CallToolParams) CallToolResult { var output string var err error @@ -2527,8 +2553,11 @@ func handleCallTool(params CallToolParams) CallToolResult { output = string(outputBytes) case "workspace_create": - id := params.Arguments["id"].(string) - podId := params.Arguments["podId"].(string) + vals, reqErr := requireStrings(params.Arguments, "id", "podId") + if reqErr != nil { + return 
CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + id, podId := vals[0], vals[1] args := []string{"workspace", "create", "--id=" + id, "--pod=" + podId} if name, ok := params.Arguments["name"].(string); ok { args = append(args, "--name="+name) @@ -2542,11 +2571,17 @@ func handleCallTool(params CallToolParams) CallToolResult { output, err = executeWbCommand(args) case "workspace_delete": - workspaceId := params.Arguments["workspaceId"].(string) + workspaceId, reqErr := requireString(params.Arguments, "workspaceId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } output, err = executeWbCommand([]string{"workspace", "delete", "--workspace=" + workspaceId}) case "workspace_update": - workspaceId := params.Arguments["workspaceId"].(string) + workspaceId, reqErr := requireString(params.Arguments, "workspaceId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } args := []string{"workspace", "update", "--workspace=" + workspaceId} if name, ok := params.Arguments["name"].(string); ok { args = append(args, "--name="+name) @@ -2557,8 +2592,11 @@ func handleCallTool(params CallToolParams) CallToolResult { output, err = executeWbCommand(args) case "workspace_duplicate": - sourceId := params.Arguments["sourceWorkspaceId"].(string) - destId := params.Arguments["destWorkspaceId"].(string) + vals, reqErr := requireStrings(params.Arguments, "sourceWorkspaceId", "destWorkspaceId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + sourceId, destId := vals[0], vals[1] args := []string{"workspace", "duplicate", "--source-workspace=" + sourceId, "--destination-workspace-id=" + destId} if name, ok := params.Arguments["name"].(string); ok { args = append(args, 
"--name="+name) @@ -2566,34 +2604,46 @@ func handleCallTool(params CallToolParams) CallToolResult { output, err = executeWbCommand(args) case "workspace_set_property": - workspaceId := params.Arguments["workspaceId"].(string) - key := params.Arguments["key"].(string) - value := params.Arguments["value"].(string) - output, err = executeWbCommand([]string{"workspace", "set-property", "--workspace=" + workspaceId, "--key=" + key, "--value=" + value}) + vals, reqErr := requireStrings(params.Arguments, "workspaceId", "key", "value") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"workspace", "set-property", "--workspace=" + vals[0], "--key=" + vals[1], "--value=" + vals[2]}) case "workspace_delete_property": - workspaceId := params.Arguments["workspaceId"].(string) - key := params.Arguments["key"].(string) - output, err = executeWbCommand([]string{"workspace", "delete-property", "--workspace=" + workspaceId, "--key=" + key}) + vals, reqErr := requireStrings(params.Arguments, "workspaceId", "key") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"workspace", "delete-property", "--workspace=" + vals[0], "--key=" + vals[1]}) case "workspace_add_user": - workspaceId := params.Arguments["workspaceId"].(string) - email := params.Arguments["email"].(string) - role := params.Arguments["role"].(string) - output, err = executeWbCommand([]string{"workspace", "add-user", "--workspace=" + workspaceId, "--email=" + email, "--role=" + role}) + vals, reqErr := requireStrings(params.Arguments, "workspaceId", "email", "role") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"workspace", "add-user", 
"--workspace=" + vals[0], "--email=" + vals[1], "--role=" + vals[2]}) case "workspace_remove_user": - workspaceId := params.Arguments["workspaceId"].(string) - email := params.Arguments["email"].(string) - output, err = executeWbCommand([]string{"workspace", "remove-user", "--workspace=" + workspaceId, "--email=" + email}) + vals, reqErr := requireStrings(params.Arguments, "workspaceId", "email") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"workspace", "remove-user", "--workspace=" + vals[0], "--email=" + vals[1]}) case "workspace_list_users": - workspaceId := params.Arguments["workspaceId"].(string) + workspaceId, reqErr := requireString(params.Arguments, "workspaceId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } output, err = executeWbCommand([]string{"workspace", "list-users", "--workspace=" + workspaceId}) case "resource_create_bucket": - resourceId := params.Arguments["resourceId"].(string) - bucketName := params.Arguments["bucketName"].(string) + vals, reqErr := requireStrings(params.Arguments, "resourceId", "bucketName") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + resourceId, bucketName := vals[0], vals[1] args := []string{"resource", "create", "gcs-bucket", "--id=" + resourceId, "--bucket-name=" + bucketName} if desc, ok := params.Arguments["description"].(string); ok { args = append(args, "--description="+desc) @@ -2601,8 +2651,11 @@ func handleCallTool(params CallToolParams) CallToolResult { output, err = executeWbCommand(args) case "resource_create_bq_dataset": - resourceId := params.Arguments["resourceId"].(string) - datasetId := params.Arguments["datasetId"].(string) + vals, reqErr := requireStrings(params.Arguments, "resourceId", 
"datasetId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + resourceId, datasetId := vals[0], vals[1] args := []string{"resource", "create", "bq-dataset", "--id=" + resourceId, "--dataset-id=" + datasetId} if desc, ok := params.Arguments["description"].(string); ok { args = append(args, "--description="+desc) @@ -2610,11 +2663,17 @@ func handleCallTool(params CallToolParams) CallToolResult { output, err = executeWbCommand(args) case "resource_delete": - resourceId := params.Arguments["resourceId"].(string) + resourceId, reqErr := requireString(params.Arguments, "resourceId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } output, err = executeWbCommand([]string{"resource", "delete", "--name=" + resourceId}) case "resource_update": - resourceId := params.Arguments["resourceId"].(string) + resourceId, reqErr := requireString(params.Arguments, "resourceId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } args := []string{"resource", "update", "--name=" + resourceId} if name, ok := params.Arguments["name"].(string); ok { args = append(args, "--new-name="+name) @@ -2625,9 +2684,11 @@ func handleCallTool(params CallToolParams) CallToolResult { output, err = executeWbCommand(args) case "resource_add_reference": - resourceId := params.Arguments["resourceId"].(string) - resourceType := params.Arguments["resourceType"].(string) - path := params.Arguments["path"].(string) + vals, reqErr := requireStrings(params.Arguments, "resourceId", "resourceType", "path") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + resourceId, resourceType, path := vals[0], vals[1], vals[2] args := []string{"resource", "add-ref", resourceType, 
"--name=" + resourceId, "--path=" + path} if desc, ok := params.Arguments["description"].(string); ok { args = append(args, "--description="+desc) @@ -2635,17 +2696,25 @@ func handleCallTool(params CallToolParams) CallToolResult { output, err = executeWbCommand(args) case "resource_check_access": - resourceId := params.Arguments["resourceId"].(string) + resourceId, reqErr := requireString(params.Arguments, "resourceId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } output, err = executeWbCommand([]string{"resource", "check-access", "--name=" + resourceId}) case "resource_move": - resourceId := params.Arguments["resourceId"].(string) - folderId := params.Arguments["folderId"].(string) - output, err = executeWbCommand([]string{"resource", "move", "--name=" + resourceId, "--folder-id=" + folderId}) + vals, reqErr := requireStrings(params.Arguments, "resourceId", "folderId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"resource", "move", "--name=" + vals[0], "--folder-id=" + vals[1]}) case "folder_create": - folderId := params.Arguments["folderId"].(string) - displayName := params.Arguments["displayName"].(string) + vals, reqErr := requireStrings(params.Arguments, "folderId", "displayName") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + folderId, displayName := vals[0], vals[1] args := []string{"folder", "create", "--id=" + folderId, "--display-name=" + displayName} if desc, ok := params.Arguments["description"].(string); ok { args = append(args, "--description="+desc) @@ -2656,11 +2725,17 @@ func handleCallTool(params CallToolParams) CallToolResult { output, err = executeWbCommand(args) case "folder_delete": - folderId := 
params.Arguments["folderId"].(string) + folderId, reqErr := requireString(params.Arguments, "folderId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } output, err = executeWbCommand([]string{"folder", "delete", "--id=" + folderId}) case "folder_update": - folderId := params.Arguments["folderId"].(string) + folderId, reqErr := requireString(params.Arguments, "folderId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } args := []string{"folder", "update", "--id=" + folderId} if displayName, ok := params.Arguments["displayName"].(string); ok { args = append(args, "--display-name="+displayName) @@ -2848,8 +2923,11 @@ func handleCallTool(params CallToolParams) CallToolResult { output = string(outputBytes) case "group_create": - groupId := params.Arguments["groupId"].(string) - name := params.Arguments["name"].(string) + vals, reqErr := requireStrings(params.Arguments, "groupId", "name") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + groupId, name := vals[0], vals[1] args := []string{"group", "create", "--id=" + groupId, "--name=" + name} if desc, ok := params.Arguments["description"].(string); ok { args = append(args, "--description="+desc) @@ -2857,30 +2935,42 @@ func handleCallTool(params CallToolParams) CallToolResult { output, err = executeWbCommand(args) case "group_delete": - groupId := params.Arguments["groupId"].(string) + groupId, reqErr := requireString(params.Arguments, "groupId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } output, err = executeWbCommand([]string{"group", "delete", "--id=" + groupId}) case "group_list": output, err = executeWbCommand([]string{"group", "list"}) case "group_describe": 
- groupId := params.Arguments["groupId"].(string) + groupId, reqErr := requireString(params.Arguments, "groupId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } output, err = executeWbCommand([]string{"group", "describe", "--id=" + groupId}) case "group_add_user": - groupId := params.Arguments["groupId"].(string) - email := params.Arguments["email"].(string) - role := params.Arguments["role"].(string) - output, err = executeWbCommand([]string{"group", "member", "add", "--group-id=" + groupId, "--email=" + email, "--role=" + role}) + vals, reqErr := requireStrings(params.Arguments, "groupId", "email", "role") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"group", "member", "add", "--group-id=" + vals[0], "--email=" + vals[1], "--role=" + vals[2]}) case "group_remove_user": - groupId := params.Arguments["groupId"].(string) - email := params.Arguments["email"].(string) - output, err = executeWbCommand([]string{"group", "member", "remove", "--group-id=" + groupId, "--email=" + email}) + vals, reqErr := requireStrings(params.Arguments, "groupId", "email") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"group", "member", "remove", "--group-id=" + vals[0], "--email=" + vals[1]}) case "app_create": - appId := params.Arguments["appId"].(string) - appConfig := params.Arguments["appConfig"].(string) + vals, reqErr := requireStrings(params.Arguments, "appId", "appConfig") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + appId, appConfig := vals[0], vals[1] args := []string{"app", "create", "gcp", "--id=" + appId, "--config=" + 
appConfig} if machineType, ok := params.Arguments["machineType"].(string); ok { args = append(args, "--machine-type="+machineType) @@ -2894,22 +2984,34 @@ func handleCallTool(params CallToolParams) CallToolResult { output, err = executeWbCommand(args) case "app_delete": - appId := params.Arguments["appId"].(string) + appId, reqErr := requireString(params.Arguments, "appId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } output, err = executeWbCommand([]string{"app", "delete", "--id=" + appId, "--quiet"}) case "app_list": output, err = executeWbCommand([]string{"app", "list"}) case "app_start": - appId := params.Arguments["appId"].(string) + appId, reqErr := requireString(params.Arguments, "appId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } output, err = executeWbCommand([]string{"app", "start", "--id=" + appId}) case "app_stop": - appId := params.Arguments["appId"].(string) + appId, reqErr := requireString(params.Arguments, "appId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } output, err = executeWbCommand([]string{"app", "stop", "--id=" + appId}) case "app_get_url": - appId := params.Arguments["appId"].(string) + appId, reqErr := requireString(params.Arguments, "appId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } output, err = executeWbCommand([]string{"app", "launch", "--id=" + appId}) case "auth_status": @@ -2919,47 +3021,61 @@ func handleCallTool(params CallToolParams) CallToolResult { output, err = executeWbCommand([]string{"server", "list"}) case "server_set": - serverName := params.Arguments["serverName"].(string) + serverName, reqErr := requireString(params.Arguments, "serverName") + if 
reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } output, err = executeWbCommand([]string{"server", "set", "--name=" + serverName}) case "server_status": output, err = executeWbCommand([]string{"server", "status"}) case "server_list_regions": - cloudPlatform := params.Arguments["cloudPlatform"].(string) + cloudPlatform, reqErr := requireString(params.Arguments, "cloudPlatform") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } output, err = executeWbCommand([]string{"server", "list-regions", "--platform=" + cloudPlatform}) case "pod_list": output, err = executeWbCommand([]string{"pod", "list"}) case "pod_describe": - podId := params.Arguments["podId"].(string) + podId, reqErr := requireString(params.Arguments, "podId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } output, err = executeWbCommand([]string{"pod", "describe", "--id=" + podId}) case "pod_role_list": - organizationId := params.Arguments["organizationId"].(string) - podId := params.Arguments["podId"].(string) - output, err = executeWbCommand([]string{"pod", "role", "list", "--organization=" + organizationId, "--pod=" + podId}) + vals, reqErr := requireStrings(params.Arguments, "organizationId", "podId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"pod", "role", "list", "--organization=" + vals[0], "--pod=" + vals[1]}) case "pod_role_grant": - organizationId := params.Arguments["organizationId"].(string) - podId := params.Arguments["podId"].(string) - email := params.Arguments["email"].(string) - role := params.Arguments["role"].(string) - output, err = executeWbCommand([]string{"pod", "role", "grant", "user", 
"--organization=" + organizationId, "--pod=" + podId, "--email=" + email, "--role=" + role}) + vals, reqErr := requireStrings(params.Arguments, "organizationId", "podId", "email", "role") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"pod", "role", "grant", "user", "--organization=" + vals[0], "--pod=" + vals[1], "--email=" + vals[2], "--role=" + vals[3]}) case "pod_role_revoke": - organizationId := params.Arguments["organizationId"].(string) - podId := params.Arguments["podId"].(string) - email := params.Arguments["email"].(string) - role := params.Arguments["role"].(string) - output, err = executeWbCommand([]string{"pod", "role", "revoke", "user", "--organization=" + organizationId, "--pod=" + podId, "--email=" + email, "--role=" + role}) + vals, reqErr := requireStrings(params.Arguments, "organizationId", "podId", "email", "role") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"pod", "role", "revoke", "user", "--organization=" + vals[0], "--pod=" + vals[1], "--email=" + vals[2], "--role=" + vals[3]}) case "organization_list": output, err = executeWbCommand([]string{"organization", "list"}) case "resource_credentials": - resourceId := params.Arguments["resourceId"].(string) + resourceId, reqErr := requireString(params.Arguments, "resourceId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } args := []string{"resource", "credentials", "--name=" + resourceId} if duration, ok := params.Arguments["duration"].(float64); ok { args = append(args, fmt.Sprintf("--duration=%d", int(duration))) @@ -2967,7 +3083,10 @@ func handleCallTool(params CallToolParams) CallToolResult { output, err = executeWbCommand(args) case 
"resource_open_console": - resourceId := params.Arguments["resourceId"].(string) + resourceId, reqErr := requireString(params.Arguments, "resourceId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } output, err = executeWbCommand([]string{"resource", "open-console", "--name=" + resourceId}) case "resource_list_tree": @@ -2980,38 +3099,60 @@ func handleCallTool(params CallToolParams) CallToolResult { output, err = executeWbCommand([]string{"resource", "unmount"}) case "notebook_start": - notebookId := params.Arguments["notebookId"].(string) + notebookId, reqErr := requireString(params.Arguments, "notebookId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } output, err = executeWbCommand([]string{"notebook", "start", "--id=" + notebookId}) case "notebook_stop": - notebookId := params.Arguments["notebookId"].(string) + notebookId, reqErr := requireString(params.Arguments, "notebookId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } output, err = executeWbCommand([]string{"notebook", "stop", "--id=" + notebookId}) case "notebook_launch": - notebookId := params.Arguments["notebookId"].(string) + notebookId, reqErr := requireString(params.Arguments, "notebookId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } output, err = executeWbCommand([]string{"notebook", "launch", "--id=" + notebookId}) case "cluster_start": - clusterId := params.Arguments["clusterId"].(string) + clusterId, reqErr := requireString(params.Arguments, "clusterId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } output, err = 
executeWbCommand([]string{"cluster", "start", "--id=" + clusterId}) case "cluster_stop": - clusterId := params.Arguments["clusterId"].(string) + clusterId, reqErr := requireString(params.Arguments, "clusterId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } output, err = executeWbCommand([]string{"cluster", "stop", "--id=" + clusterId}) case "cluster_launch": - clusterId := params.Arguments["clusterId"].(string) + clusterId, reqErr := requireString(params.Arguments, "clusterId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } output, err = executeWbCommand([]string{"cluster", "launch", "--id=" + clusterId}) case "workflow_list": - workspaceId := params.Arguments["workspaceId"].(string) + workspaceId, reqErr := requireString(params.Arguments, "workspaceId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } output, err = executeWbCommand([]string{"workflow", "list", "--workspace=" + workspaceId}) case "workflow_create": - workspaceId := params.Arguments["workspaceId"].(string) - workflowId := params.Arguments["workflowId"].(string) - bucketId := params.Arguments["bucketId"].(string) - path := params.Arguments["path"].(string) + vals, reqErr := requireStrings(params.Arguments, "workspaceId", "workflowId", "bucketId", "path") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + workspaceId, workflowId, bucketId, path := vals[0], vals[1], vals[2], vals[3] args := []string{"workflow", "create", "--workspace=" + workspaceId, "--workflow=" + workflowId, "--bucket-id=" + bucketId, "--path=" + path} if displayName, ok := params.Arguments["displayName"].(string); ok { args = append(args, "--display-name="+displayName) @@ 
-3022,22 +3163,28 @@ func handleCallTool(params CallToolParams) CallToolResult { output, err = executeWbCommand(args) case "workflow_describe": - workspaceId := params.Arguments["workspaceId"].(string) - workflowId := params.Arguments["workflowId"].(string) - output, err = executeWbCommand([]string{"workflow", "describe", "--workspace=" + workspaceId, "--workflow=" + workflowId}) + vals, reqErr := requireStrings(params.Arguments, "workspaceId", "workflowId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"workflow", "describe", "--workspace=" + vals[0], "--workflow=" + vals[1]}) case "workflow_job_list": output, err = executeWbCommand([]string{"workflow", "job", "list"}) case "workflow_job_describe": - workspaceId := params.Arguments["workspaceId"].(string) - jobId := params.Arguments["jobId"].(string) - output, err = executeWbCommand([]string{"workflow", "job", "describe", "--workspace=" + workspaceId, "--job-id=" + jobId}) + vals, reqErr := requireStrings(params.Arguments, "workspaceId", "jobId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"workflow", "job", "describe", "--workspace=" + vals[0], "--job-id=" + vals[1]}) case "workflow_job_run": - workspaceId := params.Arguments["workspaceId"].(string) - workflowId := params.Arguments["workflowId"].(string) - outputBucketId := params.Arguments["outputBucketId"].(string) + vals, reqErr := requireStrings(params.Arguments, "workspaceId", "workflowId", "outputBucketId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + workspaceId, workflowId, outputBucketId := vals[0], vals[1], vals[2] args := []string{"workflow", "job", "run", "--workspace=" + workspaceId, 
"--workflow=" + workflowId, "--output-bucket-id=" + outputBucketId} if jobId, ok := params.Arguments["jobId"].(string); ok { args = append(args, "--job-id="+jobId) @@ -3055,39 +3202,62 @@ func handleCallTool(params CallToolParams) CallToolResult { output, err = executeWbCommand(args) case "workflow_job_cancel": - workspaceId := params.Arguments["workspaceId"].(string) - jobId := params.Arguments["jobId"].(string) - output, err = executeWbCommand([]string{"workflow", "job", "cancel", "--workspace=" + workspaceId, "--job-id=" + jobId}) + vals, reqErr := requireStrings(params.Arguments, "workspaceId", "jobId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } + output, err = executeWbCommand([]string{"workflow", "job", "cancel", "--workspace=" + vals[0], "--job-id=" + vals[1]}) case "cromwell_generate_config": - path := params.Arguments["path"].(string) + path, reqErr := requireString(params.Arguments, "path") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } output, err = executeWbCommand([]string{"cromwell", "generate-config", "--path=" + path}) case "workspace_configure_aws": - workspaceId := params.Arguments["workspaceId"].(string) + workspaceId, reqErr := requireString(params.Arguments, "workspaceId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } output, err = executeWbCommand([]string{"workspace", "configure-aws", "--workspace=" + workspaceId}) case "resolve": - resourceId := params.Arguments["resourceId"].(string) + resourceId, reqErr := requireString(params.Arguments, "resourceId") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } output, err = executeWbCommand([]string{"resolve", "--name=" + resourceId}) case 
"version": output, err = executeWbCommand([]string{"version"}) case "bq_execute": - command := params.Arguments["command"].(string) + command, reqErr := requireString(params.Arguments, "command") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } output, err = executeWbCommand(append([]string{"bq"}, strings.Fields(command)...)) case "gcloud_execute": - command := params.Arguments["command"].(string) + command, reqErr := requireString(params.Arguments, "command") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } output, err = executeWbCommand(append([]string{"gcloud"}, strings.Fields(command)...)) case "gsutil_execute": - command := params.Arguments["command"].(string) + command, reqErr := requireString(params.Arguments, "command") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } output, err = executeWbCommand(append([]string{"gsutil"}, strings.Fields(command)...)) case "git_execute": - command := params.Arguments["command"].(string) + command, reqErr := requireString(params.Arguments, "command") + if reqErr != nil { + return CallToolResult{Content: []ContentItem{{Type: "text", Text: "Error: " + reqErr.Error()}}, IsError: true} + } output, err = executeWbCommand(append([]string{"git"}, strings.Fields(command)...)) default: diff --git a/src/workbench-jupyter-with-llm/README.md b/src/workbench-jupyter-with-llm/README.md index 0823620e6..31b8b6272 100644 --- a/src/workbench-jupyter-with-llm/README.md +++ b/src/workbench-jupyter-with-llm/README.md @@ -9,8 +9,6 @@ Workbench JupyterLab with integrated AI assistance through Gemini CLI, Claude CL |-----|-----|-----|-----| | cloud | VM cloud environment | string | gcp | | login | Whether to log in to workbench CLI | string | false | -| containerImage | The container 
image to use | string | debian:bullseye | -| containerPort | The port to expose the container on | number | 8888 | From 77da25e61f0cb11966546ab7111df57beea0667b Mon Sep 17 00:00:00 2001 From: David Shen Date: Wed, 13 May 2026 17:00:18 -0400 Subject: [PATCH 78/86] vscode-with-llm --- src/vscode-with-llm/.devcontainer.json | 71 +++++++++++++++++++ src/vscode-with-llm/Dockerfile | 28 ++++++++ src/vscode-with-llm/README.md | 17 +++++ .../devcontainer-template.json | 23 ++++++ src/vscode-with-llm/docker-compose.yaml | 30 ++++++++ src/vscode-with-llm/sudo-passwordless.sh | 34 +++++++++ 6 files changed, 203 insertions(+) create mode 100644 src/vscode-with-llm/.devcontainer.json create mode 100644 src/vscode-with-llm/Dockerfile create mode 100644 src/vscode-with-llm/README.md create mode 100644 src/vscode-with-llm/devcontainer-template.json create mode 100644 src/vscode-with-llm/docker-compose.yaml create mode 100755 src/vscode-with-llm/sudo-passwordless.sh diff --git a/src/vscode-with-llm/.devcontainer.json b/src/vscode-with-llm/.devcontainer.json new file mode 100644 index 000000000..b4d50a4b8 --- /dev/null +++ b/src/vscode-with-llm/.devcontainer.json @@ -0,0 +1,71 @@ +{ + "name": "vscode with LLM tools", + "dockerComposeFile": "docker-compose.yaml", + "service": "app", + "shutdownAction": "none", + "workspaceFolder": "/workspace", + "postCreateCommand": + "./startupscript/post-startup.sh abc /config \"${templateOption:cloud}\" \"${templateOption:login}\"; ./sudo-passwordless.sh abc", + // re-mount bucket files on container start up, then generate LLM context + "postStartCommand": "./startupscript/remount-on-restart.sh abc /config \"${templateOption:cloud}\" \"${templateOption:login}\"; /opt/llm-context/generate-context.sh /config || true", + "features": { + "ghcr.io/devcontainers/features/java@sha256:9663ce0219ff85786e87901ce5f0a59f488edd5f99b46015192cda48468b233a": { + "version": "17" + }, + 
"ghcr.io/devcontainers/features/node@sha256:8c0de46939b61958041700ee89e3493f3b2e4131a06dc46b4d9423427d06e5f6": { + "version": "24.11.0" + }, + "ghcr.io/devcontainers/features/aws-cli@sha256:1f93c8315b7a6d76982ebb2269f8b0d50413fc0f965c032edf4aee0caceb73ef": {}, + "ghcr.io/dhoeric/features/google-cloud-cli@sha256:fa5d894718825c5ad8009ac8f2c9f0cea3d1661eb108a9d465cba9f3fc48965f": {}, + "ghcr.io/anthropics/devcontainer-features/claude-code@sha256:cfc2e7d3e9fd3b9b01f8d5cb158508a884c8c0ede2e23ed10f32dea5d4ffe69a": {}, + "./.devcontainer/features/gemini-cli": { "username": "abc" }, + "./.devcontainer/features/workbench-tools": { + "cloud": "${templateOption:cloud}", + "username": "abc", + "userHomeDir": "/config" + }, + "./.devcontainer/features/postgres-client": { + "version": "16" + }, + "./.devcontainer/features/wb-mcp-server": { + "username": "abc", + "userHomeDir": "/config" + }, + "./.devcontainer/features/llm-context": { + "username": "abc", + "userHomeDir": "/config" + } + }, + "remoteUser": "root", + "customizations": { + "workbench": { + "opens": { + "extensions": [ + // Source. 
+ ".c", + ".cjs", + ".cpp", + ".go", + ".java", + ".js", + ".mjs", + ".php", + ".scala", + ".sh", + ".ts", + // Documents + ".md", + ".html", + // Data + ".csv", + ".json", + ".jsonc", + ".tsv", + ".xml", + ".yml" + ], + "fileUrlSuffix": "?payload=[[\"openFile\",\"vscode-remote:///config/{path}\"]]" + } + } + } +} diff --git a/src/vscode-with-llm/Dockerfile b/src/vscode-with-llm/Dockerfile new file mode 100644 index 000000000..eed456d8a --- /dev/null +++ b/src/vscode-with-llm/Dockerfile @@ -0,0 +1,28 @@ +FROM lscr.io/linuxserver/code-server@sha256:7bd334657f13505abc1e20afeeee5670ad8f818e68853c810889184e597f3051 + +# Gemini: https://open-vsx.org/extension/Google/geminicodeassist +# Claude: https://open-vsx.org/extension/Anthropic/claude-code +RUN apt-get update \ + && apt-get install -y --no-install-recommends jq \ + && rm -rf /var/lib/apt/lists/* \ + && mkdir -p /config/extensions \ + && chown abc:abc /config /config/extensions + +USER abc +ENV HOME=/config + +RUN curl -fsSL 'https://open-vsx.org/api/Google/geminicodeassist/2.79.0' \ + | jq -r '.files.download' \ + | xargs curl -fL --compressed -o /tmp/geminicodeassist.vsix \ + && curl -fsSL 'https://open-vsx.org/api/Anthropic/claude-code/linux-x64/2.1.128' \ + | jq -r '.files.download' \ + | xargs curl -fL --compressed -o /tmp/claudecode.vsix \ + && /app/code-server/bin/code-server --extensions-dir /config/extensions --install-extension /tmp/geminicodeassist.vsix \ + && /app/code-server/bin/code-server --extensions-dir /config/extensions --install-extension /tmp/claudecode.vsix \ + && rm /tmp/geminicodeassist.vsix /tmp/claudecode.vsix \ + && mkdir -p /config/data/User \ + && echo '{"http.systemCertificatesNode":true}' > /config/data/User/settings.json + +USER root + +WORKDIR /config diff --git a/src/vscode-with-llm/README.md b/src/vscode-with-llm/README.md new file mode 100644 index 000000000..b8403896e --- /dev/null +++ b/src/vscode-with-llm/README.md @@ -0,0 +1,17 @@ + +# Vscode (vscode) + +A Template to run 
vscode with LLM tools on workbench + +## Options + +| Options Id | Description | Type | Default Value | +|-----|-----|-----|-----| +| cloud | VM cloud environment | string | gcp | +| login | Whether to log in to workbench CLI | string | false | + + + +--- + +_Note: This file was auto-generated from the [devcontainer-template.json](https://github.com/verily-src/workbench-app-devcontainers/blob/main/src/vscode-with-llm/devcontainer-template.json). Add additional notes to a `NOTES.md`._ diff --git a/src/vscode-with-llm/devcontainer-template.json b/src/vscode-with-llm/devcontainer-template.json new file mode 100644 index 000000000..233ce2c34 --- /dev/null +++ b/src/vscode-with-llm/devcontainer-template.json @@ -0,0 +1,23 @@ +{ + "id": "vscode-with-llm", + "version": "0.0.1", + "name": "Vscode with LLM tools", + "description": "A Template to run vscode with LLM tools on workbench", + "documentationURL": "https://github.com/verily-src/workbench-app-devcontainers/tree/master/src/vscode-with-llm", + "licenseURL": "https://github.com/verily-src/workbench-app-devcontainers/blob/master/LICENSE", + "options": { + "cloud": { + "type": "string", + "description": "VM cloud environment", + "proposals": ["gcp", "aws"], + "default": "gcp" + }, + "login": { + "type": "string", + "description": "Whether to log in to workbench CLI", + "proposals": ["true", "false"], + "default": "false" + } + }, + "platforms": ["Any"] +} diff --git a/src/vscode-with-llm/docker-compose.yaml b/src/vscode-with-llm/docker-compose.yaml new file mode 100644 index 000000000..01f6af644 --- /dev/null +++ b/src/vscode-with-llm/docker-compose.yaml @@ -0,0 +1,30 @@ +version: "2.4" +services: + app: + container_name: "application-server" + build: + context: . 
+ dockerfile: Dockerfile + restart: always + volumes: + - .:/workspace:cached + - work:/home/vscode:cached + ports: + - "8443:8443" + environment: + USER: "abc" + DEFAULT_WORKSPACE: "/config" + SUDO_PASSWORD: "pwd" + networks: + - app-network + cap_add: + - SYS_ADMIN + devices: + - /dev/fuse + security_opt: + - apparmor:unconfined +networks: + app-network: + external: true +volumes: + work: diff --git a/src/vscode-with-llm/sudo-passwordless.sh b/src/vscode-with-llm/sudo-passwordless.sh new file mode 100755 index 000000000..d14bd0a17 --- /dev/null +++ b/src/vscode-with-llm/sudo-passwordless.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +# This script is used to set up passwordless sudo for the core user on the VM. +# It must be run with root privileges and takes the user name as its first argument. +# It is typically called from post-startup.sh. + +USER_NAME="${1}" + +if [[ -z "${USER_NAME}" ]]; then + echo "Usage: $0 " + exit 1 +fi + +sudoers_file="/etc/sudoers" +sudoers_d_file="/etc/sudoers.d/${USER_NAME}" + +# Make sure user exists +if ! id "${USER_NAME}" &>/dev/null; then + echo "User ${USER_NAME} does not exist." + exit 1 +fi + +# Check if there's an old rule in the main sudoers file that requires a password +if grep -q "^${USER_NAME} ALL=(ALL:ALL) ALL" "${sudoers_file}"; then + echo "Found password-requiring rule for ${USER_NAME} in /etc/sudoers. Commenting it out." + + # Comment out the old rule in /etc/sudoers + sed -i "s/^${USER_NAME} ALL=(ALL:ALL) ALL/# ${USER_NAME} ALL=(ALL:ALL) ALL/" "${sudoers_file}" +fi + +echo "${USER_NAME} ALL=(ALL) NOPASSWD:ALL" > "${sudoers_d_file}" +chmod 440 "${sudoers_d_file}" + +echo "User ${USER_NAME} has been given passwordless sudo access." 
From 1571e53c8b9d924ff049fcf80f844e5fd1aaddae Mon Sep 17 00:00:00 2001 From: David Shen Date: Thu, 14 May 2026 10:23:06 -0400 Subject: [PATCH 79/86] Reference local templates directory --- features/src/llm-context/install.sh | 9 ++++ .../src/llm-context/skills/APP_TEMPLATES.md | 41 +++++++++---------- 2 files changed, 28 insertions(+), 22 deletions(-) diff --git a/features/src/llm-context/install.sh b/features/src/llm-context/install.sh index 5acb30757..1684b708f 100644 --- a/features/src/llm-context/install.sh +++ b/features/src/llm-context/install.sh @@ -90,6 +90,15 @@ else echo "Warning: skills directory not found in ${FEATURE_DIR}" fi +# Copy app templates to installation directory +if [[ -d "${FEATURE_DIR}/templates" ]]; then + mkdir -p "${LLM_CONTEXT_DIR}/templates" + cp -r "${FEATURE_DIR}/templates/." "${LLM_CONTEXT_DIR}/templates/" + echo "Copied app templates to ${LLM_CONTEXT_DIR}/templates" +else + echo "Warning: templates directory not found in ${FEATURE_DIR}" +fi + # Create a wrapper script that runs with proper user context cat > "${LLM_CONTEXT_DIR}/run-context-generator.sh" << WRAPPER_EOF #!/bin/bash diff --git a/features/src/llm-context/skills/APP_TEMPLATES.md b/features/src/llm-context/skills/APP_TEMPLATES.md index 00de407eb..8e3cb7e66 100644 --- a/features/src/llm-context/skills/APP_TEMPLATES.md +++ b/features/src/llm-context/skills/APP_TEMPLATES.md @@ -47,41 +47,38 @@ ## Template Locations -The official app repository contains reference implementations and examples: +All templates are bundled locally at `/opt/llm-context/templates/`: ``` -https://github.com/verily-src/workbench-app-devcontainers/tree/master/src/ +/opt/llm-context/templates/ +├── flask-api/ +├── streamlit-dashboard/ +├── rshiny-dashboard/ +├── file-processor/ +└── README.md ``` -Good starting points: -- [`example`](https://github.com/verily-src/workbench-app-devcontainers/tree/master/src/example) — Minimal standalone app (ttyd terminal) -- 
[`workbench-vscode`](https://github.com/verily-src/workbench-app-devcontainers/tree/master/src/workbench-vscode) — Full-featured VS Code Server - -Each app contains: +Each template contains: +- `manifest.yaml` - Capabilities and inputs - `.devcontainer.json` - Devcontainer config - `docker-compose.yaml` - Container setup - `Dockerfile` - Build instructions -- `devcontainer-template.json` - Template metadata -- Application code +- `app/` - Application code +- `README.md` - Documentation --- ## How to Use a Template -### Recommended: Fork and Customize - -The official repo (`verily-src/workbench-app-devcontainers`) is a curated collection of common/default apps. **Create a fork** for your custom app rather than submitting a PR to the org repo: +### Option 1: Deploy Directly -1. Fork https://github.com/verily-src/workbench-app-devcontainers -2. Copy an existing app folder (e.g., `src/example`) to `src/my-app` -3. Modify application code -4. Update `devcontainer-template.json` with new name/description -5. Push to your fork -6. Deploy from your fork's repo URL +Read the template files from `/opt/llm-context/templates/<template-name>/` and copy them into the user's repository to deploy. -### Alternative: Standalone Repo -1. Copy the template files to a new repository -2. Ensure `.devcontainer.json` is at the repo root -3. Push to GitHub and deploy from your repo +### Option 2: Copy and Customize +1. Copy the template folder to the user's repo +2. Modify application code in `app/` +3. Update `devcontainer-template.json` with new name/description +4. Push to GitHub +5. Deploy from the user's repo > ⚠️ Volume mounts (`volumes: .:/workspace`) are for local dev only. In production, Workbench builds the image — code must be baked in via `COPY` in the Dockerfile. Do not rely on volume mounts for deployed apps. 
From fd1595ddaae03b10f39bd026554650f8705b5451 Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Thu, 14 May 2026 10:50:53 -0400 Subject: [PATCH 80/86] fix(mcp): resolve workspace UUID correctly at startup using userFacingId ws[id] from wb status is the userFacingId (e.g. test-1), not the UUID. Call resolveWorkspaceId once at startup after workspaceBaseURL is set, cache the resulting UUID, and reuse it in workspace_list_data_collections. Co-authored-by: Cursor --- features/src/wb-mcp-server/main.go | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/features/src/wb-mcp-server/main.go b/features/src/wb-mcp-server/main.go index 00763a420..17762ad20 100644 --- a/features/src/wb-mcp-server/main.go +++ b/features/src/wb-mcp-server/main.go @@ -1595,11 +1595,20 @@ func initializeConfig() error { } else { fmt.Fprintf(os.Stderr, "Warning: server info not found in wb status, using default URLs\n") } - // Cache the current workspace UUID to avoid resolveWorkspaceId calls at runtime + // Cache the workspace UUID at startup to avoid resolveWorkspaceId calls at runtime. + // ws["id"] from wb status is the userFacingId, not the UUID — resolve it once here. 
if ws, ok := status["workspace"].(map[string]interface{}); ok { - if uuid, ok := ws["id"].(string); ok && uuid != "" { - cachedWorkspaceUUID = uuid - fmt.Fprintf(os.Stderr, "Cached workspace UUID: %s\n", uuid) + userFacingId, _ := ws["userFacingId"].(string) + if userFacingId == "" { + userFacingId, _ = ws["id"].(string) + } + if userFacingId != "" { + if uuid, resolveErr := resolveWorkspaceId(userFacingId); resolveErr == nil { + cachedWorkspaceUUID = uuid + fmt.Fprintf(os.Stderr, "Cached workspace UUID: %s\n", uuid) + } else { + fmt.Fprintf(os.Stderr, "Warning: could not resolve workspace UUID at startup: %v\n", resolveErr) + } } } } From c41a6cea10ce21f21421195fd1b94df438522dbb Mon Sep 17 00:00:00 2001 From: aculotti-verily Date: Thu, 14 May 2026 11:58:17 -0400 Subject: [PATCH 81/86] fix: robust 3-layer workspace UUID resolution for workspace_list_data_collections MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root causes of repeated failures: 1. UUID cached only at startup — any timing issue made it permanently empty 2. resolveWorkspaceId fetched all 5000 workspaces — slow and failure-prone 3. No recovery path if startup resolution failed New getCurrentWorkspaceUUID() — called lazily at each tool invocation: Layer 1: return cachedWorkspaceUUID if already set (instant) Layer 2: wb workspace describe --format=json — direct CLI call, checks for uuid field, or id if it is UUID format, or userFacingId Layer 3: wb status --format=json for userFacingId fallback, then workspace list with limit=100 first (avoids 5000-item scan for most users), expanding to 5000 only if not found in first page resolveWorkspaceId() kept (with same small-page-first improvement) for tools that resolve user-supplied workspaceId parameters. workspace_list_data_collections now returns a readable guidance message instead of a hard error so Claude can relay fix instructions to the user. 
Also fixed: UUID cache call in initializeConfig was inside the wrong else-branch (server-not-found), now correctly at the outer level. Co-authored-by: Cursor --- features/src/wb-mcp-server/main.go | 203 +++++++++++++++++++++++------ 1 file changed, 161 insertions(+), 42 deletions(-) diff --git a/features/src/wb-mcp-server/main.go b/features/src/wb-mcp-server/main.go index 17762ad20..0abd5ce9d 100644 --- a/features/src/wb-mcp-server/main.go +++ b/features/src/wb-mcp-server/main.go @@ -1595,21 +1595,10 @@ func initializeConfig() error { } else { fmt.Fprintf(os.Stderr, "Warning: server info not found in wb status, using default URLs\n") } - // Cache the workspace UUID at startup to avoid resolveWorkspaceId calls at runtime. - // ws["id"] from wb status is the userFacingId, not the UUID — resolve it once here. - if ws, ok := status["workspace"].(map[string]interface{}); ok { - userFacingId, _ := ws["userFacingId"].(string) - if userFacingId == "" { - userFacingId, _ = ws["id"].(string) - } - if userFacingId != "" { - if uuid, resolveErr := resolveWorkspaceId(userFacingId); resolveErr == nil { - cachedWorkspaceUUID = uuid - fmt.Fprintf(os.Stderr, "Cached workspace UUID: %s\n", uuid) - } else { - fmt.Fprintf(os.Stderr, "Warning: could not resolve workspace UUID at startup: %v\n", resolveErr) - } - } + // Best-effort workspace UUID cache at startup. If this fails (e.g. auth not + // ready yet), getCurrentWorkspaceUUID() will retry lazily at call time. 
+ if _, startupErr := getCurrentWorkspaceUUID(); startupErr != nil { + fmt.Fprintf(os.Stderr, "Warning: could not resolve workspace UUID at startup (will retry on first use): %v\n", startupErr) } } } @@ -1626,41 +1615,171 @@ func initializeConfig() error { return nil } -func getToken() (string, error) { - cmd := exec.Command("wb", "auth", "print-access-token") - output, err := cmd.CombinedOutput() - if err != nil { - return "", fmt.Errorf("failed to get access token: %v", err) +// resolveWorkspaceId resolves an arbitrary user-facing workspace ID to its UUID +// by searching the full workspace list. Used by tools that accept an explicit +// workspaceId parameter. For the CURRENT workspace, use getCurrentWorkspaceUUID(). +func resolveWorkspaceId(workspaceId string) (string, error) { + if isUUID(workspaceId) { + return workspaceId, nil // already a UUID } - return strings.TrimSpace(string(output)), nil + for _, limit := range []int{100, 5000} { + listUrl := fmt.Sprintf("%s/api/workspaces/v1?offset=0&limit=%d", workspaceBaseURL, limit) + listResp, apiErr := makeAPIRequest("GET", listUrl, nil) + if apiErr != nil { + continue + } + var listData map[string]interface{} + if json.Unmarshal(listResp, &listData) != nil { + continue + } + workspaces, _ := listData["workspaces"].([]interface{}) + for _, ws := range workspaces { + wsMap, ok := ws.(map[string]interface{}) + if !ok { + continue + } + ufid, _ := wsMap["userFacingId"].(string) + id, _ := wsMap["id"].(string) + if ufid == workspaceId || id == workspaceId { + return id, nil + } + } + } + return "", fmt.Errorf("workspace '%s' not found", workspaceId) } -func resolveWorkspaceId(workspaceId string) (string, error) { - listUrl := fmt.Sprintf("%s/api/workspaces/v1?offset=0&limit=5000", workspaceBaseURL) - listResp, apiErr := makeAPIRequest("GET", listUrl, nil) - if apiErr != nil { - return "", fmt.Errorf("failed to list workspaces: %w", apiErr) +// isUUID returns true if s looks like a UUID (8-4-4-4-12 hex format). 
+func isUUID(s string) bool { + if len(s) != 36 { + return false + } + for i, c := range s { + if i == 8 || i == 13 || i == 18 || i == 23 { + if c != '-' { + return false + } + } else if !((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')) { + return false + } + } + return true +} + +// getCurrentWorkspaceUUID returns the UUID of the currently active workspace. +// It uses a three-layer strategy so that temporary failures (auth not ready, +// server startup race) do not permanently break workspace-scoped tools: +// +// 1. Return the cached UUID if already resolved. +// 2. Call `wb workspace describe --format=json` — fast, no list traversal needed. +// If the response contains a `uuid` field, use it directly. +// If not, extract the `id` / `userFacingId` and proceed to layer 3. +// 3. Search the workspace list with a small page (100) first, then full (5000), +// using the userFacingId obtained from layer 2. +// +// The result is cached so subsequent calls within the same server session are instant. +func getCurrentWorkspaceUUID() (string, error) { + if cachedWorkspaceUUID != "" { + return cachedWorkspaceUUID, nil + } + + // Layer 1: wb workspace describe — most direct path. + userFacingId := "" + cmd := exec.Command("wb", "workspace", "describe", "--format=json") + if out, err := cmd.CombinedOutput(); err == nil { + var desc map[string]interface{} + if json.Unmarshal(out, &desc) == nil { + // Some Workbench versions return uuid directly. + if uuid, ok := desc["uuid"].(string); ok && isUUID(uuid) { + cachedWorkspaceUUID = uuid + fmt.Fprintf(os.Stderr, "Resolved workspace UUID from describe: %s\n", uuid) + return uuid, nil + } + // id may be the UUID on some versions, or userFacingId on others. 
+ if id, ok := desc["id"].(string); ok { + if isUUID(id) { + cachedWorkspaceUUID = id + fmt.Fprintf(os.Stderr, "Resolved workspace UUID from describe.id: %s\n", id) + return id, nil + } + userFacingId = id + } + // Explicit userFacingId field takes precedence if present. + if ufid, ok := desc["userFacingId"].(string); ok && ufid != "" { + userFacingId = ufid + } + } } - var listData map[string]interface{} - if err := json.Unmarshal(listResp, &listData); err != nil { - return "", fmt.Errorf("error parsing workspace list: %v", err) + + // Layer 2: fall back to wb status for userFacingId if describe didn't give it. + if userFacingId == "" { + cmd2 := exec.Command("wb", "status", "--format=json") + if out, err := cmd2.CombinedOutput(); err == nil { + var status map[string]interface{} + if json.Unmarshal(out, &status) == nil { + if ws, ok := status["workspace"].(map[string]interface{}); ok { + if ufid, ok := ws["userFacingId"].(string); ok && ufid != "" { + userFacingId = ufid + } else if id, ok := ws["id"].(string); ok { + if isUUID(id) { + cachedWorkspaceUUID = id + return id, nil + } + userFacingId = id + } + } + } + } } - workspaces, ok := listData["workspaces"].([]interface{}) - if !ok { - return "", fmt.Errorf("workspaces not found in list response") + + if userFacingId == "" { + return "", fmt.Errorf("no active workspace found — run `wb workspace set --id=` first") } - for _, ws := range workspaces { - wsMap, ok := ws.(map[string]interface{}) + + // Layer 3: resolve userFacingId → UUID via workspace list. + // Try a small page first to avoid fetching 5,000 workspaces for common cases. 
+ for _, limit := range []int{100, 5000} { + listUrl := fmt.Sprintf("%s/api/workspaces/v1?offset=0&limit=%d", workspaceBaseURL, limit) + listResp, apiErr := makeAPIRequest("GET", listUrl, nil) + if apiErr != nil { + continue + } + var listData map[string]interface{} + if json.Unmarshal(listResp, &listData) != nil { + continue + } + workspaces, ok := listData["workspaces"].([]interface{}) if !ok { continue } - if wsMap["userFacingId"].(string) == workspaceId || wsMap["id"].(string) == workspaceId { - return wsMap["id"].(string), nil + for _, w := range workspaces { + wsMap, ok := w.(map[string]interface{}) + if !ok { + continue + } + ufid, _ := wsMap["userFacingId"].(string) + id, _ := wsMap["id"].(string) + if ufid == userFacingId || id == userFacingId { + // id in the workspace list API is always the UUID. + cachedWorkspaceUUID = id + fmt.Fprintf(os.Stderr, "Resolved workspace UUID from list (limit=%d): %s\n", limit, id) + return id, nil + } } } - return "", fmt.Errorf("workspace '%s' not found", workspaceId) + + return "", fmt.Errorf("workspace '%s' not found in accessible workspaces", userFacingId) +} + +func getToken() (string, error) { + cmd := exec.Command("wb", "auth", "print-access-token") + output, err := cmd.CombinedOutput() + if err != nil { + return "", fmt.Errorf("failed to get access token: %v", err) + } + return strings.TrimSpace(string(output)), nil } + func makeAPIRequest(method, url string, body interface{}) ([]byte, error) { token, err := getToken() if err != nil { @@ -2691,13 +2810,13 @@ func handleCallTool(params CallToolParams) CallToolResult { output, err = executeWbCommand([]string{"folder", "tree"}) case "workspace_list_data_collections": - // Use the workspace UUID cached at startup — avoids calling wb status and - // resolveWorkspaceId (which fetches all 5000 workspaces) on every tool call. 
- if cachedWorkspaceUUID == "" { - err = fmt.Errorf("workspace UUID not available — MCP server may not have a workspace set at startup") + var workspaceUuid string + var uuidErr error + workspaceUuid, uuidErr = getCurrentWorkspaceUUID() + if uuidErr != nil { + output = fmt.Sprintf("Could not determine active workspace: %v\n\nTo fix: run `wb workspace set --id=` in your terminal, then retry.", uuidErr) break } - workspaceUuid := cachedWorkspaceUUID // List all resources (same API call as workspace_list_resources which works) resourcesUrl := fmt.Sprintf("%s/api/workspaces/v1/%s/resources?offset=0&limit=1000", workspaceBaseURL, workspaceUuid) From 3fc9cd6de623de214dda8b582f914c197571ad22 Mon Sep 17 00:00:00 2001 From: David Shen Date: Thu, 14 May 2026 13:24:38 -0400 Subject: [PATCH 82/86] Remove unused variable --- features/src/llm-context/install.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/features/src/llm-context/install.sh b/features/src/llm-context/install.sh index 1684b708f..a9ea0102e 100644 --- a/features/src/llm-context/install.sh +++ b/features/src/llm-context/install.sh @@ -127,7 +127,6 @@ chmod +x "${LLM_CONTEXT_DIR}/run-context-generator.sh" # Set ownership chown -R "${USERNAME}:" "${LLM_CONTEXT_DIR}" 2>/dev/null || true -chown -R "${USERNAME}:" "${USER_WORKBENCH_DIR}" 2>/dev/null || true # Add aliases and environment to bashrc (idempotent) if ! 
grep -q "# LLM Context Generator" "${USER_HOME_DIR}/.bashrc" 2>/dev/null; then From e3156867a328ab81803779bd7b227df9c8b4e0aa Mon Sep 17 00:00:00 2001 From: David Shen Date: Thu, 14 May 2026 16:55:03 -0400 Subject: [PATCH 83/86] shellcheck --- .../templates/streamlit-dashboard/startupscript/post-startup.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/features/src/llm-context/templates/streamlit-dashboard/startupscript/post-startup.sh b/features/src/llm-context/templates/streamlit-dashboard/startupscript/post-startup.sh index 9ec9e1b35..7376dbedd 100755 --- a/features/src/llm-context/templates/streamlit-dashboard/startupscript/post-startup.sh +++ b/features/src/llm-context/templates/streamlit-dashboard/startupscript/post-startup.sh @@ -6,7 +6,7 @@ set -o pipefail set -o xtrace echo "=== POST-STARTUP.SH STARTING ===" -echo "Arguments: $@" +echo "Arguments: $*" if [[ $# -ne 4 ]]; then echo "Usage: $0 user workDirectory " From bc56c3d5a8889041bb1d313e087ae120f1df1cdc Mon Sep 17 00:00:00 2001 From: David Shen Date: Fri, 15 May 2026 07:30:07 -0400 Subject: [PATCH 84/86] Use run-context-generator.sh instead of generate-context.sh to wait for wb to be ready --- src/vscode-with-llm/.devcontainer.json | 2 +- src/workbench-jupyter-with-llm/.devcontainer.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/vscode-with-llm/.devcontainer.json b/src/vscode-with-llm/.devcontainer.json index b4d50a4b8..aac38e47f 100644 --- a/src/vscode-with-llm/.devcontainer.json +++ b/src/vscode-with-llm/.devcontainer.json @@ -7,7 +7,7 @@ "postCreateCommand": "./startupscript/post-startup.sh abc /config \"${templateOption:cloud}\" \"${templateOption:login}\"; ./sudo-passwordless.sh abc", // re-mount bucket files on container start up, then generate LLM context - "postStartCommand": "./startupscript/remount-on-restart.sh abc /config \"${templateOption:cloud}\" \"${templateOption:login}\"; /opt/llm-context/generate-context.sh /config || true", + 
"postStartCommand": "./startupscript/remount-on-restart.sh abc /config \"${templateOption:cloud}\" \"${templateOption:login}\"; /opt/llm-context/run-context-generator.sh /config || true", "features": { "ghcr.io/devcontainers/features/java@sha256:9663ce0219ff85786e87901ce5f0a59f488edd5f99b46015192cda48468b233a": { "version": "17" diff --git a/src/workbench-jupyter-with-llm/.devcontainer.json b/src/workbench-jupyter-with-llm/.devcontainer.json index 66e653a28..9502d3ce4 100644 --- a/src/workbench-jupyter-with-llm/.devcontainer.json +++ b/src/workbench-jupyter-with-llm/.devcontainer.json @@ -13,7 +13,7 @@ "${templateOption:login}" ], // re-mount bucket files on container start up, then generate LLM context - "postStartCommand": "./startupscript/remount-on-restart.sh jupyter /home/jupyter \"${templateOption:cloud}\" \"${templateOption:login}\"; /opt/llm-context/generate-context.sh /home/jupyter || true", + "postStartCommand": "./startupscript/remount-on-restart.sh jupyter /home/jupyter \"${templateOption:cloud}\" \"${templateOption:login}\"; /opt/llm-context/run-context-generator.sh /home/jupyter || true", "features": { "ghcr.io/devcontainers/features/node@sha256:8c0de46939b61958041700ee89e3493f3b2e4131a06dc46b4d9423427d06e5f6": { "version": "24.11.0" From f0075750b8a16d611d8719294aefe193a96c082e Mon Sep 17 00:00:00 2001 From: David Shen Date: Fri, 15 May 2026 07:54:15 -0400 Subject: [PATCH 85/86] Fix MCP server installation --- features/src/llm-context/generate-context.sh | 5 +- features/src/llm-context/install.sh | 2 +- .../wb-mcp-server/devcontainer-feature.json | 3 +- features/src/wb-mcp-server/install.sh | 52 +++++++++++++++---- 4 files changed, 49 insertions(+), 13 deletions(-) diff --git a/features/src/llm-context/generate-context.sh b/features/src/llm-context/generate-context.sh index d2b69619d..99534c57a 100755 --- a/features/src/llm-context/generate-context.sh +++ b/features/src/llm-context/generate-context.sh @@ -50,8 +50,9 @@ set -e -# Configuration 
-CONTEXT_DIR="${HOME}/.claude" +# Configuration — accept an optional home directory argument (e.g., /config, /home/jupyter) +USER_HOME="${1:-${HOME}}" +CONTEXT_DIR="${USER_HOME}/.claude" SKILLS_DIR="${CONTEXT_DIR}/skills" CLAUDE_FILE="${CONTEXT_DIR}/CLAUDE.md" diff --git a/features/src/llm-context/install.sh b/features/src/llm-context/install.sh index a9ea0102e..f702067bb 100644 --- a/features/src/llm-context/install.sh +++ b/features/src/llm-context/install.sh @@ -113,7 +113,7 @@ RETRY_DELAY=10 for i in \$(seq 1 \${MAX_RETRIES}); do if command -v wb &> /dev/null && wb workspace describe &> /dev/null; then echo "Workspace ready (attempt \${i}). Generating LLM context..." - ${GENERATE_SCRIPT} || echo "LLM context generation failed (non-fatal)" + ${GENERATE_SCRIPT} "${USER_HOME_DIR}" || echo "LLM context generation failed (non-fatal)" exit 0 fi echo "Waiting for workspace to be ready... (\${i}/\${MAX_RETRIES})" diff --git a/features/src/wb-mcp-server/devcontainer-feature.json b/features/src/wb-mcp-server/devcontainer-feature.json index cb2ccf782..ba210a8a5 100644 --- a/features/src/wb-mcp-server/devcontainer-feature.json +++ b/features/src/wb-mcp-server/devcontainer-feature.json @@ -23,7 +23,8 @@ "installsAfter": [ "ghcr.io/devcontainers/features/common-utils", "ghcr.io/devcontainers/features/go", - "./.devcontainer/features/gemini", + "ghcr.io/anthropics/devcontainer-features/claude-code", + "./.devcontainer/features/gemini-cli", "./.devcontainer/features/workbench-tools" ] } diff --git a/features/src/wb-mcp-server/install.sh b/features/src/wb-mcp-server/install.sh index 2d29dad3c..95e88fa15 100755 --- a/features/src/wb-mcp-server/install.sh +++ b/features/src/wb-mcp-server/install.sh @@ -170,17 +170,51 @@ EOF # Make the directory and files accessible to the user chown -R "${USERNAME}:" "${WB_MCP_DIR}" -# Auto-configure Claude CLI if available (HTTP transport) -if command -v claude &> /dev/null; then - echo "Found Claude CLI, attempting to add MCP server (HTTP)..." 
- su - "${USERNAME}" -c "claude mcp add --transport http wb http://127.0.0.1:${WB_MCP_PORT}" 2>/dev/null || true +# Configure Claude Code MCP server via settings file (works regardless of CLI install order) +CLAUDE_SETTINGS="${USER_HOME_DIR}/.claude.json" +if [[ -f "${CLAUDE_SETTINGS}" ]]; then + # Merge into existing settings + jq --arg url "http://127.0.0.1:${WB_MCP_PORT}" \ + '.mcpServers.wb = {"type": "http", "url": $url}' \ + "${CLAUDE_SETTINGS}" > "${CLAUDE_SETTINGS}.tmp" \ + && mv "${CLAUDE_SETTINGS}.tmp" "${CLAUDE_SETTINGS}" +else + cat > "${CLAUDE_SETTINGS}" < /dev/null; then - echo "Found Gemini CLI, attempting to add MCP server (HTTP)..." - su - "${USERNAME}" -c "gemini mcp add --scope user --transport http wb http://127.0.0.1:${WB_MCP_PORT}" 2>/dev/null || true +chown "${USERNAME}:" "${CLAUDE_SETTINGS}" +echo "Configured Claude Code MCP server in ${CLAUDE_SETTINGS}" + +# Configure Gemini CLI MCP server via settings file +GEMINI_SETTINGS="${USER_HOME_DIR}/.gemini/settings.json" +mkdir -p "${USER_HOME_DIR}/.gemini" +if [[ -f "${GEMINI_SETTINGS}" ]]; then + jq --arg url "http://127.0.0.1:${WB_MCP_PORT}" \ + '.mcpServers.wb = {"type": "http", "url": $url}' \ + "${GEMINI_SETTINGS}" > "${GEMINI_SETTINGS}.tmp" \ + && mv "${GEMINI_SETTINGS}.tmp" "${GEMINI_SETTINGS}" +else + cat > "${GEMINI_SETTINGS}" < Date: Fri, 15 May 2026 10:21:55 -0400 Subject: [PATCH 86/86] Run MCP server as user --- features/src/wb-mcp-server/install.sh | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/features/src/wb-mcp-server/install.sh b/features/src/wb-mcp-server/install.sh index 95e88fa15..9e650216f 100755 --- a/features/src/wb-mcp-server/install.sh +++ b/features/src/wb-mcp-server/install.sh @@ -116,25 +116,30 @@ WantedBy=multi-user.target EOF # Create a startup script that runs as HTTP daemon -cat > "${WB_MCP_DIR}/start-server.sh" <<'EOF' +cat > "${WB_MCP_DIR}/start-server.sh" < /dev/null; then +if pgrep -f "\${WB_MCP_BIN} -http" > /dev/null; 
then echo "wb-mcp-server is already running" exit 0 fi -# Start server in background -nohup "${WB_MCP_BIN}" -http -port "${PORT}" >> "${LOGFILE}" 2>&1 & -echo "Started wb-mcp-server on port ${PORT} (PID: $!)" -echo "Logs: ${LOGFILE}" +# Start server as the correct user (who has wb auth tokens) +if [ "\$(id -u)" = "0" ] && [ "\${RUN_USER}" != "root" ]; then + su - "\${RUN_USER}" -c "nohup \${WB_MCP_BIN} -http -port \${PORT} >> \${LOGFILE} 2>&1 &" +else + nohup "\${WB_MCP_BIN}" -http -port "\${PORT}" >> "\${LOGFILE}" 2>&1 & +fi +echo "Started wb-mcp-server on port \${PORT} as \${RUN_USER}" +echo "Logs: \${LOGFILE}" EOF chmod +x "${WB_MCP_DIR}/start-server.sh"