slackapi · WilliamBergamin · Jun 29, 2026 · Jun 29, 2026 · Jun 29, 2026 · Jun 30, 2026
@@ -1,6 +1,9 @@
-# Ollama model used as the DeepEval judge for LLM-graded tests (make test-eval).
-# Any model available to your local Ollama install. Defaults to gemma4 if unset.
-OLLAMA_MODEL_NAME=gemma4
+# Google Gemini API key, required for the LLM-judged eval tests (make test-eval).
+# The eval suite fails when this is unset.
+GEMINI_API_KEY=
+
+# Gemini model used as the DeepEval judge. Defaults to gemini-3.1-flash-lite if unset.
+GEMINI_MODEL_NAME=gemini-3.1-flash-lite
 
 # Slack MCP bearer token, required ONLY for the MCP tool-selection test
 # (tests/eval/test_tool_selection.py). Leave blank to skip that test.

@@ -5,6 +5,8 @@ on:
     branches:
       - main
   pull_request:
+  schedule:
+    - cron: "17 7 * * *" # nightly, off-peak, off-:00
   workflow_dispatch:
 
 env:
@@ -48,3 +50,50 @@ jobs:
         run: make install-test
       - name: Run unit tests
         run: make test-unit
+
+  # LLM-judged eval tests; needs the GEMINI_API_KEY secret (SLACK_MCP_TOKEN is optional).
+  eval:
+    name: Evaluation tests
+    # Secrets are not exposed to pull requests from forks, and the eval suite
+    # fails without GEMINI_API_KEY, so skip the job there instead of failing.
+    if: ${{ github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository }}
+    runs-on: ubuntu-latest
+    timeout-minutes: 10
+    permissions:
+      contents: read
+    steps:
+      - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
+        with:
+          persist-credentials: false
+      - name: Set up Python ${{ env.SUPPORTED_PY }}
+        uses: actions/setup-python@ece7cb06caefa5fff74198d8649806c4678c61a1 # v6.3.0
+        with:
+          python-version: ${{ env.SUPPORTED_PY }}
+      - name: Install test dependencies
+        run: make install-test
+      - name: Run eval tests
+        run: make test-eval
+        env:
+          GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
+          SLACK_MCP_TOKEN: ${{ secrets.SLACK_MCP_TOKEN }}
+
+  notifications:
+    name: Regression notifications
+    runs-on: ubuntu-latest
+    needs: # wait for every job so a problem in any of them is reported
+      - lint
+      - test
+      - eval
+    if: ${{ !success() && github.ref == 'refs/heads/main' && github.event_name != 'workflow_dispatch' }} # only non-successful runs on main; manual (workflow_dispatch) runs are excluded
+    permissions:
+      contents: read
+    steps:
+      - name: Send notifications of failing tests
+        uses: slackapi/slack-github-action@45a88b9581bfab2566dc881e2cd66d334e621e2c # v3.0.3
+        with:
+          errors: true
+          webhook: ${{ secrets.SLACK_REGRESSION_FAILURES_WEBHOOK_URL }}
+          webhook-type: webhook-trigger
+          payload: |
+            action_url: "${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+            repository: "${{ github.repository }}"
@@ -12,23 +12,23 @@ This plugin integrates Slack with Ai tools, providing tools to search, read, and
 
 ## Development Commands
 
-Requires Python 3.14+. Run `make install` before first use to set up the virtual environment, test dependencies, and local Ollama instance.
+Requires Python 3.14+. Run `make install` before first use to set up the virtual environment and test dependencies.
 
-**Always use the `make` targets — never invoke `python`, `pytest`, or `ruff` directly.** The targets manage the virtualenv, load `.env`, and start/stop the local Ollama instance for you; running the underlying tools by hand skips that setup and will behave differently. If a `make` command is broken or missing something you need, fix the `Makefile` rather than working around it with the raw command.
+**Always use the `make` targets — never invoke `python`, `pytest`, or `ruff` directly.** The targets manage the virtualenv and load `.env` for you; running the underlying tools by hand skips that setup and will behave differently. If a `make` command is broken or missing something you need, fix the `Makefile` rather than working around it with the raw command.
 
 | Command | Purpose |
 |---------|---------|
-| `make install` | Full setup: venv + deps + Ollama + gemma4 model |
+| `make install` | Full setup: venv + deps |
 | `make lint` | Ruff linter (line-length=120) |
 | `make format` | Ruff auto-format + fix |
 | `make test-unit` | Fast validation tests (pytest) |
-| `make test-eval` | LLM-judged tests (starts Ollama, runs DeepEval, stops Ollama) |
+| `make test-eval` | LLM-judged tests (runs DeepEval against Gemini) |
 | `make test` | Both unit + eval tests |
-| `make clean` | Remove .venv and .ollama |
+| `make clean` | Remove .venv |
 | `make cursor-install` | Install this plugin into a local Cursor for development |
 | `make cursor-uninstall` | Uninstall this plugin from the local Cursor install |
 
-The LLM tests read two environment variables: `OLLAMA_MODEL_NAME` (the DeepEval judge model, defaults to `gemma4`) and `SLACK_MCP_TOKEN` (a Slack MCP bearer token; the MCP tool-selection test is skipped when it's unset). Copy `.env.example` to `.env` and fill in values — the `Makefile` auto-loads `.env` — or pass them inline, e.g. `OLLAMA_MODEL_NAME=<model> make test-eval`.
+The LLM tests read `GEMINI_API_KEY` (required — the eval suite fails when it's unset) and `SLACK_MCP_TOKEN` (a Slack MCP bearer token; the MCP tool-selection test is skipped when it's unset). The DeepEval judge model defaults to `gemini-3.1-flash-lite`, overridable via `GEMINI_MODEL_NAME`. Copy `.env.example` to `.env` and fill in values — the `Makefile` auto-loads `.env` — or pass them inline, e.g. `GEMINI_MODEL_NAME=<model> make test-eval`.
 
 ## Cross-Skill References
 
@@ -46,18 +46,19 @@ See `skills/create-slack-app/SKILL.md` Step 1a for an example.
 Two test layers validate skills:
 
 1. **Unit** (`tests/unit/`) — validates frontmatter fields, naming, and markdown structure. Fast, runs in CI on every PR.
-2. **Eval** (`tests/eval/`) — uses DeepEval's `ToolCorrectnessMetric` (threshold 0.8) with a local Ollama model to judge whether a skill produces useful output for a sample prompt. Local-only, not in CI.
+2. **Eval** (`tests/eval/`) — LLM-judged tests that use a Gemini model. `tests/eval/test_tool_selection.py` asks the model to pick the expected tool/skill for each of a set of prompts. Because Gemini's free tier caps at 15 requests/minute, the test sleeps ~5s between scenarios (see its `teardown_method`) to stay under the limit.
 
-To add an LLM test for a new skill, create `tests/eval/skills/test_<skill_name>.py` following the pattern in `test_block_kit.py`: define a `PROMPT`, load the skill with `load_skill()`, and assert with `ToolCorrectnessMetric`.
+To add an eval scenario, append a `Scenario` (prompt + expected tool) to `SCENARIOS` in `tests/eval/test_tool_selection.py`.
 
 ## CI
 
 GitHub Actions (`.github/workflows/ci-build.yml`) gates every PR with:
 
 - **Lint** — `make lint` (Ruff)
 - **Test** — `make test-unit` (pytest)
+- **Eval** — `make test-eval` (DeepEval + Gemini)
 
-LLM-judged tests are not run in CI (Ollama + model download would exceed time budget).
+The eval job reads the `GEMINI_API_KEY` and `SLACK_MCP_TOKEN` repository secrets; it skips on PRs from forks, which don't receive secrets. The workflow also runs nightly on a schedule, and a `notifications` job posts to Slack (via `SLACK_REGRESSION_FAILURES_WEBHOOK_URL`) when a job fails on `main`.
 
 ## Releasing
 

@@ -10,13 +10,6 @@ DEEPEVAL := $(VENV)/bin/deepeval
 -include .env
 export
 
-OLLAMA_DIR := .ollama
-OLLAMA_BIN := $(OLLAMA_DIR)/bin/ollama
-OLLAMA_MODELS := $(OLLAMA_DIR)/models
-OLLAMA_MODEL := $(or $(OLLAMA_MODEL_NAME),gemma4)
-
-UNAME_S := $(shell uname -s)
-
 TARGETS := help install install-test install-tools clean lint format test test-unit test-eval cursor-install cursor-uninstall
 
 .PHONY: $(TARGETS)
@@ -28,32 +21,7 @@ $(VENV):
 	python3 -m venv $(VENV)
 	$(PIP) install --upgrade pip
 
-$(OLLAMA_BIN):
-	mkdir -p $(OLLAMA_DIR)/bin $(OLLAMA_MODELS)
-ifeq ($(UNAME_S),Darwin)
-	curl -fSL "https://github.com/ollama/ollama/releases/latest/download/ollama-darwin.tgz" | tar xz -C $(OLLAMA_DIR)/bin
-else
-	curl -fSL "https://github.com/ollama/ollama/releases/latest/download/ollama-linux-amd64.tar.zst" | zstd -d | tar x -C $(OLLAMA_DIR) --strip-components=0
-endif
-	chmod +x $(OLLAMA_BIN)
-
-install: install-test install-tools $(OLLAMA_BIN) ## Set up everything (venv + deps + Ollama)
-	@OLLAMA_PID=""; \
-	if ! OLLAMA_MODELS=$(OLLAMA_MODELS) $(OLLAMA_BIN) list > /dev/null 2>&1; then \
-		echo "Starting Ollama server..."; \
-		OLLAMA_MODELS=$(OLLAMA_MODELS) $(OLLAMA_BIN) serve > /dev/null 2>&1 & \
-		OLLAMA_PID=$$!; \
-		for i in $$(seq 1 30); do \
-			curl -sf http://localhost:11434/api/version > /dev/null 2>&1 && break; \
-			sleep 1; \
-		done; \
-	fi; \
-	OLLAMA_MODELS=$(OLLAMA_MODELS) $(OLLAMA_BIN) pull $(OLLAMA_MODEL); \
-	$(DEEPEVAL) set-ollama --model=$(OLLAMA_MODEL); \
-	if [ -n "$$OLLAMA_PID" ]; then \
-		echo "Stopping Ollama server (PID $$OLLAMA_PID)..."; \
-		kill $$OLLAMA_PID 2>/dev/null; \
-	fi
+install: install-test install-tools ## Set up everything (venv + deps)
 
 install-test: $(VENV) ## Install test dependencies (deepeval)
 	$(PIP) install --upgrade pip
@@ -63,9 +31,9 @@ install-tools: $(VENV) ## Install linting/formatting tools (ruff)
 	$(PIP) install --upgrade pip
 	$(PIP) install -e ".[tools]"
 
-clean: ## Remove virtual environment, Ollama, and local Cursor install
+clean: ## Remove virtual environment and local Cursor install
 	-$(PYTHON) scripts/cursor.py uninstall
-	rm -rf $(VENV) $(OLLAMA_DIR) node_modules
+	rm -rf $(VENV) node_modules
 
 cursor-install: $(VENV) ## Install this plugin into a local Cursor for development
 	$(PYTHON) scripts/cursor.py install
@@ -95,21 +63,5 @@ endif
 test-unit: ## Run structural/unit validation tests (set testdir=<path> to target specific files)
 	$(PYTHON) -m pytest $(or $(testdir),tests/unit/) -v
 
-test-eval: ## Run LLM-judged tests (requires Ollama; set testdir=<path> to target specific files)
-	@OLLAMA_PID=""; \
-	if ! OLLAMA_MODELS=$(OLLAMA_MODELS) $(OLLAMA_BIN) list > /dev/null 2>&1; then \
-		echo "Starting Ollama server..."; \
-		OLLAMA_MODELS=$(OLLAMA_MODELS) $(OLLAMA_BIN) serve > /dev/null 2>&1 & \
-		OLLAMA_PID=$$!; \
-		for i in $$(seq 1 30); do \
-			curl -sf http://localhost:11434/api/version > /dev/null 2>&1 && break; \
-			sleep 1; \
-		done; \
-	fi; \
-	$(DEEPEVAL) test run $(or $(testdir),tests/eval/) -v; \
-	TEST_EXIT=$$?; \
-	if [ -n "$$OLLAMA_PID" ]; then \
-		echo "Stopping Ollama server (PID $$OLLAMA_PID)..."; \
-		kill $$OLLAMA_PID 2>/dev/null; \
-	fi; \
-	exit $$TEST_EXIT
+test-eval: ## Run LLM-judged tests (requires GEMINI_API_KEY; set testdir=<path> to target specific files)
+	$(DEEPEVAL) test run $(or $(testdir),tests/eval/) -v
@@ -11,7 +11,7 @@ test = [
     # non-existent `deepeval.deepeval.config.settings`, so `deepeval test run`
     # (used by `make test-eval`) crashes on import. Exclude that one release.
     "deepeval>=4.0,<5.0,!=4.0.6",
-    "ollama>=0.4,<1.0",
+    "google-genai>=1.0,<2.0",
     "pyyaml>=6.0,<7.0",
 ]
 tools = ["ruff>=0.11,<1.0"]

@@ -12,8 +12,9 @@
 # Skill inventory (single source of truth)
 EXPECTED_SKILLS = ("create-slack-app", "block-kit", "slack-api", "slack-cli")
 
-# Ollama judge model
-OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL_NAME", "gemma4")
+# Gemini judge: API key and model name
+GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
+GEMINI_MODEL = os.environ.get("GEMINI_MODEL_NAME", "gemini-3.1-flash-lite")
 
 # Slack MCP server
 SLACK_MCP_URL = "https://mcp.slack.com/mcp"