diff --git a/.env.example b/.env.example
index 1846cc6..2d6beff 100644
--- a/.env.example
+++ b/.env.example
@@ -1,6 +1,9 @@
-# Ollama model used as the DeepEval judge for LLM-graded tests (make test-eval).
-# Any model available to your local Ollama install. Defaults to gemma4 if unset.
-OLLAMA_MODEL_NAME=gemma4
+# Google Gemini API key, required for the LLM-judged eval tests (make test-eval).
+# The eval suite fails when this is unset.
+GEMINI_API_KEY=
+
+# Gemini model used as the DeepEval judge. Defaults to gemini-3.1-flash-lite if unset.
+GEMINI_MODEL_NAME=gemini-3.1-flash-lite
 
 # Slack MCP bearer token, required ONLY for the MCP tool-selection test
 # (tests/eval/test_tool_selection.py). Leave blank to skip that test.
diff --git a/.github/workflows/ci-build.yml b/.github/workflows/ci-build.yml
index 67d6d69..1598003 100644
--- a/.github/workflows/ci-build.yml
+++ b/.github/workflows/ci-build.yml
@@ -5,6 +5,8 @@ on:
     branches:
       - main
   pull_request:
+  schedule:
+    - cron: "17 7 * * *" # nightly, off-peak, off-:00
   workflow_dispatch:
 
 env:
@@ -48,3 +50,48 @@ jobs:
         run: make install-test
       - name: Run unit tests
         run: make test-unit
+
+  eval:
+    name: Evaluation tests
+    runs-on: ubuntu-latest
+    timeout-minutes: 10
+    # Fork PRs don't receive repository secrets, so skip the job there instead of failing on an empty GEMINI_API_KEY.
+    if: ${{ github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository }}
+    permissions:
+      contents: read
+    steps:
+      - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
+        with:
+          persist-credentials: false
+      - name: Set up Python ${{ env.SUPPORTED_PY }}
+        uses: actions/setup-python@ece7cb06caefa5fff74198d8649806c4678c61a1 # v6.3.0
+        with:
+          python-version: ${{ env.SUPPORTED_PY }}
+      - name: Install test dependencies
+        run: make install-test
+      - name: Run eval tests
+        run: make test-eval
+        env:
+          GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
+          SLACK_MCP_TOKEN: ${{ secrets.SLACK_MCP_TOKEN }}
+
+  notifications:
+    name: Regression notifications
+    runs-on: ubuntu-latest
+    needs:
+      - lint
+      - test
+      - eval
+    if: ${{ !success() && github.ref == 'refs/heads/main' && github.event_name != 'workflow_dispatch' }}
+    permissions:
+      contents: read
+    steps:
+      - name: Send notifications of failing tests
+        uses: slackapi/slack-github-action@45a88b9581bfab2566dc881e2cd66d334e621e2c # v3.0.3
+        with:
+          errors: true
+          webhook: ${{ secrets.SLACK_REGRESSION_FAILURES_WEBHOOK_URL }}
+          webhook-type: webhook-trigger
+          payload: |
+            action_url: "${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+            repository: "${{ github.repository }}"
diff --git a/AGENTS.md b/AGENTS.md
index 4af57f3..fbc4b64 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -12,23 +12,23 @@ This plugin integrates Slack with Ai tools, providing tools to search, read, and
 
 ## Development Commands
 
-Requires Python 3.14+. Run `make install` before first use to set up the virtual environment, test dependencies, and local Ollama instance.
+Requires Python 3.14+. Run `make install` before first use to set up the virtual environment and test dependencies.
 
-**Always use the `make` targets — never invoke `python`, `pytest`, or `ruff` directly.** The targets manage the virtualenv, load `.env`, and start/stop the local Ollama instance for you; running the underlying tools by hand skips that setup and will behave differently. If a `make` command is broken or missing something you need, fix the `Makefile` rather than working around it with the raw command.
+**Always use the `make` targets — never invoke `python`, `pytest`, or `ruff` directly.** The targets manage the virtualenv and load `.env` for you; running the underlying tools by hand skips that setup and will behave differently. If a `make` command is broken or missing something you need, fix the `Makefile` rather than working around it with the raw command.
 
 | Command | Purpose |
 |---------|---------|
-| `make install` | Full setup: venv + deps + Ollama + gemma4 model |
+| `make install` | Full setup: venv + deps |
 | `make lint` | Ruff linter (line-length=120) |
 | `make format` | Ruff auto-format + fix |
 | `make test-unit` | Fast validation tests (pytest) |
-| `make test-eval` | LLM-judged tests (starts Ollama, runs DeepEval, stops Ollama) |
+| `make test-eval` | LLM-judged tests (runs DeepEval against Gemini) |
 | `make test` | Both unit + eval tests |
-| `make clean` | Remove .venv and .ollama |
+| `make clean` | Remove .venv |
 | `make cursor-install` | Install this plugin into a local Cursor for development |
 | `make cursor-uninstall` | Uninstall this plugin from the local Cursor install |
 
-The LLM tests read two environment variables: `OLLAMA_MODEL_NAME` (the DeepEval judge model, defaults to `gemma4`) and `SLACK_MCP_TOKEN` (a Slack MCP bearer token; the MCP tool-selection test is skipped when it's unset). Copy `.env.example` to `.env` and fill in values — the `Makefile` auto-loads `.env` — or pass them inline, e.g. `OLLAMA_MODEL_NAME=<model> make test-eval`.
+The LLM tests read `GEMINI_API_KEY` (required — the eval suite fails when it's unset) and `SLACK_MCP_TOKEN` (a Slack MCP bearer token; the MCP tool-selection test is skipped when it's unset). The DeepEval judge model defaults to `gemini-3.1-flash-lite`, overridable via `GEMINI_MODEL_NAME`. Copy `.env.example` to `.env` and fill in values — the `Makefile` auto-loads `.env` — or pass them inline, e.g. `GEMINI_MODEL_NAME=<model> make test-eval`.
 
 ## Cross-Skill References
 
@@ -46,9 +46,9 @@ See `skills/create-slack-app/SKILL.md` Step 1a for an example.
 Two test layers validate skills:
 
 1. **Unit** (`tests/unit/`) — validates frontmatter fields, naming, and markdown structure. Fast, runs in CI on every PR.
-2. **Eval** (`tests/eval/`) — uses DeepEval's `ToolCorrectnessMetric` (threshold 0.8) with a local Ollama model to judge whether a skill produces useful output for a sample prompt. Local-only, not in CI.
+2. **Eval** (`tests/eval/`) — LLM-judged tests that use a Gemini model. `tests/eval/test_tool_selection.py` asks the model to pick the expected tool/skill for each of a set of prompts. Because Gemini's free tier caps at 15 requests/minute, the test sleeps ~5s between scenarios (see its `teardown_method`) to stay under the limit.
 
-To add an LLM test for a new skill, create `tests/eval/skills/test_<skill_name>.py` following the pattern in `test_block_kit.py`: define a `PROMPT`, load the skill with `load_skill()`, and assert with `ToolCorrectnessMetric`.
+To add an eval scenario, append a `Scenario` (`id`, `prompt`, `expected_tool`, plus optional `acceptable_tools` listing alternative tools that also pass) to `SCENARIOS` in `tests/eval/test_tool_selection.py`.
 
 ## CI
 
@@ -56,8 +56,9 @@ GitHub Actions (`.github/workflows/ci-build.yml`) gates every PR with:
 
 - **Lint** — `make lint` (Ruff)
 - **Test** — `make test-unit` (pytest)
+- **Eval** — `make test-eval` (DeepEval + Gemini)
 
-LLM-judged tests are not run in CI (Ollama + model download would exceed time budget).
+The eval job reads the `GEMINI_API_KEY` and `SLACK_MCP_TOKEN` repository secrets; it skips on PRs from forks, which don't receive secrets. The workflow also runs nightly on a schedule, and a `notifications` job posts to Slack (via `SLACK_REGRESSION_FAILURES_WEBHOOK_URL`) when any of the lint, test, or eval jobs doesn't succeed on `main`; manually dispatched runs never notify.
 
 ## Releasing
 
diff --git a/Makefile b/Makefile
index a700410..fa24573 100644
--- a/Makefile
+++ b/Makefile
@@ -10,13 +10,6 @@ DEEPEVAL := $(VENV)/bin/deepeval
 -include .env
 export
 
-OLLAMA_DIR := .ollama
-OLLAMA_BIN := $(OLLAMA_DIR)/bin/ollama
-OLLAMA_MODELS := $(OLLAMA_DIR)/models
-OLLAMA_MODEL := $(or $(OLLAMA_MODEL_NAME),gemma4)
-
-UNAME_S := $(shell uname -s)
-
 TARGETS := help install install-test install-tools clean lint format test test-unit test-eval cursor-install cursor-uninstall
 
 .PHONY: $(TARGETS)
@@ -28,32 +21,7 @@ $(VENV):
 	python3 -m venv $(VENV)
 	$(PIP) install --upgrade pip
 
-$(OLLAMA_BIN):
-	mkdir -p $(OLLAMA_DIR)/bin $(OLLAMA_MODELS)
-ifeq ($(UNAME_S),Darwin)
-	curl -fSL "https://github.com/ollama/ollama/releases/latest/download/ollama-darwin.tgz" | tar xz -C $(OLLAMA_DIR)/bin
-else
-	curl -fSL "https://github.com/ollama/ollama/releases/latest/download/ollama-linux-amd64.tar.zst" | zstd -d | tar x -C $(OLLAMA_DIR) --strip-components=0
-endif
-	chmod +x $(OLLAMA_BIN)
-
-install: install-test install-tools $(OLLAMA_BIN) ## Set up everything (venv + deps + Ollama)
-	@OLLAMA_PID=""; \
-	if ! OLLAMA_MODELS=$(OLLAMA_MODELS) $(OLLAMA_BIN) list > /dev/null 2>&1; then \
-		echo "Starting Ollama server..."; \
-		OLLAMA_MODELS=$(OLLAMA_MODELS) $(OLLAMA_BIN) serve > /dev/null 2>&1 & \
-		OLLAMA_PID=$$!; \
-		for i in $$(seq 1 30); do \
-			curl -sf http://localhost:11434/api/version > /dev/null 2>&1 && break; \
-			sleep 1; \
-		done; \
-	fi; \
-	OLLAMA_MODELS=$(OLLAMA_MODELS) $(OLLAMA_BIN) pull $(OLLAMA_MODEL); \
-	$(DEEPEVAL) set-ollama --model=$(OLLAMA_MODEL); \
-	if [ -n "$$OLLAMA_PID" ]; then \
-		echo "Stopping Ollama server (PID $$OLLAMA_PID)..."; \
-		kill $$OLLAMA_PID 2>/dev/null; \
-	fi
+install: install-test install-tools ## Set up everything (venv + deps)
 
 install-test: $(VENV) ## Install test dependencies (deepeval)
 	$(PIP) install --upgrade pip
@@ -63,9 +31,9 @@ install-tools: $(VENV) ## Install linting/formatting tools (ruff)
 	$(PIP) install --upgrade pip
 	$(PIP) install -e ".[tools]"
 
-clean: ## Remove virtual environment, Ollama, and local Cursor install
+clean: ## Remove virtual environment and local Cursor install
 	-$(PYTHON) scripts/cursor.py uninstall
-	rm -rf $(VENV) $(OLLAMA_DIR) node_modules
+	rm -rf $(VENV) node_modules
 
 cursor-install: $(VENV) ## Install this plugin into a local Cursor for development
 	$(PYTHON) scripts/cursor.py install
@@ -95,21 +63,5 @@ endif
 test-unit: ## Run structural/unit validation tests (set testdir=<path> to target specific files)
 	$(PYTHON) -m pytest $(or $(testdir),tests/unit/) -v
 
-test-eval: ## Run LLM-judged tests (requires Ollama; set testdir=<path> to target specific files)
-	@OLLAMA_PID=""; \
-	if ! OLLAMA_MODELS=$(OLLAMA_MODELS) $(OLLAMA_BIN) list > /dev/null 2>&1; then \
-		echo "Starting Ollama server..."; \
-		OLLAMA_MODELS=$(OLLAMA_MODELS) $(OLLAMA_BIN) serve > /dev/null 2>&1 & \
-		OLLAMA_PID=$$!; \
-		for i in $$(seq 1 30); do \
-			curl -sf http://localhost:11434/api/version > /dev/null 2>&1 && break; \
-			sleep 1; \
-		done; \
-	fi; \
-	$(DEEPEVAL) test run $(or $(testdir),tests/eval/) -v; \
-	TEST_EXIT=$$?; \
-	if [ -n "$$OLLAMA_PID" ]; then \
-		echo "Stopping Ollama server (PID $$OLLAMA_PID)..."; \
-		kill $$OLLAMA_PID 2>/dev/null; \
-	fi; \
-	exit $$TEST_EXIT
+test-eval: ## Run LLM-judged tests (requires GEMINI_API_KEY; set testdir=<path> to target specific files)
+	$(DEEPEVAL) test run $(or $(testdir),tests/eval/) -v
diff --git a/pyproject.toml b/pyproject.toml
index 88d33e2..2d5c764 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,7 +11,7 @@ test = [
     # non-existent `deepeval.deepeval.config.settings`, so `deepeval test run`
     # (used by `make test-eval`) crashes on import. Exclude that one release.
     "deepeval>=4.0,<5.0,!=4.0.6",
-    "ollama>=0.4,<1.0",
+    "google-genai>=1.0,<2.0",
     "pyyaml>=6.0,<7.0",
 ]
 tools = ["ruff>=0.11,<1.0"]
diff --git a/tests/config.py b/tests/config.py
index 376fc66..4d4ec1c 100644
--- a/tests/config.py
+++ b/tests/config.py
@@ -12,8 +12,9 @@
 # Skill inventory (single source of truth)
 EXPECTED_SKILLS = ("create-slack-app", "block-kit", "slack-api", "slack-cli")
 
-# Ollama judge model
-OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL_NAME", "gemma4")
+# Gemini judge model
+GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
+GEMINI_MODEL = os.environ.get("GEMINI_MODEL_NAME", "gemini-3.1-flash-lite")
 
 # Slack MCP server
 SLACK_MCP_URL = "https://mcp.slack.com/mcp"
diff --git a/tests/eval/skills/__init__.py b/tests/eval/skills/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/tests/eval/skills/test_block_kit.py b/tests/eval/skills/test_block_kit.py
deleted file mode 100644
index 422eb23..0000000
--- a/tests/eval/skills/test_block_kit.py
+++ /dev/null
@@ -1,46 +0,0 @@
-from deepeval import assert_test
-from deepeval.metrics import ToolCorrectnessMetric
-from deepeval.test_case import LLMTestCase, ToolCall
-
-from tests.config import OLLAMA_MODEL
-from tests.skill import load_skill
-from tests.support.ollama import NoThinkOllamaModel
-
-PROMPT = "Build me a Slack modal with a text input and a submit button"
-
-
-class TestBlockKit:
-    def setup_method(self):
-        self.skill = load_skill("block-kit")
-        self.model = NoThinkOllamaModel(model=OLLAMA_MODEL)
-
-    def test_skill_is_usable(self):
-        skill_tool = ToolCall(
-            name=self.skill.metadata.name,
-            description=self.skill.metadata.description,
-            input_parameters={"request": PROMPT},
-            output=self.skill.body,
-        )
-
-        expected_tool = ToolCall(
-            name=self.skill.metadata.name,
-            input_parameters={"request": PROMPT},
-            output=self.skill.body,
-        )
-
-        response, _ = self.model.generate(
-            f"You have access to the following skill:\n\n"
-            f"Name: {skill_tool.name}\n"
-            f"Description: {skill_tool.description}\n\n"
-            f"User request: {PROMPT}"
-        )
-
-        test_case = LLMTestCase(
-            input=PROMPT,
-            actual_output=response,
-            tools_called=[skill_tool],
-            expected_tools=[expected_tool],
-        )
-
-        metric = ToolCorrectnessMetric(model=self.model, threshold=0.8)
-        assert_test(test_case, [metric])
diff --git a/tests/eval/skills/test_create_slack_app.py b/tests/eval/skills/test_create_slack_app.py
deleted file mode 100644
index 020623e..0000000
--- a/tests/eval/skills/test_create_slack_app.py
+++ /dev/null
@@ -1,46 +0,0 @@
-from deepeval import assert_test
-from deepeval.metrics import ToolCorrectnessMetric
-from deepeval.test_case import LLMTestCase, ToolCall
-
-from tests.config import OLLAMA_MODEL
-from tests.skill import load_skill
-from tests.support.ollama import NoThinkOllamaModel
-
-PROMPT = "Create a new Slack app with a slash command and event subscription"
-
-
-class TestCreateSlackApp:
-    def setup_method(self):
-        self.skill = load_skill("create-slack-app")
-        self.model = NoThinkOllamaModel(model=OLLAMA_MODEL)
-
-    def test_skill_is_usable(self):
-        skill_tool = ToolCall(
-            name=self.skill.metadata.name,
-            description=self.skill.metadata.description,
-            input_parameters={"request": PROMPT},
-            output=self.skill.body,
-        )
-
-        expected_tool = ToolCall(
-            name=self.skill.metadata.name,
-            input_parameters={"request": PROMPT},
-            output=self.skill.body,
-        )
-
-        response, _ = self.model.generate(
-            f"You have access to the following skill:\n\n"
-            f"Name: {skill_tool.name}\n"
-            f"Description: {skill_tool.description}\n\n"
-            f"User request: {PROMPT}"
-        )
-
-        test_case = LLMTestCase(
-            input=PROMPT,
-            actual_output=response,
-            tools_called=[skill_tool],
-            expected_tools=[expected_tool],
-        )
-
-        metric = ToolCorrectnessMetric(model=self.model, threshold=0.8)
-        assert_test(test_case, [metric])
diff --git a/tests/eval/skills/test_slack_api.py b/tests/eval/skills/test_slack_api.py
deleted file mode 100644
index 1c636d5..0000000
--- a/tests/eval/skills/test_slack_api.py
+++ /dev/null
@@ -1,46 +0,0 @@
-from deepeval import assert_test
-from deepeval.metrics import ToolCorrectnessMetric
-from deepeval.test_case import LLMTestCase, ToolCall
-
-from tests.config import OLLAMA_MODEL
-from tests.skill import load_skill
-from tests.support.ollama import NoThinkOllamaModel
-
-PROMPT = "Which Slack API method lists the members of a channel, and what scopes does it need?"
-
-
-class TestSlackApi:
-    def setup_method(self):
-        self.skill = load_skill("slack-api")
-        self.model = NoThinkOllamaModel(model=OLLAMA_MODEL)
-
-    def test_skill_is_usable(self):
-        skill_tool = ToolCall(
-            name=self.skill.metadata.name,
-            description=self.skill.metadata.description,
-            input_parameters={"request": PROMPT},
-            output=self.skill.body,
-        )
-
-        expected_tool = ToolCall(
-            name=self.skill.metadata.name,
-            input_parameters={"request": PROMPT},
-            output=self.skill.body,
-        )
-
-        response, _ = self.model.generate(
-            f"You have access to the following skill:\n\n"
-            f"Name: {skill_tool.name}\n"
-            f"Description: {skill_tool.description}\n\n"
-            f"User request: {PROMPT}"
-        )
-
-        test_case = LLMTestCase(
-            input=PROMPT,
-            actual_output=response,
-            tools_called=[skill_tool],
-            expected_tools=[expected_tool],
-        )
-
-        metric = ToolCorrectnessMetric(model=self.model, threshold=0.8)
-        assert_test(test_case, [metric])
diff --git a/tests/eval/skills/test_slack_cli.py b/tests/eval/skills/test_slack_cli.py
deleted file mode 100644
index 1b55b31..0000000
--- a/tests/eval/skills/test_slack_cli.py
+++ /dev/null
@@ -1,46 +0,0 @@
-from deepeval import assert_test
-from deepeval.metrics import ToolCorrectnessMetric
-from deepeval.test_case import LLMTestCase, ToolCall
-
-from tests.config import OLLAMA_MODEL
-from tests.skill import load_skill
-from tests.support.ollama import NoThinkOllamaModel
-
-PROMPT = "How do I create and deploy a new Slack app using the CLI?"
-
-
-class TestSlackCli:
-    def setup_method(self):
-        self.skill = load_skill("slack-cli")
-        self.model = NoThinkOllamaModel(model=OLLAMA_MODEL)
-
-    def test_skill_is_usable(self):
-        skill_tool = ToolCall(
-            name=self.skill.metadata.name,
-            description=self.skill.metadata.description,
-            input_parameters={"request": PROMPT},
-            output=self.skill.body,
-        )
-
-        expected_tool = ToolCall(
-            name=self.skill.metadata.name,
-            input_parameters={"request": PROMPT},
-            output=self.skill.body,
-        )
-
-        response, _ = self.model.generate(
-            f"You have access to the following skill:\n\n"
-            f"Name: {skill_tool.name}\n"
-            f"Description: {skill_tool.description}\n\n"
-            f"User request: {PROMPT}"
-        )
-
-        test_case = LLMTestCase(
-            input=PROMPT,
-            actual_output=response,
-            tools_called=[skill_tool],
-            expected_tools=[expected_tool],
-        )
-
-        metric = ToolCorrectnessMetric(model=self.model, threshold=0.8)
-        assert_test(test_case, [metric])
diff --git a/tests/eval/test_tool_selection.py b/tests/eval/test_tool_selection.py
index c3d135b..41e4dc7 100644
--- a/tests/eval/test_tool_selection.py
+++ b/tests/eval/test_tool_selection.py
@@ -1,11 +1,13 @@
-from typing import TypedDict
+import time
+from typing import NotRequired, TypedDict
 
 import pytest
+from deepeval.models import GeminiModel
 from deepeval.test_case import ToolCall
 from pydantic import BaseModel
 
-from tests.config import OLLAMA_MODEL, SLACK_MCP_TOKEN
-from tests.support.ollama import NoThinkOllamaModel
+from tests.config import GEMINI_API_KEY, SLACK_MCP_TOKEN
+from tests.support.judge import make_judge_model
 from tests.support.tools import get_all_skill_tools, get_slack_mcp_tools
 
 
@@ -13,6 +15,7 @@ class Scenario(TypedDict):
     id: str
     prompt: str
     expected_tool: str
+    acceptable_tools: NotRequired[list[str]]
 
 
 class ToolChoice(BaseModel):
@@ -49,7 +52,7 @@ class ToolChoice(BaseModel):
     },
     {
         "id": "list-members-platform-team",
-        "prompt": "Who are the members of the #platform-team channel?",
+        "prompt": "Who are the members of the CA1B2C3F5 channel?",
         "expected_tool": "slack_list_channel_members",
     },
     {
@@ -89,7 +92,7 @@ class ToolChoice(BaseModel):
     },
     {
         "id": "ambiguous-list-members-platform",
-        "prompt": "List the members of the #platform-team channel",
+        "prompt": "List the members of the CA1B2C3F5 channel",
         "expected_tool": "slack_list_channel_members",
     },
     {
@@ -106,16 +109,19 @@ class ToolChoice(BaseModel):
         "id": "ambiguous-add-reaction-releases",
         "prompt": "Add a :tada: reaction to the latest message in #releases",
         "expected_tool": "slack_add_reaction",
+        "acceptable_tools": ["slack_read_channel"],
     },
     {
         "id": "ambiguous-reply-in-thread",
-        "prompt": "Post a reply in the thread on the outage message in #incidents",
+        "prompt": "Reply 'we're on it' in the thread on the outage message in CA1B2C3F5",
         "expected_tool": "slack_send_message",
+        "acceptable_tools": ["slack_read_thread"],
     },
     {
         "id": "ambiguous-read-thread-replies",
-        "prompt": "Show me all the replies in that thread in #support",
+        "prompt": "Show me all the replies in the thread on the latest message in #support",
         "expected_tool": "slack_read_thread",
+        "acceptable_tools": ["slack_read_channel"],
     },
     {
         "id": "ambiguous-lookup-user-by-email",
@@ -174,23 +180,34 @@ def build_prompt(available_tools: list[ToolCall], prompt: str) -> str:
 
 User request: {prompt}
 
-Pick the single best tool for this request and respond with its exact name."""
+Pick the single tool that performs the action the user is asking for. Any channel name,
+channel ID, or user ID already in the request is usable as-is — do not pick a search tool
+just to resolve it into an ID first. Respond with the tool's exact name."""
 
 
-@pytest.mark.skipif(not SLACK_MCP_TOKEN, reason="SLACK_MCP_TOKEN not set")
 class TestToolSelection:
     """Assert the model selects the expected tool for each scenario."""
 
-    model: NoThinkOllamaModel
+    model: GeminiModel
     available_tools: list[ToolCall]
 
     @classmethod
     def setup_class(cls):
+        if not GEMINI_API_KEY:
+            pytest.fail("GEMINI_API_KEY not set")
+        if not SLACK_MCP_TOKEN:
+            pytest.skip("SLACK_MCP_TOKEN not set")  # optional token: skip, as .env.example and AGENTS.md document
         # Fetch tools once for the whole class: the MCP list is one network
         # round-trip, and skills are read from disk.
-        cls.model = NoThinkOllamaModel(model=OLLAMA_MODEL)
+        cls.model = make_judge_model()
         cls.available_tools = get_slack_mcp_tools() + get_all_skill_tools()
 
+    def teardown_method(self):
+        """Throttle the suite: each scenario makes one model.generate() call, and Gemini's
+        free tier allows only 15 requests/minute (HTTP 429 / RESOURCE_EXHAUSTED beyond that)."""
+        pause_seconds = 5  # keeps the run at ~12 requests/minute at most
+        time.sleep(pause_seconds)
+
     @pytest.mark.parametrize(
         "scenario",
         SCENARIOS,
@@ -205,6 +222,7 @@ def test_tool_selection(self, scenario: Scenario):
         # against the expected one.
         choice, _ = self.model.generate(build_prompt(self.available_tools, scenario["prompt"]), schema=ToolChoice)
 
-        assert choice.tool_name == expected_name, (
-            f"Expected {repr(expected_name)} for prompt {repr(scenario['prompt'])}, got {repr(choice.tool_name)}"
+        accepted = {expected_name, *scenario.get("acceptable_tools", [])}
+        assert choice.tool_name in accepted, (
+            f"Expected one of {sorted(accepted)} for prompt {scenario['prompt']!r}, got {choice.tool_name!r}"
         )
diff --git a/tests/support/judge.py b/tests/support/judge.py
new file mode 100644
index 0000000..7fa0bf4
--- /dev/null
+++ b/tests/support/judge.py
@@ -0,0 +1,8 @@
+from deepeval.models import GeminiModel
+
+from tests.config import GEMINI_API_KEY, GEMINI_MODEL
+
+
+def make_judge_model() -> GeminiModel:
+    """Return the Gemini-backed DeepEval judge model; temperature is 0 and an empty API key is passed as None."""
+    return GeminiModel(api_key=GEMINI_API_KEY or None, model=GEMINI_MODEL, temperature=0)
diff --git a/tests/support/ollama.py b/tests/support/ollama.py
deleted file mode 100644
index 9862625..0000000
--- a/tests/support/ollama.py
+++ /dev/null
@@ -1,35 +0,0 @@
-from deepeval.models import OllamaModel
-
-
-class NoThinkOllamaModel(OllamaModel):
-    """OllamaModel that disables thinking mode for reliable structured output."""
-
-    def generate(self, prompt, schema=None):
-        chat_model = self.load_model()
-        messages = [{"role": "user", "content": prompt}]
-        response = chat_model.chat(
-            model=self.name,
-            messages=messages,
-            format=schema.model_json_schema() if schema else None,
-            options={"temperature": self.temperature, "num_ctx": 32768},
-            think=False,
-        )
-        return (
-            (schema.model_validate_json(response.message.content) if schema else response.message.content),
-            0,
-        )
-
-    async def a_generate(self, prompt, schema=None):
-        chat_model = self.load_model(async_mode=True)
-        messages = [{"role": "user", "content": prompt}]
-        response = await chat_model.chat(
-            model=self.name,
-            messages=messages,
-            format=schema.model_json_schema() if schema else None,
-            options={"temperature": self.temperature, "num_ctx": 32768},
-            think=False,
-        )
-        return (
-            (schema.model_validate_json(response.message.content) if schema else response.message.content),
-            0,
-        )