Sentience-Robotics · MatthiasvonRakowski · May 7, 2026 · May 7, 2026 · May 11, 2026 · May 11, 2026
diff --git a/config/client_aux2.yaml b/config/client_aux2.yaml
@@ -0,0 +1,26 @@
+huri_url: ws://localhost:8000/session
+
+topic_list: ["transcript", "question", "rag_response"]
+sample_rate: 16000
+frame_duration: 0.030
+modules:
+  mic:
+    name: mic
+    args:
+      vad_agressiveness: 3
+      silence_duration: 1.5
+      block_duration: ${frame_duration}
+  stt:
+    name: stt
+    args:
+      language: "en"
+      block_duration: ${frame_duration}
+    logging: INFO
+  tag:
+    name: tag
+    logging: INFO
+  rag:
+    name: rag
+    args:
+      language: "en"
+      tone: "formal"
diff --git a/config/huri.yaml b/config/huri.yaml
@@ -11,10 +11,25 @@ logging_config:
   enable_access_log: true
   additional_log_standard_attrs: []
 
+services:
+  qdrant:
+    port: 6333
+    image: "qdrant/qdrant:latest"
+    storage_volume: "qdrant_data"
+  ollama:
+    model: "mistral:7b"
+    image: "ollama/ollama:rocm"
+    gpu_devices: true
+    num_replicas: 1
+
 applications:
   - name: huri-app
     route_prefix: /
     import_path: src.app:app
     runtime_env: { RAY_COLOR_PREFIX=1 }
     deployments:
       - name: HuRI
+      - name: RAGHandle
+        num_replicas: 2
+      - name: OllamaService
+      - name: QdrantService
diff --git a/src/app.py b/src/app.py
@@ -1,15 +1,48 @@
+from pathlib import Path
+
+import yaml
 from ray.serve import Application
 
 from src.core.huri import HuRI
 from src.modules.factory import bind_deployment_handles
 from src.modules.modules import get_modules
+from src.modules.rag.docker_services import OllamaService, QdrantService
+
+
+def load_services_config() -> dict:
+    config_path = Path(__file__).resolve().parents[1] / "config" / "huri.yaml"
+    with open(config_path) as f:
+        config = yaml.safe_load(f)
+    return config.get("services", {})
+
+
+def build_qdrant(config: dict):
+    return QdrantService.bind(
+        port=config.get("port", 6333),
+        image=config.get("image", "qdrant/qdrant:latest"),
+        storage_volume=config.get("storage_volume", "qdrant_data"),
+    )
+
+
+def build_ollama(config: dict):
+    return OllamaService.options(
+        num_replicas=config.get("num_replicas", 1),
+    ).bind(
+        model=config.get("model", "mistral:7b"),
+        image=config.get("image", "ollama/ollama:latest"),
+        gpu_devices=config.get("gpu_devices", False),
+    )
 
 
 def build_app() -> Application:
     modules = get_modules()
-    handles = bind_deployment_handles(modules)
+    services_config = load_services_config()
+
+    qdrant = build_qdrant(services_config.get("qdrant", {}))
+    ollama = build_ollama(services_config.get("ollama", {}))
 
-    app: Application = HuRI.bind(modules, handles)  # type: ignore[attr-defined]
+    handles = bind_deployment_handles(modules, ollama=ollama, qdrant=qdrant)
+    app: Application = HuRI.bind(modules, handles)
     return app
 
 

diff --git a/src/modules/factory.py b/src/modules/factory.py
@@ -63,15 +63,24 @@ def create_from_config(
 
 def bind_deployment_handles(
     modules: Dict[str, Type[Module]],
+    **service_handles,
 ) -> Dict[str, handle.DeploymentHandle]:
     handles: Dict[str, handle.DeploymentHandle] = {}
     for name, module_cls in modules.items():
         if not issubclass(module_cls, ModuleWithHandle):
             continue
-
+    
         if not hasattr(module_cls, "_handle_cls"):
             raise TypeError(f"{module_cls.__name__} must define _handle_cls")
+
         handle_cls = module_cls._handle_cls
-        handles[name] = handle_cls.bind()
+
+        if name == "rag" and service_handles:
+            handles[name] = handle_cls.bind(
+                ollama_handle=service_handles.get("ollama"),
+                qdrant_handle=service_handles.get("qdrant"),
+            )
+        else:
+            handles[name] = handle_cls.bind()
 
     return handles
diff --git a/src/modules/rag/docker_services.py b/src/modules/rag/docker_services.py
@@ -0,0 +1,220 @@
+import time
+import socket
+import subprocess
+
+import httpx
+from ray import serve
+
+
+def find_free_port() -> int:
+    """
+    Ask the OS for a random free port.
+    We need this because if we run multiple Ollama containers,
+    they can't all use port 11434 — each needs its own.
+    """
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        s.bind(("", 0))
+        return s.getsockname()[1]
+
+
+def wait_for_service(url: str, timeout: int = 120) -> bool:
+    """
+    Returns True if ready, False if timeout.
+    """
+    start = time.time()
+    while time.time() - start < timeout:
+        try:
+            resp = httpx.get(url, timeout=5)
+            if resp.status_code == 200:
+                return True
+        except Exception:
+            pass
+        time.sleep(2)
+    return False
+
+
+def is_container_running(name: str) -> bool:
+    """Check if a Docker container with this name is already running."""
+    result = subprocess.run(
+        ["docker", "ps", "-q", "-f", f"name=^{name}$"],
+        capture_output=True, text=True,
+    )
+    return bool(result.stdout.strip())
+
+
+def remove_container(name: str):
+    """Force remove a container by name (ignores errors if it doesn't exist)."""
+    subprocess.run(["docker", "rm", "-f", name], capture_output=True)
+
+
+@serve.deployment
+class OllamaService:
+    """
+    Manages one Ollama Docker container.
+
+    LIFECYCLE:
+        __init__:  starts container -> waits for it -> pulls model
+        generate:  sends a prompt to the container, returns the answer
+        __del__:   stops and removes the container
+    """
+
+    def __init__(
+        self,
+        model: str = "mistral:7b",
+        image: str = "ollama/ollama:latest",
+        gpu_devices: bool = False,
+    ):
+        self.model = model
+        self.port = find_free_port()
+        self.container_name = f"ollama-ray-{self.port}"
+        self.base_url = f"http://localhost:{self.port}"
+
+        remove_container(self.container_name)
+
+        cmd = [
+            "docker", "run", "-d",
+            "--name", self.container_name,
+            "-p", f"{self.port}:11434",
+            "-v", "ollama_shared:/root/.ollama",
+        ]
+
+        if gpu_devices:
+            cmd.extend([
+                "--device=/dev/kfd",
+                "--device=/dev/dri",
+                "--group-add=video",
+            ])
+
+        cmd.append(image)
+
+        print(f"[OllamaService] Starting container '{self.container_name}' on port {self.port}...")
+        result = subprocess.run(cmd, capture_output=True, text=True)
+        if result.returncode != 0:
+            raise RuntimeError(f"Docker failed: {result.stderr}")
+
+        print(f"[OllamaService] Waiting for Ollama to be ready...")
+        if not wait_for_service(f"{self.base_url}/api/tags"):
+            raise RuntimeError(f"Ollama didn't start within timeout on port {self.port}")
+
+        print(f"[OllamaService] Pulling model '{model}'...")
+        pull_result = subprocess.run(
+            ["docker", "exec", self.container_name, "ollama", "pull", model],
+            capture_output=True, text=True,
+        )
+        if pull_result.returncode != 0:
+            raise RuntimeError(f"Failed to pull model: {pull_result.stderr}")
+
+        print(f"[OllamaService] Ready! container='{self.container_name}', port={self.port}, model='{model}'")
+
+
+    async def generate(
+        self,
+        messages: list,
+        max_tokens: int = 1024,
+        temperature: float = 0.1,
+    ) -> str:
+        """
+        Send messages to Ollama and return the response.
+        This is what RAGHandle calls to get LLM answers.
+        """
+        async with httpx.AsyncClient(timeout=60.0) as client:
+            resp = await client.post(
+                f"{self.base_url}/api/chat",
+                json={
+                    "model": self.model,
+                    "messages": messages,
+                    "stream": False,
+                    "options": {
+                        "num_predict": max_tokens,
+                        "temperature": temperature,
+                    },
+                },
+            )
+            resp.raise_for_status()
+            return resp.json()["message"]["content"]
+
+    async def health(self) -> dict:
+        """Check if this Ollama instance is alive."""
+        try:
+            async with httpx.AsyncClient(timeout=5.0) as client:
+                resp = await client.get(f"{self.base_url}/api/tags")
+                return {"status": "ok", "port": self.port, "container": self.container_name}
+        except Exception as e:
+            return {"status": "error", "error": str(e)}
+
+    def __del__(self):
+        """Cleanup when Ray destroys this replica."""
+        print(f"[OllamaService] Removing container '{self.container_name}'")
+        remove_container(self.container_name)
+
+
+@serve.deployment(num_replicas=1)
+class QdrantService:
+    """
+    Manages a Qdrant Docker container.
+
+    LIFECYCLE:
+        __init__:  starts container (or reuses if already running)
+        get_url:   returns the URL other services should connect to
+        __del__:   leaves the container running (it has data!)
+    """
+
+    def __init__(
+        self,
+        port: int = 6333,
+        image: str = "qdrant/qdrant:latest",
+        storage_volume: str = "qdrant_data",
+    ):
+        self.port = port
+        self.container_name = "qdrant-ray"
+        self.url = f"http://localhost:{self.port}"
+
+        if self._is_healthy():
+            print(f"[QdrantService] Qdrant already running on port {self.port}")
+            return
+
+        remove_container(self.container_name)
+
+        cmd = [
+            "docker", "run", "-d",
+            "--name", self.container_name,
+            "-p", f"{self.port}:6333",
+            "-v", f"{storage_volume}:/qdrant/storage",
+            image,
+        ]
+
+        print(f"[QdrantService] Starting Qdrant on port {self.port}...")
+        result = subprocess.run(cmd, capture_output=True, text=True)
+        if result.returncode != 0:
+            raise RuntimeError(f"Docker failed: {result.stderr}")
+
+        if not wait_for_service(f"{self.url}/healthz"):
+            raise RuntimeError(f"Qdrant didn't start within timeout on port {self.port}")
+
+        print(f"[QdrantService] Ready on port {self.port}")
+
+
+    def _is_healthy(self) -> bool:
+        try:
+            resp = httpx.get(f"{self.url}/healthz", timeout=3)
+            return resp.status_code == 200
+        except Exception:
+            return False
+
+
+    async def get_url(self) -> str:
+        """Return the URL. Called by RAGHandle to know where Qdrant is."""
+        return self.url
+
+
+    async def health(self) -> dict:
+        try:
+            async with httpx.AsyncClient(timeout=5.0) as client:
+                resp = await client.get(f"{self.url}/healthz")
+                return {"status": "ok", "port": self.port, "url": self.url}
+        except Exception as e:
+            return {"status": "error", "error": str(e)}
+
+
+    def __del__(self):
+        print(f"[QdrantService] Actor destroyed. Container '{self.container_name}' left running.")