From ceee927261261a3c65fe0cc471d6d5df9723a79c Mon Sep 17 00:00:00 2001
From: Ksenia Blokhina <kseniablokhina@MacBook-Pro-Ksenia.local>
Date: Tue, 17 Feb 2026 15:13:48 +0100
Subject: [PATCH 01/38] create base metric class

---
 .pre-commit-config.yaml           |  6 +++---
 scripts/evaluation/base_metric.py | 31 +++++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+), 3 deletions(-)
 create mode 100644 scripts/evaluation/base_metric.py

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 93a59e68..f99f0c60 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,15 +1,15 @@
 repos:
 -   repo: https://github.com/ambv/black
-    rev: 22.6.0
+    rev: 26.1.0
     hooks:
     - id: black
       language_version: python3.11
 -   repo: https://github.com/pycqa/flake8
-    rev: 5.0.4
+    rev: 7.3.0
     hooks:
     - id: flake8
 -   repo: https://github.com/PyCQA/docformatter
-    rev: v1.5.0
+    rev: v1.7.7
     hooks:
       - id: docformatter
         name: docformatter
diff --git a/scripts/evaluation/base_metric.py b/scripts/evaluation/base_metric.py
new file mode 100644
index 00000000..5f4d2cc0
--- /dev/null
+++ b/scripts/evaluation/base_metric.py
@@ -0,0 +1,31 @@
+from abc import ABC, abstractmethod
+from typing import Any
+
+from dialoguekit.core.dialogue import Dialogue
+
+
+class BaseMetric(ABC):
+    """Abstract interface that every dialogue evaluation metric implements."""
+
+    def __init__(self) -> None:
+        """Initialize the metric; the base class itself keeps no state."""
+        pass
+
+    @property
+    @abstractmethod
+    def name(self) -> str:
+        """Metric name (e.g., 'quality', 'satisfaction', 'utility')."""
+        pass
+
+    @abstractmethod
+    def compute(self, dialogues: list[Dialogue], **kwargs: Any) -> Any:
+        """Compute the metric over the given dialogues.
+
+        Args:
+            dialogues: List of dialogues to compute the metric on.
+            **kwargs: Additional arguments specific to the metric.
+
+        Returns:
+            Metric scores; their structure is chosen by the concrete metric.
+        """
+        pass

From 46b74566e26b4410a8f2a50311591e530a8daaea Mon Sep 17 00:00:00 2001
From: Ksenia Blokhina <kseniablokhina@MacBook-Pro-Ksenia.local>
Date: Tue, 17 Feb 2026 15:46:27 +0100
Subject: [PATCH 02/38] return pre commit config versions

---
 .pre-commit-config.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index f99f0c60..93a59e68 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,15 +1,15 @@
 repos:
 -   repo: https://github.com/ambv/black
-    rev: 26.1.0
+    rev: 22.6.0
     hooks:
     - id: black
       language_version: python3.11
 -   repo: https://github.com/pycqa/flake8
-    rev: 7.3.0
+    rev: 5.0.4
     hooks:
     - id: flake8
 -   repo: https://github.com/PyCQA/docformatter
-    rev: v1.7.7
+    rev: v1.5.0
     hooks:
       - id: docformatter
         name: docformatter

From 8c96457638f0c8fb1a20d31845c84508f4f2f86d Mon Sep 17 00:00:00 2001
From: Ksenia Blokhina <kseniablokhina@MacBook-Pro-Ksenia.local>
Date: Mon, 23 Feb 2026 20:51:02 +0100
Subject: [PATCH 03/38] #233 add new classes

---
 scripts/__init__.py                           |   1 +
 scripts/evaluation/__init__.py                |   3 +
 scripts/evaluation/quality_evaluation.py      | 102 +-----
 scripts/evaluation/quality_metric.py          | 208 ++++++++++++
 scripts/evaluation/satisfaction_evaluation.py |  19 +-
 scripts/evaluation/satisfaction_metric.py     |  73 +++++
 scripts/evaluation/utility_evaluation.py      | 245 +-------------
 scripts/evaluation/utility_metric.py          | 310 ++++++++++++++++++
 usersimcrs/nlu/llm/__init__.py                |  12 +-
 usersimcrs/utils/simulation_utils.py          |   9 +-
 10 files changed, 623 insertions(+), 359 deletions(-)
 create mode 100644 scripts/__init__.py
 create mode 100644 scripts/evaluation/__init__.py
 create mode 100644 scripts/evaluation/quality_metric.py
 create mode 100644 scripts/evaluation/satisfaction_metric.py
 create mode 100644 scripts/evaluation/utility_metric.py

diff --git a/scripts/__init__.py b/scripts/__init__.py
new file mode 100644
index 00000000..5100bd2d
--- /dev/null
+++ b/scripts/__init__.py
@@ -0,0 +1 @@
+"""Scripts package marker to avoid namespace package ambiguity for mypy."""
diff --git a/scripts/evaluation/__init__.py b/scripts/evaluation/__init__.py
new file mode 100644
index 00000000..ad40101c
--- /dev/null
+++ b/scripts/evaluation/__init__.py
@@ -0,0 +1,3 @@
+"""Evaluation helpers package to make imports explicit for type checking."""
+
+__all__: list[str] = []
diff --git a/scripts/evaluation/quality_evaluation.py b/scripts/evaluation/quality_evaluation.py
index 3901ef41..162e1d56 100644
--- a/scripts/evaluation/quality_evaluation.py
+++ b/scripts/evaluation/quality_evaluation.py
@@ -14,54 +14,12 @@
 import argparse
 import json
 import os
-from collections import defaultdict
-from dataclasses import dataclass
 from statistics import mean, stdev
-from typing import Dict, List, Union
+from typing import Dict, List
 
-from tqdm import tqdm
-
-from dialoguekit.core.dialogue import Dialogue
-from dialoguekit.participant.participant import DialogueParticipant
 from dialoguekit.utils.dialogue_reader import json_to_dialogues
-from scripts.evaluation.rubrics.quality_rubrics import QualityRubrics
-from usersimcrs.llm_interfaces.ollama_interface import (
-    OllamaLLMInterface,
-)
-
-_PROMPT_EVAL_INTRO = (
-    "You are an evaluator and you need to judge how does the "
-    "ASSISTANT perform based on the following CONVERSATION HISTORY. Please "
-    "rate the ASSISTANT's performance based on the following GRADING RUBRIC.\n"
-    "\nCONVERSATION HISTORY:"
-)
-_PROMPT_EVAL_OUTPUT_FORMAT = (
-    'Your output need be a be in a JSON format as follows:\n{"score": '
-    '<score>, "score_explanation": <explanation>}\nDo not include '
-    "additional information.\n"
-)
-
-
-@dataclass
-class QualityScore:
-    conversation_id: str
-    score: int
-    explanation: str = ""
-
-    def to_dict(self) -> Dict[str, Union[int, str]]:
-        """Converts the score to a dictionary."""
-        return {
-            "conversation_id": self.conversation_id,
-            "score": self.score,
-            "score_explanation": self.explanation,
-        }
 
-
-class QualityScoreEncoder(json.JSONEncoder):
-    def default(self, o):
-        if isinstance(o, QualityScore):
-            return o.to_dict()
-        return super().default(o)
+from scripts.evaluation.quality_metric import QualityMetric, QualityScoreEncoder
 
 
 def parse_args() -> argparse.Namespace:
@@ -91,66 +49,14 @@ def parse_args() -> argparse.Namespace:
     return parser.parse_args()
 
 
-def get_prompt(grading_rubric: QualityRubrics, dialogue: Dialogue) -> str:
-    """Prepares prompt given grading rubric and dialogue.
-
-    Args:
-        grading_rubric: Grading rubric for the aspect.
-        dialogue: Dialogue.
-
-    Returns:
-        Prompt comprising task definition, grading rubric, and dialogue.
-    """
-    prompt = _PROMPT_EVAL_INTRO
-
-    # Add dialogue history
-    for utterance in dialogue.utterances:
-        role = (
-            "USER"
-            if utterance.participant == DialogueParticipant.USER
-            else "ASSISTANT"
-        )
-        prompt += f"\n{role}: {utterance.text}"
-
-    prompt += f"\n\nGRADING RUBRIC:\n{grading_rubric.value}\n"
-    prompt += _PROMPT_EVAL_OUTPUT_FORMAT
-    return prompt
-
-
 if __name__ == "__main__":
     args = parse_args()
 
     # Load dialogues
     dialogues = json_to_dialogues(args.dialogues)
 
-    # Ollama interface
-    ollama_interface = OllamaLLMInterface(
-        args.ollama_config, default_response=""
-    )
-
-    # Evaluate dialogues
-    scores: Dict[str, Dict[str, List[QualityScore]]] = defaultdict(
-        lambda: defaultdict(list)
-    )
-
-    for dialogue in tqdm(dialogues):
-        for aspect in QualityRubrics:
-            prompt = get_prompt(aspect, dialogue)
-            response = ollama_interface.get_llm_api_response(prompt)
-            try:
-                response = response.replace("\\", "\\\\")
-                response_dict = json.loads(response)
-                score = QualityScore(
-                    conversation_id=dialogue.conversation_id,
-                    score=int(response_dict["score"]),
-                    explanation=response_dict["score_explanation"],
-                )
-                scores[dialogue.agent_id][aspect.name].append(score)
-            except Exception as e:
-                print(
-                    f"Failed to get score for {aspect} dialogue "
-                    f"{dialogue.conversation_id}: {e}\nResponse: {response}"
-                )
+    metric = QualityMetric(args.ollama_config)
+    scores: Dict[str, Dict[str, List]] = metric.compute(dialogues)
 
     # Save scores
     if args.output:
diff --git a/scripts/evaluation/quality_metric.py b/scripts/evaluation/quality_metric.py
new file mode 100644
index 00000000..298f1eb1
--- /dev/null
+++ b/scripts/evaluation/quality_metric.py
@@ -0,0 +1,208 @@
+"""Quality metric class implementation.
+
+Extracted from the original CLI script in `quality_evaluation.py`.
+"""
+
+from collections import defaultdict
+import json
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional
+
+from tqdm import tqdm
+
+from dialoguekit.core.dialogue import Dialogue
+from dialoguekit.participant.participant import DialogueParticipant
+
+from scripts.evaluation.base_metric import BaseMetric
+from scripts.evaluation.rubrics.quality_rubrics import QualityRubrics
+from usersimcrs.llm_interfaces.ollama_interface import OllamaLLMInterface
+
+
+_PROMPT_EVAL_INTRO = (
+    "You are an evaluator and you need to judge how does the "
+    "ASSISTANT perform based on the following CONVERSATION HISTORY. Please "
+    "rate the ASSISTANT's performance based on the following GRADING RUBRIC.\n"
+    "\nCONVERSATION HISTORY:"
+)
+_PROMPT_EVAL_OUTPUT_FORMAT = (
+    'Your output need be a be in a JSON format as follows:\n{"score": '
+    '<score>, "score_explanation": <explanation>}\nDo not include '
+    "additional information.\n"
+)
+
+
+@dataclass
+class QualityScore:
+    conversation_id: str
+    score: int
+    explanation: str = ""
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {
+            "conversation_id": self.conversation_id,
+            "score": self.score,
+            "score_explanation": self.explanation,
+        }
+
+
+class QualityScoreEncoder(json.JSONEncoder):
+    def default(self, o):
+        if isinstance(o, QualityScore):
+            return o.to_dict()
+        return super().default(o)
+
+
+class QualityMetric(BaseMetric):
+    """Quality evaluation metric using an LLM backend.
+
+    The class wraps the prompt construction and LLM calls and returns the
+    same structure previously produced by the CLI script:
+
+    { agent_id: { aspect_name: [QualityScore, ...], ... }, ... }
+    """
+
+    def __init__(
+        self,
+        ollama_config_path: str,
+        default_response: str = "",
+        rubrics: Optional[List[QualityRubrics]] = None,
+    ) -> None:
+        """Stores the LLM settings and the rubrics evaluated by default."""
+        super().__init__()
+        self.ollama_config_path = ollama_config_path
+        self.default_response = default_response
+        self.rubrics = rubrics or list(QualityRubrics)
+
+    @property
+    def name(self) -> str:
+        """Metric name."""
+        return "quality"
+
+    def _get_prompt(
+        self, grading_rubric: QualityRubrics, dialogue: Dialogue
+    ) -> str:
+        """Prepares prompt given grading rubric and dialogue.
+
+        Args:
+            grading_rubric: Grading rubric for the aspect.
+            dialogue: Dialogue.
+
+        Returns:
+            Prompt comprising task definition, grading rubric, and dialogue.
+        """
+        prompt = _PROMPT_EVAL_INTRO
+        for utterance in dialogue.utterances:
+            is_user = utterance.participant == DialogueParticipant.USER
+            role = "USER" if is_user else "ASSISTANT"
+            prompt += f"\n{role}: {utterance.text}"
+
+        prompt += f"\n\nGRADING RUBRIC:\n{grading_rubric.value}\n"
+        prompt += _PROMPT_EVAL_OUTPUT_FORMAT
+        return prompt
+
+    def compute(
+        self, dialogues: List[Dialogue], aspects: Optional[List[str]] = None
+    ) -> Dict[str, Dict[str, List[QualityScore]]]:
+        """Compute quality scores, skipping responses that fail to parse.
+
+        Args:
+            dialogues: Dialogues to evaluate.
+            aspects: Rubric names to evaluate; defaults to `self.rubrics`.
+
+        Returns:
+            Nested dict: agent_id -> aspect_name -> list[QualityScore]
+        """
+        ollama_interface = OllamaLLMInterface(
+            self.ollama_config_path, default_response=self.default_response
+        )
+
+        if aspects:
+            aspect_enums = [QualityRubrics[asp] for asp in aspects]
+        else:
+            aspect_enums = self.rubrics
+
+        scores: Dict[str, Dict[str, List[QualityScore]]] = defaultdict(
+            lambda: defaultdict(list)
+        )
+
+        for dialogue in tqdm(dialogues):
+            for aspect in aspect_enums:
+                prompt = self._get_prompt(aspect, dialogue)
+                response = ollama_interface.get_llm_api_response(prompt)
+                try:
+                    response = response.replace("\\", "\\\\")
+                    response_dict = json.loads(response)
+                    score = QualityScore(
+                        conversation_id=dialogue.conversation_id,
+                        score=int(response_dict["score"]),
+                        explanation=response_dict.get("score_explanation", ""),
+                    )
+                    scores[dialogue.agent_id][aspect.name].append(score)
+                except Exception as e:
+                    print(
+                        f"Failed to get score for {aspect} dialogue "
+                        f"{dialogue.conversation_id}: {e}\n"
+                        f"Response: {response}"
+                    )
+
+        return scores
+
+
+class RecommendationRelevanceMetric(QualityMetric):
+    """Quality metric that evaluates only recommendation relevance."""
+
+    def __init__(self, ollama_config_path: str, default_response: str = ""):
+        super().__init__(ollama_config_path, default_response=default_response)
+        self.rubrics = [QualityRubrics.REC_RELEVANCE]
+
+    @property
+    def name(self) -> str:
+        return "quality.recommendation_relevance"
+
+
+class CommunicationStyleMetric(QualityMetric):
+    """Quality metric that evaluates communication style."""
+
+    def __init__(self, ollama_config_path: str, default_response: str = ""):
+        super().__init__(ollama_config_path, default_response=default_response)
+        self.rubrics = [QualityRubrics.COM_STYLE]
+
+    @property
+    def name(self) -> str:
+        return "quality.communication_style"
+
+
+class FluencyMetric(QualityMetric):
+    """Quality metric that evaluates fluency."""
+
+    def __init__(self, ollama_config_path: str, default_response: str = ""):
+        super().__init__(ollama_config_path, default_response=default_response)
+        self.rubrics = [QualityRubrics.FLUENCY]
+
+    @property
+    def name(self) -> str:
+        return "quality.fluency"
+
+
+class ConversationalFlowMetric(QualityMetric):
+    """Quality metric that evaluates conversational flow."""
+
+    def __init__(self, ollama_config_path: str, default_response: str = ""):
+        super().__init__(ollama_config_path, default_response=default_response)
+        self.rubrics = [QualityRubrics.CONV_FLOW]
+
+    @property
+    def name(self) -> str:
+        return "quality.conversational_flow"
+
+
+class OverallSatisfactionQualityMetric(QualityMetric):
+    """Quality metric that evaluates overall satisfaction aspect."""
+
+    def __init__(self, ollama_config_path: str, default_response: str = ""):
+        super().__init__(ollama_config_path, default_response=default_response)
+        self.rubrics = [QualityRubrics.OVERALL_SAT]
+
+    @property
+    def name(self) -> str:
+        return "quality.overall_satisfaction"
diff --git a/scripts/evaluation/satisfaction_evaluation.py b/scripts/evaluation/satisfaction_evaluation.py
index ea7dfb11..21fb8e00 100644
--- a/scripts/evaluation/satisfaction_evaluation.py
+++ b/scripts/evaluation/satisfaction_evaluation.py
@@ -5,14 +5,11 @@
 """
 
 import argparse
-from collections import defaultdict
 from statistics import mean, stdev
 from typing import Dict
 
-from dialoguekit.nlu.models.satisfaction_classifier import (
-    SatisfactionClassifierSVM,
-)
 from dialoguekit.utils.dialogue_reader import json_to_dialogues
+from scripts.evaluation.satisfaction_metric import SatisfactionMetric
 
 
 def parse_args() -> argparse.Namespace:
@@ -38,18 +35,8 @@ def parse_args() -> argparse.Namespace:
     dialogues = json_to_dialogues(args.dialogues)
     print(f"Loaded {len(dialogues)} dialogues.")
 
-    # Satisfaction classifier
-    satisfaction_classifier = SatisfactionClassifierSVM()
-
-    # Evaluate dialogues
-    scores: Dict[str, Dict[int, float]] = defaultdict(dict)
-
-    for i, dialogue in enumerate(dialogues):
-        scores[dialogue.agent_id][
-            i
-        ] = satisfaction_classifier.classify_last_n_dialogue(
-            dialogue, last_n=None
-        )
+    metric = SatisfactionMetric()
+    scores: Dict[str, Dict[int, float]] = metric.compute(dialogues)
 
     # Summary
     for agent, agent_scores in scores.items():
diff --git a/scripts/evaluation/satisfaction_metric.py b/scripts/evaluation/satisfaction_metric.py
new file mode 100644
index 00000000..8de9b8b1
--- /dev/null
+++ b/scripts/evaluation/satisfaction_metric.py
@@ -0,0 +1,73 @@
+"""Satisfaction metric class implementation.
+
+Wraps DialogueKit's satisfaction classifier into a `BaseMetric` class.
+"""
+
+from collections import defaultdict
+from typing import Any, Dict, List, Optional, TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from dialoguekit.core.dialogue import Dialogue  # type: ignore
+    from dialoguekit.nlu.models.satisfaction_classifier import (
+        SatisfactionClassifierSVM,
+    )  # type: ignore
+else:
+    try:
+        from dialoguekit.core.dialogue import Dialogue
+        from dialoguekit.nlu.models.satisfaction_classifier import (
+            SatisfactionClassifierSVM,
+        )
+    except Exception:
+        Dialogue = Any
+        SatisfactionClassifierSVM = Any
+
+from scripts.evaluation.base_metric import BaseMetric
+
+
+class SatisfactionMetric(BaseMetric):
+    """Wraps the `SatisfactionClassifierSVM` to compute satisfaction scores.
+
+    Output format matches previous CLI script: { agent_id: { dialogue_index:
+    score, ... }, ... }
+    """
+
+    def __init__(self, classifier: Optional[SatisfactionClassifierSVM] = None):
+        super().__init__()
+        self.classifier = classifier or SatisfactionClassifierSVM()
+
+    @property
+    def name(self) -> str:
+        return "satisfaction"
+
+    def compute(self, dialogues: List[Dialogue]) -> Dict[str, Dict[int, float]]:
+        """Compute satisfaction scores for dialogues.
+
+        Matches the previous CLI output format: agent_id -> dialogue_index ->
+        score
+        """
+        scores: Dict[str, Dict[int, float]] = defaultdict(dict)
+        for i, dialogue in enumerate(dialogues):
+            scores[dialogue.agent_id][
+                i
+            ] = self.classifier.classify_last_n_dialogue(dialogue, last_n=None)
+        return scores
+
+
+class SatisfactionAverageMetric(SatisfactionMetric):
+    """Satisfaction metric reduced to one mean score per agent."""
+
+    @property
+    def name(self) -> str:
+        """Metric name."""
+        return "satisfaction.average"
+
+    def compute(self, dialogues: List[Dialogue]) -> Dict[str, float]:
+        """Compute the mean satisfaction score of each agent.
+
+        An agent without any score is reported as 0.0.
+        """
+        per_agent = super().compute(dialogues)
+        return {
+            agent_id: sum(vals.values()) / len(vals) if vals else 0.0
+            for agent_id, vals in per_agent.items()
+        }
diff --git a/scripts/evaluation/utility_evaluation.py b/scripts/evaluation/utility_evaluation.py
index b97b2b50..ec898678 100644
--- a/scripts/evaluation/utility_evaluation.py
+++ b/scripts/evaluation/utility_evaluation.py
@@ -15,207 +15,10 @@
 """
 
 import argparse
-from collections import defaultdict
 import json
-from typing import Dict, List, Tuple
 
-from confuse import Configuration
-from tqdm import tqdm
-
-from dialoguekit.core.annotated_utterance import AnnotatedUtterance
-from dialoguekit.core.dialogue import Dialogue
-from dialoguekit.core.intent import Intent
-from dialoguekit.nlu.nlu import NLU
-from dialoguekit.participant.participant import DialogueParticipant
 from dialoguekit.utils.dialogue_reader import json_to_dialogues
-from usersimcrs.utils.simulation_utils import get_NLU
-
-
-def annotate_dialogue(
-    dialogue: Dialogue, user_nlu: NLU, agent_nlu: NLU
-) -> Dialogue:
-    """Annotates utterances with dialogue acts.
-
-    Args:
-        dialogue: Dialogue to be annotated.
-        user_nlu: User NLU module.
-        agent_nlu: Agent NLU module.
-
-    Returns:
-        Annotated dialogue.
-    """
-    for i, utterance in enumerate(dialogue.utterances):
-        if not isinstance(utterance, AnnotatedUtterance):
-            dialogue.utterances[i] = AnnotatedUtterance.from_utterance(
-                utterance
-            )
-
-        if len(utterance.dialogue_acts) > 0:
-            continue
-
-        if utterance.participant == DialogueParticipant.USER:
-            dialogue.utterances[
-                i
-            ].dialogue_acts = user_nlu.extract_dialogue_acts(utterance)
-        elif utterance.participant == DialogueParticipant.AGENT:
-            dialogue.utterances[
-                i
-            ].dialogue_acts = agent_nlu.extract_dialogue_acts(utterance)
-        else:
-            raise ValueError(f"Unknown participant: {utterance.participant}")
-    return dialogue
-
-
-def annotate_dialogues(
-    dialogues: List[Dialogue], user_nlu: NLU, agent_nlu: NLU
-) -> List[Dialogue]:
-    """Annotates dialogues with dialogue acts.
-
-    Args:
-        dialogues: Dialogues.
-        user_nlu: User NLU module.
-        agent_nlu: Agent NLU module.
-
-    Returns:
-        Annotated dialogues.
-    """
-    # TODO: Move this to DialogueKit
-    # See: https://github.com/iai-group/UserSimCRS/issues/219
-    return [
-        annotate_dialogue(dialogue, user_nlu, agent_nlu)
-        for dialogue in tqdm(dialogues)
-    ]
-
-
-def _get_recommendation_rounds(
-    dialogue: Dialogue, recommendation_intents: List[Intent]
-) -> List[List[AnnotatedUtterance]]:
-    """Gets utterances per recommendation round.
-
-    Args:
-        dialogue: Dialogue.
-        recommendation_intents: Intents corresponding to recommendation.
-
-    Returns:
-        Utterances per recommendation round.
-    """
-    rounds = []
-    current_round: List[AnnotatedUtterance] = []
-    for utterance in dialogue.utterances:
-        if any(
-            intent in utterance.get_intents()
-            for intent in recommendation_intents
-        ):
-            if current_round:
-                rounds.append(current_round)
-            current_round = [utterance]
-        else:
-            current_round.append(utterance)
-    return rounds
-
-
-def _is_recommendation_accepted(
-    round: List[AnnotatedUtterance],
-    acceptance_intents: List[Intent],
-    rejection_intents: List[Intent],
-) -> bool:
-    """Assesses whether the recommendation was accepted.
-
-    Args:
-        round: Utterances in recommendation round.
-        acceptance_intents: Intents corresponding to acceptance.
-        rejection_intents: Intents corresponding to rejection.
-
-    Returns:
-        True if the recommendation was accepted, False otherwise.
-    """
-    b_accepted = False
-    for utterance in round:
-        if utterance.participant == DialogueParticipant.USER:
-            intents = utterance.get_intents()
-            if any(intent in acceptance_intents for intent in intents):
-                b_accepted = True
-            elif any(intent in rejection_intents for intent in intents):
-                return False
-    return b_accepted
-
-
-def assess_dialogue(
-    dialogue: Dialogue,
-    recommendation_intents: List[Intent],
-    acceptance_intents: List[Intent],
-    rejection_intents: List[Intent],
-) -> Tuple[int, int, int]:
-    """Assesses the utility of the dialogue.
-
-    Args:
-        dialogue: Dialogue.
-        recommendation_intents: Intents corresponding to recommendation.
-        acceptance_intents: Intents corresponding to acceptance.
-        rejection_intents: Intents corresponding to rejection.
-
-    Returns:
-        Tuple of number of accepted recommendations, successful recommendation
-          rounds and total recommendation rounds.
-    """
-    # TODO: Optimize overall assessment to avoid multiple iterations over
-    # utterances.
-    rounds = _get_recommendation_rounds(dialogue, recommendation_intents)
-    successful_rounds = 0
-    for round in rounds:
-        if _is_recommendation_accepted(
-            round, acceptance_intents, rejection_intents
-        ):
-            successful_rounds += 1
-
-    nb_accepted_recommendations = sum(
-        1
-        for utterance in dialogue.utterances
-        if utterance.participant == DialogueParticipant.USER
-        and any(
-            intent in acceptance_intents for intent in utterance.get_intents()
-        )
-    )
-    return nb_accepted_recommendations, successful_rounds, len(rounds)
-
-
-def get_summary(dialogues: List[Dialogue]) -> None:
-    """Displays a summary of the utility evaluation.
-
-    Args:
-        dialogues: Dialogues.
-    """
-    summary: Dict[str, Dict[str, float]] = defaultdict(
-        lambda: {
-            "total_dialogues": 0,
-            "success_rate": 0,
-            "srrr": 0,
-            "rdl": 0,
-        }
-    )
-    for dialogue in dialogues:
-        summary[dialogue.agent_id]["total_dialogues"] += 1
-        summary[dialogue.agent_id]["success_rate"] += dialogue.metadata[
-            "utility"
-        ]["success"]
-        summary[dialogue.agent_id]["srrr"] += dialogue.metadata["utility"][
-            "successful_recommendation_round_ratio"
-        ]
-        summary[dialogue.agent_id]["rdl"] += dialogue.metadata["utility"][
-            "reward_per_dialogue_length"
-        ]
-
-    for agent_id, stats in summary.items():
-        total = stats["total_dialogues"]
-        print(f"Agent: {agent_id}")
-        print(f"\tTotal Dialogues: {total}")
-        print(f"\tSuccess Rate: {stats['success_rate'] / total:.4f}")
-        print(
-            "\tSuccessful Recommendation Round Ratio: "
-            f"{stats['srrr'] / total:.4f}"
-        )
-        print(f"\tReward-per-Dialogue-Length: {stats['rdl'] / total:.4f}")
-        print()
+from scripts.evaluation.utility_metric import UtilityMetric
 
 
 def parse_args() -> argparse.Namespace:
@@ -271,43 +74,13 @@ def parse_args() -> argparse.Namespace:
 
     dialogues = json_to_dialogues(args.annotated_dialogues)
 
-    rejection_intents = [Intent(label) for label in args.reject_intent_labels]
-    acceptance_intents = [Intent(label) for label in args.accept_intent_labels]
-    recommendation_intents = [
-        Intent(label) for label in args.recommendation_intent_labels
-    ]
-
-    # NLU module for user utterances
-    user_nlu_config = Configuration("User NLU Configuration")
-    user_nlu_config.set_file(args.user_nlu_config)
-    user_nlu = get_NLU(user_nlu_config)
-
-    # NLU module for agent utterances
-    agent_nlu_config = Configuration("Agent NLU Configuration")
-    agent_nlu_config.set_file(args.agent_nlu_config)
-    agent_nlu = get_NLU(agent_nlu_config)
-
-    dialogues = annotate_dialogues(dialogues, user_nlu, agent_nlu)
-    for dialogue in dialogues:
-        (
-            nb_accepted_recommendations,
-            successful_rounds,
-            total_rounds,
-        ) = assess_dialogue(
-            dialogue,
-            recommendation_intents,
-            acceptance_intents,
-            rejection_intents,
-        )
-        dialogue.metadata["utility"] = {
-            "success": int(successful_rounds > 0),
-            "successful_recommendation_round_ratio": (
-                successful_rounds / total_rounds if total_rounds > 0 else 0.0
-            ),
-            "reward_per_dialogue_length": (
-                nb_accepted_recommendations / len(dialogue.utterances)
-            ),
-        }
+    metric = UtilityMetric(args.user_nlu_config, args.agent_nlu_config)
+    dialogues = metric.compute(
+        dialogues,
+        recommendation_intent_labels=args.recommendation_intent_labels,
+        acceptance_intent_labels=args.accept_intent_labels,
+        rejection_intent_labels=args.reject_intent_labels,
+    )
 
     if args.output:
         with open(args.output, "w") as f:
@@ -315,4 +88,4 @@ def parse_args() -> argparse.Namespace:
                 [dialogue.to_dict() for dialogue in dialogues], f, indent=2
             )
 
-    get_summary(dialogues)
+    metric.get_summary(dialogues)
diff --git a/scripts/evaluation/utility_metric.py b/scripts/evaluation/utility_metric.py
new file mode 100644
index 00000000..f59d92e6
--- /dev/null
+++ b/scripts/evaluation/utility_metric.py
@@ -0,0 +1,310 @@
+"""Utility metric class implementation.
+
+Encapsulates the logic from `utility_evaluation.py` into a `BaseMetric`.
+"""
+
+from collections import defaultdict
+from typing import Dict, List, Sequence, Tuple
+
+from confuse import Configuration
+
+from dialoguekit.core.annotated_utterance import AnnotatedUtterance
+from dialoguekit.core.dialogue import Dialogue
+from dialoguekit.core.intent import Intent
+from dialoguekit.nlu.nlu import NLU
+from dialoguekit.participant.participant import DialogueParticipant
+from usersimcrs.utils.simulation_utils import get_NLU
+from scripts.evaluation.base_metric import BaseMetric
+
+
+def annotate_dialogue(
+    dialogue: Dialogue, user_nlu: NLU, agent_nlu: NLU
+) -> Dialogue:
+    """Annotates utterances with dialogue acts, in place.
+
+    Utterances that already carry dialogue acts are left untouched.
+
+    Args:
+        dialogue: Dialogue to be annotated.
+        user_nlu: User NLU module.
+        agent_nlu: Agent NLU module.
+
+    Raises:
+        ValueError: If the participant of an utterance is unknown.
+
+    Returns:
+        Annotated dialogue (same object as the input).
+    """
+    for i, utterance in enumerate(dialogue.utterances):
+        if not isinstance(utterance, AnnotatedUtterance):
+            # Rebind so that the checks below use the converted utterance.
+            utterance = AnnotatedUtterance.from_utterance(utterance)
+            dialogue.utterances[i] = utterance
+        if len(utterance.dialogue_acts) > 0:
+            continue
+        if utterance.participant == DialogueParticipant.USER:
+            nlu = user_nlu
+        elif utterance.participant == DialogueParticipant.AGENT:
+            nlu = agent_nlu
+        else:
+            raise ValueError(f"Unknown participant: {utterance.participant}")
+        utterance.dialogue_acts = nlu.extract_dialogue_acts(utterance)
+    return dialogue
+
+
+def annotate_dialogues(
+    dialogues: List[Dialogue], user_nlu: NLU, agent_nlu: NLU
+) -> List[Dialogue]:
+    """Annotates every dialogue in a list with dialogue acts.
+
+    Args:
+        dialogues: Dialogues to be annotated.
+        user_nlu: User NLU module.
+        agent_nlu: Agent NLU module.
+
+    Returns:
+        List with the annotated dialogues, in the input order.
+    """
+    # TODO: Move this to DialogueKit
+    # See: https://github.com/iai-group/UserSimCRS/issues/219
+    annotated: List[Dialogue] = []
+    for dialogue in dialogues:
+        annotated.append(annotate_dialogue(dialogue, user_nlu, agent_nlu))
+    return annotated
+
+
+def _get_recommendation_rounds(
+    dialogue: Dialogue, recommendation_intents: List[Intent]
+) -> List[List[AnnotatedUtterance]]:
+    """Groups utterances into rounds that each start with a recommendation.
+
+    Utterances preceding the first recommendation belong to no round. The
+    last round is kept even though no later recommendation closes it.
+    """
+    rounds: List[List[AnnotatedUtterance]] = []
+    for utterance in dialogue.utterances:
+        intents = utterance.get_intents()
+        if any(intent in intents for intent in recommendation_intents):
+            rounds.append([utterance])  # A recommendation opens a round.
+        elif rounds:
+            rounds[-1].append(utterance)
+    return rounds
+
+
+def _is_recommendation_accepted(
+    round: List[AnnotatedUtterance],
+    acceptance_intents: List[Intent],
+    rejection_intents: List[Intent],
+) -> bool:
+    """Checks that the user accepts, and never only rejects, during a round."""
+    accepted = False
+    users = [u for u in round if u.participant == DialogueParticipant.USER]
+    for intents in (u.get_intents() for u in users):
+        if any(intent in acceptance_intents for intent in intents):
+            accepted = True
+        elif any(intent in rejection_intents for intent in intents):
+            return False
+    return accepted
+
+
+def assess_dialogue(
+    dialogue: Dialogue,
+    recommendation_intents: List[Intent],
+    acceptance_intents: List[Intent],
+    rejection_intents: List[Intent],
+) -> Tuple[int, int, int]:
+    """Assesses the utility of a dialogue.
+
+    Args:
+        dialogue: Dialogue to assess.
+        recommendation_intents: Intents corresponding to recommendation.
+        acceptance_intents: Intents corresponding to acceptance.
+        rejection_intents: Intents corresponding to rejection.
+
+    Returns:
+        Tuple with the number of user utterances that carry an acceptance
+          intent, the number of successful recommendation rounds, and the
+          total number of recommendation rounds.
+    """
+    # TODO: Optimize overall assessment to avoid multiple iterations over
+    # utterances.
+    rounds = _get_recommendation_rounds(dialogue, recommendation_intents)
+    successful_rounds = sum(
+        1
+        for recommendation_round in rounds
+        if _is_recommendation_accepted(
+            recommendation_round, acceptance_intents, rejection_intents
+        )
+    )
+    nb_accepted_recommendations = 0
+    for utterance in dialogue.utterances:
+        if utterance.participant != DialogueParticipant.USER:
+            continue
+        intents = utterance.get_intents()
+        if any(intent in acceptance_intents for intent in intents):
+            nb_accepted_recommendations += 1
+    return nb_accepted_recommendations, successful_rounds, len(rounds)
+
+
+class UtilityMetric(BaseMetric):
+    """Computes utility metrics for dialogues.
+
+    Constructor takes paths to user and agent NLU configuration files.
+    """
+
+    def __init__(self, user_nlu_config_path: str, agent_nlu_config_path: str):
+        """Stores the paths of the user and agent NLU configuration files."""
+        super().__init__()
+        self.user_nlu_config_path = user_nlu_config_path
+        self.agent_nlu_config_path = agent_nlu_config_path
+
+    @property
+    def name(self) -> str:
+        """Metric name."""
+        return "utility"
+
+    def _load_nlus(self) -> Tuple[NLU, NLU]:
+        """Builds the user and agent NLU modules from their config files."""
+        user_nlu_config = Configuration("User NLU Configuration")
+        user_nlu_config.set_file(self.user_nlu_config_path)
+        agent_nlu_config = Configuration("Agent NLU Configuration")
+        agent_nlu_config.set_file(self.agent_nlu_config_path)
+        return get_NLU(user_nlu_config), get_NLU(agent_nlu_config)
+
+    def compute(
+        self,
+        dialogues: List[Dialogue],
+        recommendation_intent_labels: Sequence[str] = ("REC-S", "REC-E"),
+        acceptance_intent_labels: Sequence[str] = ("ACC",),
+        rejection_intent_labels: Sequence[str] = ("REJ",),
+    ) -> List[Dialogue]:
+        """Annotates dialogues and stores utility scores in their metadata.
+
+        Args:
+            dialogues: Dialogues to evaluate.
+            recommendation_intent_labels: Labels of recommendation intents.
+            acceptance_intent_labels: Labels of acceptance intents.
+            rejection_intent_labels: Labels of rejection intents.
+
+        Returns:
+            Dialogues with the scores stored under metadata["utility"].
+        """
+        user_nlu, agent_nlu = self._load_nlus()
+        dialogues = annotate_dialogues(dialogues, user_nlu, agent_nlu)
+
+        recommendation_intents = [
+            Intent(label) for label in recommendation_intent_labels
+        ]
+        acceptance_intents = [
+            Intent(label) for label in acceptance_intent_labels
+        ]
+        rejection_intents = [Intent(label) for label in rejection_intent_labels]
+
+        for dialogue in dialogues:
+            nb_accepted, successful_rounds, total_rounds = assess_dialogue(
+                dialogue,
+                recommendation_intents,
+                acceptance_intents,
+                rejection_intents,
+            )
+            nb_utterances = len(dialogue.utterances)
+            dialogue.metadata["utility"] = {
+                "success": int(successful_rounds > 0),
+                "successful_recommendation_round_ratio": (
+                    successful_rounds / total_rounds if total_rounds else 0.0
+                ),
+                "reward_per_dialogue_length": (
+                    nb_accepted / nb_utterances if nb_utterances else 0.0
+                ),
+            }
+        return dialogues
+
+    def get_summary(self, dialogues: List[Dialogue]) -> None:
+        """Prints per-agent averages of the scores stored by `compute`."""
+        summary: Dict[str, Dict[str, float]] = defaultdict(
+            lambda: {
+                "total_dialogues": 0,
+                "success_rate": 0,
+                "srrr": 0,
+                "rdl": 0,
+            }
+        )
+
+        # Accumulate sums per agent; they are averaged when printed.
+        for dialogue in dialogues:
+            stats = summary[dialogue.agent_id]
+            stats["total_dialogues"] += 1
+            utility = dialogue.metadata["utility"]
+            stats["success_rate"] += utility["success"]
+            stats["srrr"] += utility["successful_recommendation_round_ratio"]
+            stats["rdl"] += utility["reward_per_dialogue_length"]
+
+        for agent_id, stats in summary.items():
+            total = stats["total_dialogues"]
+            print(f"Agent: {agent_id}")
+            print(f"\tTotal Dialogues: {total}")
+            print(f"\tSuccess Rate: {stats['success_rate'] / total:.4f}")
+            print(
+                "\tSuccessful Recommendation Round Ratio: "
+                f"{stats['srrr'] / total:.4f}"
+            )
+            print(f"\tReward-per-Dialogue-Length: {stats['rdl'] / total:.4f}")
+            print()
+
+
+class UtilitySuccessMetric(UtilityMetric):
+    """Reports the utility success flag of every dialogue, per agent."""
+
+    @property
+    def name(self) -> str:
+        return "utility.success"
+
+    def compute(self, dialogues: List[Dialogue], *args, **kwargs):
+        """Returns agent id -> dialogue index -> success flag."""
+        scored = super().compute(dialogues, *args, **kwargs)
+
+        results: Dict[str, Dict[int, int]] = defaultdict(dict)
+        for index, dialogue in enumerate(scored):
+            utility = dialogue.metadata.get("utility", {})
+            results[dialogue.agent_id][index] = int(utility.get("success", 0))
+        return results
+
+
+class UtilitySRRRMetric(UtilityMetric):
+    """Reports the successful recommendation round ratio per dialogue."""
+
+    @property
+    def name(self) -> str:
+        return "utility.successful_recommendation_round_ratio"
+
+    def compute(self, dialogues: List[Dialogue], *args, **kwargs):
+        """Returns agent id -> dialogue index -> successful round ratio."""
+        key = "successful_recommendation_round_ratio"
+        scored = super().compute(dialogues, *args, **kwargs)
+
+        results: Dict[str, Dict[int, float]] = defaultdict(dict)
+        # A dialogue without a stored score is reported as 0.0.
+        for index, dialogue in enumerate(scored):
+            utility = dialogue.metadata.get("utility", {})
+            results[dialogue.agent_id][index] = float(utility.get(key, 0.0))
+        return results
+
+
+class UtilityRDLMetric(UtilityMetric):
+    """Reports the reward-per-dialogue-length of every dialogue, per agent."""
+
+    @property
+    def name(self) -> str:
+        return "utility.reward_per_dialogue_length"
+
+    def compute(self, dialogues: List[Dialogue], *args, **kwargs):
+        """Returns agent id -> dialogue index -> reward per dialogue length."""
+        key = "reward_per_dialogue_length"
+        scored = super().compute(dialogues, *args, **kwargs)
+
+        results: Dict[str, Dict[int, float]] = defaultdict(dict)
+        # A dialogue without a stored score is reported as 0.0.
+        for index, dialogue in enumerate(scored):
+            utility = dialogue.metadata.get("utility", {})
+            results[dialogue.agent_id][index] = float(utility.get(key, 0.0))
+        return results
diff --git a/usersimcrs/nlu/llm/__init__.py b/usersimcrs/nlu/llm/__init__.py
index be592d99..3c608547 100644
--- a/usersimcrs/nlu/llm/__init__.py
+++ b/usersimcrs/nlu/llm/__init__.py
@@ -1,9 +1,19 @@
-"""Module level init for LLM-based NLU components."""
+"""Module level init for LLM-based NLU components.
 
-from usersimcrs.nlu.llm.llm_dialogue_act_extractor import (
-    LLMDialogueActsExtractor,
-)
+Heavy submodules are not imported at package import time, which keeps test
+collection lightweight; `LLMDialogueActsExtractor` is resolved lazily on
+first access.
+"""
 
-__all__ = [
-    "LLMDialogueActsExtractor",
-]
+__all__ = ["LLMDialogueActsExtractor"]
+
+
+def __getattr__(name: str):
+    """Resolves the names listed in `__all__` on first access (PEP 562)."""
+    if name == "LLMDialogueActsExtractor":
+        from usersimcrs.nlu.llm.llm_dialogue_act_extractor import (
+            LLMDialogueActsExtractor,
+        )
+
+        return LLMDialogueActsExtractor
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
diff --git a/usersimcrs/utils/simulation_utils.py b/usersimcrs/utils/simulation_utils.py
index 6121723e..b0ed0c9f 100644
--- a/usersimcrs/utils/simulation_utils.py
+++ b/usersimcrs/utils/simulation_utils.py
@@ -142,9 +142,12 @@ def _get_agenda_based_simulator_config(
 
     ratings = Ratings(item_collection)
     ratings.load_ratings_csv(file_path=config["ratings"].get())
-    historical_ratings, _ = ratings.create_split(
-        config["historical_ratings_ratio"].get(0.8)
-    )
+    raw = config["historical_ratings_ratio"].get()
+    if raw is None:
+        historical_ratio = 0.8
+    else:
+        historical_ratio = float(raw)
+    historical_ratings, _ = ratings.create_split(historical_ratio)
 
     preference_model = SimplePreferenceModel(
         domain,

From 950964c1533d96f05282536b6c868d641a72ee7f Mon Sep 17 00:00:00 2001
From: Ksenia Blokhina <kseniablokhina@MacBook-Pro-Ksenia.local>
Date: Mon, 23 Feb 2026 22:55:25 +0100
Subject: [PATCH 04/38] #232 fix class

---
 scripts/evaluation/base_metric.py | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/scripts/evaluation/base_metric.py b/scripts/evaluation/base_metric.py
index 5f4d2cc0..71edfac0 100644
--- a/scripts/evaluation/base_metric.py
+++ b/scripts/evaluation/base_metric.py
@@ -1,31 +1,35 @@
+"""Abstract base class for dialogue evaluation metrics."""
+
 from abc import ABC, abstractmethod
-from typing import Any
+from typing import Any, List
 
 from dialoguekit.core.dialogue import Dialogue
 
 
 class BaseMetric(ABC):
-    """Abstract base class for dialogue evaluation metrics."""
+    def __init__(self, name: str) -> None:
+        """Initializes the metric.
 
-    def __init__(self) -> None:
-        """Initialize the metric."""
-        pass
+        Args:
+            name: Metric name (e.g., 'quality', 'satisfaction', 'utility').
+        """
+        super().__init__()
+        self._name = name
 
     @property
-    @abstractmethod
     def name(self) -> str:
         """Metric name (e.g., 'quality', 'satisfaction', 'utility')."""
-        pass
+        return self._name
 
     @abstractmethod
-    def compute(self, dialogues: list[Dialogue], **kwargs: Any) -> Any:
-        """Compute the metric over the given dialogues.
+    def compute(self, dialogues: List[Dialogue], **kwargs: Any) -> Any:
+        """Computes the metric over the given dialogues.
 
         Args:
             dialogues: List of dialogues to compute the metric on.
             **kwargs: Additional arguments specific to the metric.
 
         Returns:
-            Metric scores.
+            Metric result; shape is defined by the concrete metric.
         """
-        pass
+        raise NotImplementedError()

From 33624b20890726e2b1b097eb7a232659bb5d5d7e Mon Sep 17 00:00:00 2001
From: Ksenia Blokhina <kseniablokhina@MacBook-Pro-Ksenia.local>
Date: Tue, 24 Feb 2026 10:24:24 +0100
Subject: [PATCH 05/38] #232 fix class

---
 scripts/evaluation/base_metric.py | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/scripts/evaluation/base_metric.py b/scripts/evaluation/base_metric.py
index 71edfac0..4bd6ee4d 100644
--- a/scripts/evaluation/base_metric.py
+++ b/scripts/evaluation/base_metric.py
@@ -7,19 +7,13 @@
 
 
 class BaseMetric(ABC):
-    def __init__(self, name: str) -> None:
+    def __init__(self, name: str):
         """Initializes the metric.
 
         Args:
             name: Metric name (e.g., 'quality', 'satisfaction', 'utility').
         """
-        super().__init__()
-        self._name = name
-
-    @property
-    def name(self) -> str:
-        """Metric name (e.g., 'quality', 'satisfaction', 'utility')."""
-        return self._name
+        self.name = name
 
     @abstractmethod
     def compute(self, dialogues: List[Dialogue], **kwargs: Any) -> Any:

From 27f788866d0c5346e79be549b3123834011d7144 Mon Sep 17 00:00:00 2001
From: Ksenia Blokhina <kseniablokhina@MacBook-Pro-Ksenia.local>
Date: Tue, 24 Feb 2026 11:25:50 +0100
Subject: [PATCH 06/38] #232 add aggregation

---
 scripts/evaluation/base_metric.py | 61 ++++++++++++++++++++++++++-----
 1 file changed, 52 insertions(+), 9 deletions(-)

diff --git a/scripts/evaluation/base_metric.py b/scripts/evaluation/base_metric.py
index 4bd6ee4d..823f33c0 100644
--- a/scripts/evaluation/base_metric.py
+++ b/scripts/evaluation/base_metric.py
@@ -1,29 +1,72 @@
-"""Abstract base class for dialogue evaluation metrics."""
+"""Abstract base class for dialogue evaluation metrics.
 
-from abc import ABC, abstractmethod
-from typing import Any, List
+Subclasses implement only compute_score(dialogue, **kwargs). The base class
+provides aggregation at three levels: per dialogue, per dialogues, and per
+agent.
+"""
 
+from abc import ABC, abstractmethod
+from typing import Any, Dict
 from dialoguekit.core.dialogue import Dialogue
 
 
 class BaseMetric(ABC):
-    def __init__(self, name: str):
+    def __init__(self, name: str) -> None:
         """Initializes the metric.
 
         Args:
-            name: Metric name (e.g., 'quality', 'satisfaction', 'utility').
+            name: Metric name.
         """
         self.name = name
 
     @abstractmethod
-    def compute(self, dialogues: List[Dialogue], **kwargs: Any) -> Any:
-        """Computes the metric over the given dialogues.
+    def compute_score(self, dialogue: Dialogue, **kwargs: Any) -> float:
+        """Computes the metric for a single dialogue.
+
+        Subclasses must implement this method.
 
         Args:
-            dialogues: List of dialogues to compute the metric on.
+            dialogue: Single dialogue to score.
             **kwargs: Additional arguments specific to the metric.
 
         Returns:
-            Metric result; shape is defined by the concrete metric.
+            Score for the dialogue.
+
+        Raises:
+            NotImplementedError: When not implemented by a subclass.
         """
         raise NotImplementedError()
+
+    def compute_scores_for_dialogues(
+        self, dialogues: Dict[str, Dialogue], **kwargs: Any
+    ) -> Dict[str, float]:
+        """Computes the metric for each dialogue in a dict of dialogues.
+
+        Args:
+            dialogues: Dict of dialogues
+            **kwargs: Passed through to compute_score.
+
+        Returns:
+            Dict of scores per dialogue.
+        """
+        return {
+            dialog_id: self.compute_score(dialogue, **kwargs)
+            for dialog_id, dialogue in dialogues.items()
+        }
+
+    def compute_scores_per_agent(
+        self, dialogues_by_agent: Dict[str, Dict[str, Dialogue]], **kwargs: Any
+    ) -> Dict[str, Dict[str, float]]:
+        """Computes the metric per agent over their dialogues.
+
+        Args:
+            dialogues_by_agent: Dict of dialogues per agent.
+            **kwargs: Passed through to compute_score.
+
+        Returns:
+            Dict of scores per agent.
+        """
+        return {
+            agent_id: self.compute_scores_for_dialogues(dialogues, **kwargs)
+            for agent_id, dialogues in dialogues_by_agent.items()
+        }

From c38331a10cf5be336f8b30ea345f5c1addd91862 Mon Sep 17 00:00:00 2001
From: Ksenia Blokhina <kseniablokhina@MacBook-Pro-Ksenia.local>
Date: Tue, 24 Feb 2026 13:17:33 +0100
Subject: [PATCH 07/38] #232 fix methods

---
 scripts/evaluation/base_metric.py | 55 +++++++++++++++----------------
 1 file changed, 26 insertions(+), 29 deletions(-)

diff --git a/scripts/evaluation/base_metric.py b/scripts/evaluation/base_metric.py
index 823f33c0..0ba09c61 100644
--- a/scripts/evaluation/base_metric.py
+++ b/scripts/evaluation/base_metric.py
@@ -1,12 +1,8 @@
-"""Abstract base class for dialogue evaluation metrics.
-
-Subclasses implement only compute_score(dialogue, **kwargs). The base class
-provides aggregation at three levels: per dialogue, per dialogues, and per
-agent.
-"""
+"""Abstract base class for dialogue evaluation metrics."""
 
 from abc import ABC, abstractmethod
-from typing import Any, Dict
+from collections import defaultdict
+from typing import Any, Dict, List
 from dialoguekit.core.dialogue import Dialogue
 
 
@@ -20,53 +16,54 @@ def __init__(self, name: str) -> None:
         self.name = name
 
     @abstractmethod
-    def compute_score(self, dialogue: Dialogue, **kwargs: Any) -> float:
+    def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float:
         """Computes the metric for a single dialogue.
 
-        Subclasses must implement this method.
-
         Args:
             dialogue: Single dialogue to score.
             **kwargs: Additional arguments specific to the metric.
 
-        Returns:
-            Score for the dialogue.
-
         Raises:
             NotImplementedError: When not implemented by a subclass.
+
+        Returns:
+            Score for the dialogue.
         """
         raise NotImplementedError()
 
-    def compute_scores_for_dialogues(
-        self, dialogues: Dict[str, Dialogue], **kwargs: Any
+    def evaluate_dialogues(
+        self, dialogues: List[Dialogue], **kwargs: Any
     ) -> Dict[str, float]:
-        """Computes the metric for each dialogue in a dict of dialogues.
+        """Computes the metric for every dialogue in a given list.
 
         Args:
-            dialogues: Dict of dialogues
-            **kwargs: Passed through to compute_score.
+            dialogues: Dialogues.
+            **kwargs: Additional arguments specific to the metric.
 
         Returns:
-            Dict of scores per dialogue.
+            Dictionary with result per dialogue.
         """
         return {
-            dialog_id: self.compute_score(dialogue, **kwargs)
-            for dialog_id, dialogue in dialogues.items()
+            dialogue.conversation_id: self.evaluate_dialogue(dialogue, **kwargs)
+            for dialogue in dialogues
         }
 
-    def compute_scores_per_agent(
-        self, dialogues_by_agent: Dict[str, Dict[str, Dialogue]], **kwargs: Any
+    def evaluate_agents(
+        self, dialogues: List[Dialogue], **kwargs: Any
     ) -> Dict[str, Dict[str, float]]:
-        """Computes the metric per agent over their dialogues.
+        """Computes the metric for every agent in a given list.
 
         Args:
-            dialogues_by_agent: Dict of dialogues per agent.
-            **kwargs: Passed through to compute_score.
+            dialogues: Dialogues.
+            **kwargs: Additional arguments specific to the metric.
 
         Returns:
-            Dict of scores per agent.
+            Dictionary with result per agent.
         """
+        dialogues_by_agent: Dict[str, List[Dialogue]] = defaultdict(list)
+        for dialogue in dialogues:
+            dialogues_by_agent[dialogue.agent_id].append(dialogue)
         return {
-            agent_id: self.compute_scores_for_dialogues(dialogues, **kwargs)
-            for agent_id, dialogues in dialogues_by_agent.items()
+            agent_id: self.evaluate_dialogues(agent_dialogues, **kwargs)
+            for agent_id, agent_dialogues in dialogues_by_agent.items()
         }

From 901dd5a7dbd96e34fd4bccfa840d50c634a22d10 Mon Sep 17 00:00:00 2001
From: Ksenia Blokhina <kseniablokhina@MacBook-Pro-Ksenia.local>
Date: Tue, 24 Feb 2026 13:40:00 +0100
Subject: [PATCH 08/38] #232 fix nits

---
 scripts/evaluation/base_metric.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/evaluation/base_metric.py b/scripts/evaluation/base_metric.py
index 0ba09c61..f08a3a05 100644
--- a/scripts/evaluation/base_metric.py
+++ b/scripts/evaluation/base_metric.py
@@ -6,7 +6,7 @@
 from dialoguekit.core.dialogue import Dialogue
 
 
-class BaseMetric(ABC):
+class Metric(ABC):
     def __init__(self, name: str) -> None:
         """Initializes the metric.
 
@@ -51,7 +51,7 @@ def evaluate_dialogues(
     def evaluate_agents(
         self, dialogues: List[Dialogue], **kwargs: Any
     ) -> Dict[str, Dict[str, float]]:
-        """Computes the metric for every agent in a given list.
+        """Computes the metric for every agent over their dialogues.
 
         Args:
             dialogues: Dialogues.

From c4efd7994bd7ad44e68f0a2449a96b8f398c748c Mon Sep 17 00:00:00 2001
From: Ksenia Blokhina <kseniablokhina@MacBook-Pro-Ksenia.local>
Date: Tue, 3 Mar 2026 11:30:22 +0100
Subject: [PATCH 09/38] improvement/233-create-classes-for-metrics add classes

---
 scripts/evaluation/base_metric.py             |  58 +-
 scripts/evaluation/quality_evaluation.py      |  20 +-
 scripts/evaluation/quality_metric.py          | 174 ++----
 scripts/evaluation/satisfaction_evaluation.py |  12 +-
 scripts/evaluation/satisfaction_metric.py     | 103 ++--
 scripts/evaluation/utility_evaluation.py      |  45 +-
 scripts/evaluation/utility_metric.py          | 542 +++++++++---------
 tests/evaluation/test_quality_metric.py       |  83 +++
 tests/evaluation/test_satisfaction_metric.py  |  68 +++
 tests/evaluation/test_utility_metric.py       |  77 +++
 10 files changed, 673 insertions(+), 509 deletions(-)
 create mode 100644 tests/evaluation/test_quality_metric.py
 create mode 100644 tests/evaluation/test_satisfaction_metric.py
 create mode 100644 tests/evaluation/test_utility_metric.py

diff --git a/scripts/evaluation/base_metric.py b/scripts/evaluation/base_metric.py
index 4bd6ee4d..f08a3a05 100644
--- a/scripts/evaluation/base_metric.py
+++ b/scripts/evaluation/base_metric.py
@@ -1,29 +1,69 @@
 """Abstract base class for dialogue evaluation metrics."""
 
 from abc import ABC, abstractmethod
-from typing import Any, List
-
+from collections import defaultdict
+from typing import Any, Dict, List
 from dialoguekit.core.dialogue import Dialogue
 
 
-class BaseMetric(ABC):
-    def __init__(self, name: str):
+class Metric(ABC):
+    def __init__(self, name: str) -> None:
         """Initializes the metric.
 
         Args:
-            name: Metric name (e.g., 'quality', 'satisfaction', 'utility').
+            name: Metric name.
         """
         self.name = name
 
     @abstractmethod
-    def compute(self, dialogues: List[Dialogue], **kwargs: Any) -> Any:
-        """Computes the metric over the given dialogues.
+    def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float:
+        """Computes the metric for a single dialogue.
 
         Args:
-            dialogues: List of dialogues to compute the metric on.
+            dialogue: Single dialogue to score.
             **kwargs: Additional arguments specific to the metric.
 
+        Raises:
+            NotImplementedError: When not implemented by a subclass.
+
         Returns:
-            Metric result; shape is defined by the concrete metric.
+            Score for the dialogue.
         """
         raise NotImplementedError()
+
+    def evaluate_dialogues(
+        self, dialogues: List[Dialogue], **kwargs: Any
+    ) -> Dict[str, float]:
+        """Computes the metric for every dialogue in a given list.
+
+        Args:
+            dialogues: Dialogues.
+            **kwargs: Additional arguments specific to the metric.
+
+        Returns:
+            Dictionary with result per dialogue.
+        """
+        return {
+            dialogue.conversation_id: self.evaluate_dialogue(dialogue, **kwargs)
+            for dialogue in dialogues
+        }
+
+    def evaluate_agents(
+        self, dialogues: List[Dialogue], **kwargs: Any
+    ) -> Dict[str, Dict[str, float]]:
+        """Computes the metric for every agent over their dialogues.
+
+        Args:
+            dialogues: Dialogues.
+            **kwargs: Additional arguments specific to the metric.
+
+        Returns:
+            Dictionary with result per agent.
+        """
+        dialogues_by_agent: Dict[str, List[Dialogue]] = defaultdict(list)
+        for dialogue in dialogues:
+            dialogues_by_agent[dialogue.agent_id].append(dialogue)
+        return {
+            agent_id: self.evaluate_dialogues(agent_dialogues, **kwargs)
+            for agent_id, agent_dialogues in dialogues_by_agent.items()
+        }
diff --git a/scripts/evaluation/quality_evaluation.py b/scripts/evaluation/quality_evaluation.py
index 162e1d56..082adde3 100644
--- a/scripts/evaluation/quality_evaluation.py
+++ b/scripts/evaluation/quality_evaluation.py
@@ -15,11 +15,10 @@
 import json
 import os
 from statistics import mean, stdev
-from typing import Dict, List
 
 from dialoguekit.utils.dialogue_reader import json_to_dialogues
 
-from scripts.evaluation.quality_metric import QualityMetric, QualityScoreEncoder
+from scripts.evaluation.quality_metric import QualityMetric
 
 
 def parse_args() -> argparse.Namespace:
@@ -56,19 +55,18 @@ def parse_args() -> argparse.Namespace:
     dialogues = json_to_dialogues(args.dialogues)
 
     metric = QualityMetric(args.ollama_config)
-    scores: Dict[str, Dict[str, List]] = metric.compute(dialogues)
+    scores = metric.evaluate_agents(dialogues)
 
-    # Save scores
+    # Save scores (agent_id -> conversation_id -> score)
     if args.output:
-        os.makedirs(os.path.dirname(args.output), exist_ok=True)
+        os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
         with open(args.output, "w") as f:
-            json.dump(scores, f, indent=2, cls=QualityScoreEncoder)
+            json.dump(scores, f, indent=2)
 
     # Summary
     for agent_id, agent_scores in scores.items():
+        score_values = list(agent_scores.values())
         print(f"Scores for agent {agent_id}:")
-        for aspect_name, aspect_scores in agent_scores.items():
-            print(f"Aspect: {aspect_name}")
-            avg_score = mean([score.score for score in aspect_scores])
-            std_dev = stdev([score.score for score in aspect_scores])
-            print(f"Average score: {avg_score:.2f} (std dev: {std_dev:.2f})")
+        avg_score = mean(score_values)
+        std_dev = stdev(score_values) if len(score_values) >= 2 else 0.0
+        print(f"Average score: {avg_score:.2f} (std dev: {std_dev:.2f})")
diff --git a/scripts/evaluation/quality_metric.py b/scripts/evaluation/quality_metric.py
index 298f1eb1..44b78ed5 100644
--- a/scripts/evaluation/quality_metric.py
+++ b/scripts/evaluation/quality_metric.py
@@ -3,17 +3,14 @@
 Extracted from the original CLI script in `quality_evaluation.py`.
 """
 
-from collections import defaultdict
 import json
-from dataclasses import dataclass
-from typing import Any, Dict, List, Optional
-
-from tqdm import tqdm
+from statistics import mean
+from typing import Any, List, Optional
 
 from dialoguekit.core.dialogue import Dialogue
 from dialoguekit.participant.participant import DialogueParticipant
 
-from scripts.evaluation.base_metric import BaseMetric
+from scripts.evaluation.base_metric import Metric
 from scripts.evaluation.rubrics.quality_rubrics import QualityRubrics
 from usersimcrs.llm_interfaces.ollama_interface import OllamaLLMInterface
 
@@ -31,34 +28,10 @@
 )
 
 
-@dataclass
-class QualityScore:
-    conversation_id: str
-    score: int
-    explanation: str = ""
-
-    def to_dict(self) -> Dict[str, Any]:
-        return {
-            "conversation_id": self.conversation_id,
-            "score": self.score,
-            "score_explanation": self.explanation,
-        }
-
-
-class QualityScoreEncoder(json.JSONEncoder):
-    def default(self, o):
-        if isinstance(o, QualityScore):
-            return o.to_dict()
-        return super().default(o)
-
-
-class QualityMetric(BaseMetric):
+class QualityMetric(Metric):
     """Quality evaluation metric using an LLM backend.
 
-    The class wraps the prompt construction and LLM calls and returns the
-    same structure previously produced by the CLI script:
-
-    { agent_id: { aspect_name: [QualityScore, ...], ... }, ... }
+    Returns scores as floats (average across aspects per dialogue).
     """
 
     def __init__(
@@ -66,15 +39,22 @@ def __init__(
         ollama_config_path: str,
         default_response: str = "",
         rubrics: Optional[List[QualityRubrics]] = None,
+        name: str = "quality",
     ) -> None:
-        super().__init__()
+        super().__init__(name)
         self.ollama_config_path = ollama_config_path
         self.default_response = default_response
         self.rubrics = rubrics or list(QualityRubrics)
-
-    @property
-    def name(self) -> str:
-        return "quality"
+        self._ollama_interface: Optional[OllamaLLMInterface] = None
+
+    def _get_ollama_interface(self) -> OllamaLLMInterface:
+        """Returns (cached) Ollama LLM interface."""
+        if self._ollama_interface is None:
+            self._ollama_interface = OllamaLLMInterface(
+                self.ollama_config_path,
+                default_response=self.default_response,
+            )
+        return self._ollama_interface
 
     def _get_prompt(
         self, grading_rubric: QualityRubrics, dialogue: Dialogue
@@ -101,108 +81,28 @@ def _get_prompt(
         prompt += _PROMPT_EVAL_OUTPUT_FORMAT
         return prompt
 
-    def compute(
-        self, dialogues: List[Dialogue], aspects: Optional[List[str]] = None
-    ) -> Dict[str, Dict[str, List[QualityScore]]]:
-        """Compute quality scores for provided dialogues.
-
-        Args:
-            dialogues: list of Dialogue objects
-            aspects: optional list of aspect names (strings) to evaluate
-
-        Returns:
-            Nested dict: agent_id -> aspect_name -> list[QualityScore]
-        """
-        ollama_interface = OllamaLLMInterface(
-            self.ollama_config_path, default_response=self.default_response
-        )
-
+    def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float:
+        """Returns mean aspect score for a dialogue (1-5; 0.0 if unscored)."""
+        aspects = kwargs.get("aspects")
         if aspects:
             aspect_enums = [QualityRubrics[asp] for asp in aspects]
         else:
             aspect_enums = self.rubrics
 
-        scores: Dict[str, Dict[str, List[QualityScore]]] = defaultdict(
-            lambda: defaultdict(list)
-        )
-
-        for dialogue in tqdm(dialogues):
-            for aspect in aspect_enums:
-                prompt = self._get_prompt(aspect, dialogue)
-                response = ollama_interface.get_llm_api_response(prompt)
-                try:
-                    response = response.replace("\\", "\\\\")
-                    response_dict = json.loads(response)
-                    score = QualityScore(
-                        conversation_id=dialogue.conversation_id,
-                        score=int(response_dict["score"]),
-                        explanation=response_dict.get("score_explanation", ""),
-                    )
-                    scores[dialogue.agent_id][aspect.name].append(score)
-                except Exception:
-                    print(
-                        f"Failed to get score for {aspect} dialogue "
-                        f"{dialogue.conversation_id}: {response}"
-                    )
-
-        return scores
-
-
-class RecommendationRelevanceMetric(QualityMetric):
-    """Quality metric that evaluates only recommendation relevance."""
-
-    def __init__(self, ollama_config_path: str, default_response: str = ""):
-        super().__init__(ollama_config_path, default_response=default_response)
-        self.rubrics = [QualityRubrics.REC_RELEVANCE]
-
-    @property
-    def name(self) -> str:
-        return "quality.recommendation_relevance"
-
-
-class CommunicationStyleMetric(QualityMetric):
-    """Quality metric that evaluates communication style."""
-
-    def __init__(self, ollama_config_path: str, default_response: str = ""):
-        super().__init__(ollama_config_path, default_response=default_response)
-        self.rubrics = [QualityRubrics.COM_STYLE]
-
-    @property
-    def name(self) -> str:
-        return "quality.communication_style"
-
-
-class FluencyMetric(QualityMetric):
-    """Quality metric that evaluates fluency."""
-
-    def __init__(self, ollama_config_path: str, default_response: str = ""):
-        super().__init__(ollama_config_path, default_response=default_response)
-        self.rubrics = [QualityRubrics.FLUENCY]
-
-    @property
-    def name(self) -> str:
-        return "quality.fluency"
-
-
-class ConversationalFlowMetric(QualityMetric):
-    """Quality metric that evaluates conversational flow."""
-
-    def __init__(self, ollama_config_path: str, default_response: str = ""):
-        super().__init__(ollama_config_path, default_response=default_response)
-        self.rubrics = [QualityRubrics.CONV_FLOW]
-
-    @property
-    def name(self) -> str:
-        return "quality.conversational_flow"
-
-
-class OverallSatisfactionQualityMetric(QualityMetric):
-    """Quality metric that evaluates overall satisfaction aspect."""
-
-    def __init__(self, ollama_config_path: str, default_response: str = ""):
-        super().__init__(ollama_config_path, default_response=default_response)
-        self.rubrics = [QualityRubrics.OVERALL_SAT]
-
-    @property
-    def name(self) -> str:
-        return "quality.overall_satisfaction"
+        ollama_interface = self._get_ollama_interface()
+        scores: List[float] = []
+
+        for aspect in aspect_enums:
+            prompt = self._get_prompt(aspect, dialogue)
+            response = ollama_interface.get_llm_api_response(prompt)
+            try:
+                response = response.replace("\\", "\\\\")
+                response_dict = json.loads(response)
+                scores.append(int(response_dict["score"]))
+            except Exception:
+                print(
+                    f"Failed to get score for {aspect} dialogue "
+                    f"{dialogue.conversation_id}: {response}"
+                )
+
+        return mean(scores) if scores else 0.0
diff --git a/scripts/evaluation/satisfaction_evaluation.py b/scripts/evaluation/satisfaction_evaluation.py
index 21fb8e00..4c2d1890 100644
--- a/scripts/evaluation/satisfaction_evaluation.py
+++ b/scripts/evaluation/satisfaction_evaluation.py
@@ -5,8 +5,6 @@
 """
 
 import argparse
-from statistics import mean, stdev
-from typing import Dict
 
 from dialoguekit.utils.dialogue_reader import json_to_dialogues
 from scripts.evaluation.satisfaction_metric import SatisfactionMetric
@@ -36,14 +34,14 @@ def parse_args() -> argparse.Namespace:
     print(f"Loaded {len(dialogues)} dialogues.")
 
     metric = SatisfactionMetric()
-    scores: Dict[str, Dict[int, float]] = metric.compute(dialogues)
+    scores = metric.evaluate_agents(dialogues)
 
     # Summary
     for agent, agent_scores in scores.items():
-        avg_score = mean(agent_scores.values())
-        stdev_score = stdev(agent_scores.values())
-        max_score = max(agent_scores.values())
-        min_score = min(agent_scores.values())
+        avg_score = metric.get_average(agent_scores)
+        stdev_score = metric.get_stdev(agent_scores)
+        max_score = metric.get_max(agent_scores)
+        min_score = metric.get_min(agent_scores)
         print(f"Agent: {agent} / Num. dialogues: {len(agent_scores)}")
         print(f"Min score: {min_score}")
         print(f"Max score: {max_score}")
diff --git a/scripts/evaluation/satisfaction_metric.py b/scripts/evaluation/satisfaction_metric.py
index 8de9b8b1..e7125696 100644
--- a/scripts/evaluation/satisfaction_metric.py
+++ b/scripts/evaluation/satisfaction_metric.py
@@ -1,73 +1,52 @@
 """Satisfaction metric class implementation.
 
-Wraps DialogueKit's satisfaction classifier into a `BaseMetric` class.
+Wraps DialogueKit's satisfaction classifier into a `Metric` class.
 """
 
-from collections import defaultdict
-from typing import Any, Dict, List, Optional, TYPE_CHECKING
+from statistics import mean, stdev
+from typing import Any, Dict, Optional
 
-if TYPE_CHECKING:
-    from dialoguekit.core.dialogue import Dialogue  # type: ignore
-    from dialoguekit.nlu.models.satisfaction_classifier import (
-        SatisfactionClassifierSVM,
-    )  # type: ignore
-else:
-    try:
-        from dialoguekit.core.dialogue import Dialogue
-        from dialoguekit.nlu.models.satisfaction_classifier import (
-            SatisfactionClassifierSVM,
-        )
-    except Exception:
-        Dialogue = Any
-        SatisfactionClassifierSVM = Any
+from dialoguekit.core.dialogue import Dialogue  # type: ignore
+from dialoguekit.nlu.models.satisfaction_classifier import (
+    SatisfactionClassifierSVM,
+)
 
-from scripts.evaluation.base_metric import BaseMetric
+from scripts.evaluation.base_metric import Metric
 
 
-class SatisfactionMetric(BaseMetric):
-    """Wraps the `SatisfactionClassifierSVM` to compute satisfaction scores.
+class SatisfactionMetric(Metric):
+    """Wraps the `SatisfactionClassifierSVM` to compute satisfaction scores."""
 
-    Output format matches previous CLI script: { agent_id: { dialogue_index:
-    score, ... }, ... }
-    """
-
-    def __init__(self, classifier: Optional[SatisfactionClassifierSVM] = None):
-        super().__init__()
+    def __init__(
+        self,
+        classifier: Optional[SatisfactionClassifierSVM] = None,
+        name: str = "satisfaction",
+    ):
+        super().__init__(name)
         self.classifier = classifier or SatisfactionClassifierSVM()
 
-    @property
-    def name(self) -> str:
-        return "satisfaction"
-
-    def compute(self, dialogues: List[Dialogue]) -> Dict[str, Dict[int, float]]:
-        """Compute satisfaction scores for dialogues.
-
-        Matches the previous CLI output format: agent_id -> dialogue_index ->
-        score
-        """
-        scores: Dict[str, Dict[int, float]] = defaultdict(dict)
-        for i, dialogue in enumerate(dialogues):
-            scores[dialogue.agent_id][
-                i
-            ] = self.classifier.classify_last_n_dialogue(dialogue, last_n=None)
-        return scores
-
-
-class SatisfactionAverageMetric(SatisfactionMetric):
-    """Aggregates satisfaction scores and returns average per agent."""
-
-    @property
-    def name(self) -> str:
-        return "satisfaction.average"
-
-    def compute(self, dialogues: List[Dialogue]) -> Dict[str, float]:
-        raw = super().compute(dialogues)
-        averages: Dict[str, float] = {}
-        for agent_id, agent_scores in raw.items():
-            if len(agent_scores) == 0:
-                averages[agent_id] = 0.0
-            else:
-                averages[agent_id] = sum(agent_scores.values()) / len(
-                    agent_scores
-                )
-        return averages
+    def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float:
+        """Computes the satisfaction score for a single dialogue."""
+        return self.classifier.classify_last_n_dialogue(dialogue, last_n=None)
+
+    @staticmethod
+    def get_average(agent_scores: Dict[str, float]) -> float:
+        """Returns the average score for an agent's dialogues."""
+        return mean(agent_scores.values()) if agent_scores else 0.0
+
+    @staticmethod
+    def get_stdev(agent_scores: Dict[str, float]) -> float:
+        """Returns sample standard deviation of scores (0.0 if fewer than 2)."""
+        if len(agent_scores) < 2:
+            return 0.0
+        return stdev(agent_scores.values())
+
+    @staticmethod
+    def get_max(agent_scores: Dict[str, float]) -> float:
+        """Returns the maximum score for an agent's dialogues."""
+        return max(agent_scores.values()) if agent_scores else 0.0
+
+    @staticmethod
+    def get_min(agent_scores: Dict[str, float]) -> float:
+        """Returns the minimum score for an agent's dialogues."""
+        return min(agent_scores.values()) if agent_scores else 0.0
diff --git a/scripts/evaluation/utility_evaluation.py b/scripts/evaluation/utility_evaluation.py
index ec898678..4106624c 100644
--- a/scripts/evaluation/utility_evaluation.py
+++ b/scripts/evaluation/utility_evaluation.py
@@ -16,11 +16,48 @@
 
 import argparse
 import json
+from collections import defaultdict
+from typing import Dict
 
 from dialoguekit.utils.dialogue_reader import json_to_dialogues
 from scripts.evaluation.utility_metric import UtilityMetric
 
 
+def get_summary(
+    scores: Dict[str, Dict[str, Dict[str, float]]],
+) -> None:
+    """Displays a summary of the utility evaluation.
+
+    Args:
+        scores: Agent_id -> conversation_id -> utility metrics dict.
+    """
+    summary: dict = defaultdict(
+        lambda: {"total_dialogues": 0, "success_rate": 0, "srrr": 0, "rdl": 0}
+    )
+    for agent_id, agent_scores in scores.items():
+        for conv_metrics in agent_scores.values():
+            summary[agent_id]["total_dialogues"] += 1
+            summary[agent_id]["success_rate"] += conv_metrics["success"]
+            summary[agent_id]["srrr"] += conv_metrics[
+                "successful_recommendation_round_ratio"
+            ]
+            summary[agent_id]["rdl"] += conv_metrics[
+                "reward_per_dialogue_length"
+            ]
+
+    for agent_id, stats in summary.items():
+        total = stats["total_dialogues"]
+        print(f"Agent: {agent_id}")
+        print(f"\tTotal Dialogues: {total}")
+        print(f"\tSuccess Rate: {stats['success_rate'] / total:.4f}")
+        print(
+            "\tSuccessful Recommendation Round Ratio: "
+            f"{stats['srrr'] / total:.4f}"
+        )
+        print(f"\tReward-per-Dialogue-Length: {stats['rdl'] / total:.4f}")
+        print()
+
+
 def parse_args() -> argparse.Namespace:
     """Parses command-line arguments.
 
@@ -75,7 +112,7 @@ def parse_args() -> argparse.Namespace:
     dialogues = json_to_dialogues(args.annotated_dialogues)
 
     metric = UtilityMetric(args.user_nlu_config, args.agent_nlu_config)
-    dialogues = metric.compute(
+    scores = metric.evaluate_agents(
         dialogues,
         recommendation_intent_labels=args.recommendation_intent_labels,
         acceptance_intent_labels=args.accept_intent_labels,
@@ -84,8 +121,6 @@ def parse_args() -> argparse.Namespace:
 
     if args.output:
         with open(args.output, "w") as f:
-            json.dump(
-                [dialogue.to_dict() for dialogue in dialogues], f, indent=2
-            )
+            json.dump(scores, f, indent=2)
 
-    metric.get_summary(dialogues)
+    get_summary(scores)
diff --git a/scripts/evaluation/utility_metric.py b/scripts/evaluation/utility_metric.py
index f59d92e6..cf17604e 100644
--- a/scripts/evaluation/utility_metric.py
+++ b/scripts/evaluation/utility_metric.py
@@ -1,10 +1,9 @@
 """Utility metric class implementation.
 
-Encapsulates the logic from `utility_evaluation.py` into a `BaseMetric`.
+Encapsulates the logic from `utility_evaluation.py` into a `Metric`.
 """
 
-from collections import defaultdict
-from typing import Dict, List, Tuple
+from typing import Any, Dict, List, Optional, Tuple, cast
 
 from confuse import Configuration
 
@@ -14,297 +13,284 @@
 from dialoguekit.nlu.nlu import NLU
 from dialoguekit.participant.participant import DialogueParticipant
 from usersimcrs.utils.simulation_utils import get_NLU
-from scripts.evaluation.base_metric import BaseMetric
+from scripts.evaluation.base_metric import Metric
 
 
-def annotate_dialogue(
-    dialogue: Dialogue, user_nlu: NLU, agent_nlu: NLU
-) -> Dialogue:
-    """Annotates utterances with dialogue acts.
-
-    Args:
-        dialogue: Dialogue to be annotated.
-        user_nlu: User NLU module.
-        agent_nlu: Agent NLU module.
-
-    Returns:
-        Annotated dialogue.
-    """
-    for i, utterance in enumerate(dialogue.utterances):
-        if not isinstance(utterance, AnnotatedUtterance):
-            dialogue.utterances[i] = AnnotatedUtterance.from_utterance(
-                utterance
-            )
-
-        if len(utterance.dialogue_acts) > 0:
-            continue
-
-        if utterance.participant == DialogueParticipant.USER:
-            dialogue.utterances[
-                i
-            ].dialogue_acts = user_nlu.extract_dialogue_acts(utterance)
-        elif utterance.participant == DialogueParticipant.AGENT:
-            dialogue.utterances[
-                i
-            ].dialogue_acts = agent_nlu.extract_dialogue_acts(utterance)
-        else:
-            raise ValueError(f"Unknown participant: {utterance.participant}")
-    return dialogue
-
-
-def annotate_dialogues(
-    dialogues: List[Dialogue], user_nlu: NLU, agent_nlu: NLU
-) -> List[Dialogue]:
-    """Annotates dialogues with dialogue acts.
-
-    Args:
-        dialogues: Dialogues.
-        user_nlu: User NLU module.
-        agent_nlu: Agent NLU module.
-
-    Returns:
-        Annotated dialogues.
-    """
-    # TODO: Move this to DialogueKit
-    # See: https://github.com/iai-group/UserSimCRS/issues/219
-    return [
-        annotate_dialogue(dialogue, user_nlu, agent_nlu)
-        for dialogue in dialogues
-    ]
-
-
-def _get_recommendation_rounds(
-    dialogue: Dialogue, recommendation_intents: List[Intent]
-) -> List[List[AnnotatedUtterance]]:
-    rounds: List[List[AnnotatedUtterance]] = []
-    current_round: List[AnnotatedUtterance] = []
-    for utterance in dialogue.utterances:
-        if any(
-            intent in utterance.get_intents()
-            for intent in recommendation_intents
-        ):
-            if current_round:
-                rounds.append(current_round)
-            current_round = [utterance]
-        else:
-            current_round.append(utterance)
-    return rounds
-
-
-def _is_recommendation_accepted(
-    round: List[AnnotatedUtterance],
-    acceptance_intents: List[Intent],
-    rejection_intents: List[Intent],
-) -> bool:
-    b_accepted = False
-    for utterance in round:
-        if utterance.participant == DialogueParticipant.USER:
-            intents = utterance.get_intents()
-            if any(intent in acceptance_intents for intent in intents):
-                b_accepted = True
-            elif any(intent in rejection_intents for intent in intents):
-                return False
-    return b_accepted
-
-
-def assess_dialogue(
-    dialogue: Dialogue,
-    recommendation_intents: List[Intent],
-    acceptance_intents: List[Intent],
-    rejection_intents: List[Intent],
-) -> Tuple[int, int, int]:
-    """Assesses the utility of the dialogue.
-
-    Args:
-        dialogue: Dialogue.
-        recommendation_intents: Intents corresponding to recommendation.
-        acceptance_intents: Intents corresponding to acceptance.
-        rejection_intents: Intents corresponding to rejection.
-
-    Returns:
-        Tuple of number of accepted recommendations, successful recommendation
-          rounds and total recommendation rounds.
-    """
-    # TODO: Optimize overall assessment to avoid multiple iterations over
-    # utterances.
-    rounds = _get_recommendation_rounds(dialogue, recommendation_intents)
-    successful_rounds = 0
-    for round in rounds:
-        if _is_recommendation_accepted(
-            round, acceptance_intents, rejection_intents
-        ):
-            successful_rounds += 1
-
-    nb_accepted_recommendations = sum(
-        1
-        for utterance in dialogue.utterances
-        if utterance.participant == DialogueParticipant.USER
-        and any(
-            intent in acceptance_intents for intent in utterance.get_intents()
-        )
-    )
-    return nb_accepted_recommendations, successful_rounds, len(rounds)
-
-
-class UtilityMetric(BaseMetric):
+class UtilityMetric(Metric):
     """Computes utility metrics for dialogues.
 
     Constructor takes paths to user and agent NLU configuration files.
     """
 
-    def __init__(self, user_nlu_config_path: str, agent_nlu_config_path: str):
-        super().__init__()
+    def __init__(
+        self,
+        user_nlu_config_path: str,
+        agent_nlu_config_path: str,
+        name: str = "utility",
+    ):
+        super().__init__(name)
         self.user_nlu_config_path = user_nlu_config_path
         self.agent_nlu_config_path = agent_nlu_config_path
+        self._user_nlu: Optional[NLU] = None
+        self._agent_nlu: Optional[NLU] = None
+
+    def _annotate_dialogue(
+        self, dialogue: Dialogue, user_nlu: NLU, agent_nlu: NLU
+    ) -> Dialogue:
+        """Annotates utterances with dialogue acts.
+
+        Args:
+            dialogue: Dialogue to be annotated.
+            user_nlu: User NLU module.
+            agent_nlu: Agent NLU module.
+
+        Returns:
+            Annotated dialogue.
+        """
+        for i, utterance in enumerate(dialogue.utterances):
+            if not isinstance(utterance, AnnotatedUtterance):
+                dialogue.utterances[i] = AnnotatedUtterance.from_utterance(
+                    utterance
+                )
 
-    @property
-    def name(self) -> str:
-        return "utility"
-
-    def _load_nlus(self) -> Tuple[NLU, NLU]:
-        user_nlu_config = Configuration("User NLU Configuration")
-        user_nlu_config.set_file(self.user_nlu_config_path)
-        user_nlu = get_NLU(user_nlu_config)
-
-        agent_nlu_config = Configuration("Agent NLU Configuration")
-        agent_nlu_config.set_file(self.agent_nlu_config_path)
-        agent_nlu = get_NLU(agent_nlu_config)
-
-        return user_nlu, agent_nlu
+            if len(utterance.dialogue_acts) > 0:
+                continue
+
+            if utterance.participant == DialogueParticipant.USER:
+                dialogue.utterances[
+                    i
+                ].dialogue_acts = user_nlu.extract_dialogue_acts(utterance)
+            elif utterance.participant == DialogueParticipant.AGENT:
+                dialogue.utterances[
+                    i
+                ].dialogue_acts = agent_nlu.extract_dialogue_acts(utterance)
+            else:
+                raise ValueError(
+                    f"Unknown participant: {utterance.participant}"
+                )
+        return dialogue
 
-    def compute(
-        self,
-        dialogues: List[Dialogue],
-        recommendation_intent_labels: List[str] = ["REC-S", "REC-E"],
-        acceptance_intent_labels: List[str] = ["ACC"],
-        rejection_intent_labels: List[str] = ["REJ"],
+    def _annotate_dialogues(
+        self, dialogues: List[Dialogue], user_nlu: NLU, agent_nlu: NLU
     ) -> List[Dialogue]:
-        user_nlu, agent_nlu = self._load_nlus()
-
-        dialogues = annotate_dialogues(dialogues, user_nlu, agent_nlu)
-
-        recommendation_intents = [
-            Intent(label) for label in recommendation_intent_labels
+        """Annotates dialogues with dialogue acts.
+
+        Args:
+            dialogues: Dialogues.
+            user_nlu: User NLU module.
+            agent_nlu: Agent NLU module.
+
+        Returns:
+            Annotated dialogues.
+        """
+        # TODO: Move this to DialogueKit
+        # See: https://github.com/iai-group/UserSimCRS/issues/219
+        return [
+            self._annotate_dialogue(dialogue, user_nlu, agent_nlu)
+            for dialogue in dialogues
         ]
-        acceptance_intents = [
-            Intent(label) for label in acceptance_intent_labels
-        ]
-        rejection_intents = [Intent(label) for label in rejection_intent_labels]
-
-        for dialogue in dialogues:
-            (
-                nb_accepted_recommendations,
-                successful_rounds,
-                total_rounds,
-            ) = assess_dialogue(
-                dialogue,
-                recommendation_intents,
-                acceptance_intents,
-                rejection_intents,
-            )
-            dialogue.metadata["utility"] = {
-                "success": int(successful_rounds > 0),
-                "successful_recommendation_round_ratio": (
-                    successful_rounds / total_rounds
-                    if total_rounds > 0
-                    else 0.0
-                ),
-                "reward_per_dialogue_length": (
-                    nb_accepted_recommendations / len(dialogue.utterances)
-                    if len(dialogue.utterances) > 0
-                    else 0.0
-                ),
-            }
-
-        return dialogues
-
-    def get_summary(self, dialogues: List[Dialogue]) -> None:
-        summary: Dict[str, Dict[str, float]] = defaultdict(
-            lambda: {
-                "total_dialogues": 0,
-                "success_rate": 0,
-                "srrr": 0,
-                "rdl": 0,
-            }
-        )
-        for dialogue in dialogues:
-            summary[dialogue.agent_id]["total_dialogues"] += 1
-            summary[dialogue.agent_id]["success_rate"] += dialogue.metadata[
-                "utility"
-            ]["success"]
-            summary[dialogue.agent_id]["srrr"] += dialogue.metadata["utility"][
-                "successful_recommendation_round_ratio"
-            ]
-            summary[dialogue.agent_id]["rdl"] += dialogue.metadata["utility"][
-                "reward_per_dialogue_length"
-            ]
-
-        for agent_id, stats in summary.items():
-            total = stats["total_dialogues"]
-            print(f"Agent: {agent_id}")
-            print(f"\tTotal Dialogues: {total}")
-            print(f"\tSuccess Rate: {stats['success_rate'] / total:.4f}")
-            print(
-                "\tSuccessful Recommendation Round Ratio: "
-                f"{stats['srrr'] / total:.4f}"
-            )
-            print(f"\tReward-per-Dialogue-Length: {stats['rdl'] / total:.4f}")
-            print()
-
-
-class UtilitySuccessMetric(UtilityMetric):
-    """Extracts per-dialogue success flag from utility analysis."""
 
-    @property
-    def name(self) -> str:
-        return "utility.success"
-
-    def compute(self, dialogues: List[Dialogue], *args, **kwargs):
-        dialogues = super().compute(dialogues, *args, **kwargs)
-
-        results: Dict[str, Dict[int, int]] = defaultdict(dict)
-        for i, dialogue in enumerate(dialogues):
-            results[dialogue.agent_id][i] = int(
-                dialogue.metadata.get("utility", {}).get("success", 0)
+    def _get_recommendation_rounds(
+        self, dialogue: Dialogue, recommendation_intents: List[Intent]
+    ) -> List[List[AnnotatedUtterance]]:
+        """Gets utterances per recommendation round.
+
+        Args:
+            dialogue: Dialogue.
+            recommendation_intents: Intents corresponding to recommendation.
+
+        Returns:
+            Utterances per recommendation round.
+        """
+        rounds: List[List[AnnotatedUtterance]] = []
+        current_round: List[AnnotatedUtterance] = []
+        for utterance in dialogue.utterances:
+            if any(
+                intent in utterance.get_intents()
+                for intent in recommendation_intents
+            ):
+                if current_round:
+                    rounds.append(current_round)
+                current_round = [utterance]
+            else:
+                current_round.append(utterance)
+        return rounds
+
+    def _is_recommendation_accepted(
+        self,
+        round: List[AnnotatedUtterance],
+        acceptance_intents: List[Intent],
+        rejection_intents: List[Intent],
+    ) -> bool:
+        """Assesses whether the recommendation was accepted.
+
+        Args:
+            round: Utterances in recommendation round.
+            acceptance_intents: Intents corresponding to acceptance.
+            rejection_intents: Intents corresponding to rejection.
+
+        Returns:
+            True if the recommendation was accepted, False otherwise.
+        """
+        b_accepted = False
+        for utterance in round:
+            if utterance.participant == DialogueParticipant.USER:
+                intents = utterance.get_intents()
+                if any(intent in acceptance_intents for intent in intents):
+                    b_accepted = True
+                elif any(intent in rejection_intents for intent in intents):
+                    return False
+        return b_accepted
+
+    def _assess_dialogue(
+        self,
+        dialogue: Dialogue,
+        recommendation_intents: List[Intent],
+        acceptance_intents: List[Intent],
+        rejection_intents: List[Intent],
+    ) -> Tuple[int, int, int]:
+        """Assesses the utility of the dialogue.
+
+        Args:
+            dialogue: Dialogue.
+            recommendation_intents: Intents corresponding to recommendation.
+            acceptance_intents: Intents corresponding to acceptance.
+            rejection_intents: Intents corresponding to rejection.
+
+        Returns:
+            Tuple of number of accepted recommendations, successful
+                recommendation rounds and total recommendation rounds.
+        """
+        # TODO: Optimize overall assessment to avoid multiple iterations over
+        # utterances.
+        rounds = self._get_recommendation_rounds(
+            dialogue, recommendation_intents
+        )
+        successful_rounds = 0
+        for round in rounds:
+            if self._is_recommendation_accepted(
+                round, acceptance_intents, rejection_intents
+            ):
+                successful_rounds += 1
+
+        nb_accepted_recommendations = sum(
+            1
+            for utterance in dialogue.utterances
+            if utterance.participant == DialogueParticipant.USER
+            and any(
+                intent in acceptance_intents
+                for intent in utterance.get_intents()
             )
-        return results
-
-
-class UtilitySRRRMetric(UtilityMetric):
-    """Extracts successful recommendation round ratio per dialogue."""
-
-    @property
-    def name(self) -> str:
-        return "utility.successful_recommendation_round_ratio"
+        )
+        return nb_accepted_recommendations, successful_rounds, len(rounds)
 
-    def compute(self, dialogues: List[Dialogue], *args, **kwargs):
-        dialogues = super().compute(dialogues, *args, **kwargs)
+    def _load_nlus(self) -> Tuple[NLU, NLU]:
+        """Returns (cached) user and agent NLU modules."""
+        if self._user_nlu is None:
+            # NLU module for user utterances
+            user_nlu_config = Configuration("User NLU Configuration")
+            user_nlu_config.set_file(self.user_nlu_config_path)
+            self._user_nlu = get_NLU(user_nlu_config)
+        if self._agent_nlu is None:
+            # NLU module for agent utterances
+            agent_nlu_config = Configuration("Agent NLU Configuration")
+            agent_nlu_config.set_file(self.agent_nlu_config_path)
+            self._agent_nlu = get_NLU(agent_nlu_config)
+        return self._user_nlu, self._agent_nlu
+
+    def _get_intent_lists(self, **kwargs: Any) -> Tuple[List[Intent], ...]:
+        """Builds intent lists from kwargs."""
+        rec_labels = kwargs.get(
+            "recommendation_intent_labels", ["REC-S", "REC-E"]
+        )
+        acc_labels = kwargs.get("acceptance_intent_labels", ["ACC"])
+        rej_labels = kwargs.get("rejection_intent_labels", ["REJ"])
+        return (
+            [Intent(label) for label in rec_labels],
+            [Intent(label) for label in acc_labels],
+            [Intent(label) for label in rej_labels],
+        )
 
-        results: Dict[str, Dict[int, float]] = defaultdict(dict)
-        for i, dialogue in enumerate(dialogues):
-            results[dialogue.agent_id][i] = float(
-                dialogue.metadata.get("utility", {}).get(
-                    "successful_recommendation_round_ratio", 0.0
-                )
+    def evaluate_dialogues(
+        self, dialogues: List[Dialogue], **kwargs: Any
+    ) -> Dict[str, Dict[str, float]]:
+        """Computes all utility metrics for every dialogue.
+
+        Overrides base to return full metrics dict per dialogue rather than
+        a single float, since utility evaluation aggregates SR, SRRR, and RDL.
+
+        Returns:
+            conversation_id -> metrics dict with keys: success,
+            successful_recommendation_round_ratio, reward_per_dialogue_length.
+        """
+        return {
+            dialogue.conversation_id: self._get_utility_metrics(
+                dialogue, **kwargs
             )
-        return results
-
-
-class UtilityRDLMetric(UtilityMetric):
-    """Extracts reward-per-dialogue-length per dialogue."""
-
-    @property
-    def name(self) -> str:
-        return "utility.reward_per_dialogue_length"
-
-    def compute(self, dialogues: List[Dialogue], *args, **kwargs):
-        dialogues = super().compute(dialogues, *args, **kwargs)
-
-        results: Dict[str, Dict[int, float]] = defaultdict(dict)
-        for i, dialogue in enumerate(dialogues):
-            results[dialogue.agent_id][i] = float(
-                dialogue.metadata.get("utility", {}).get(
-                    "reward_per_dialogue_length", 0.0
-                )
+            for dialogue in dialogues
+        }
+
+    def evaluate_agents(
+        self, dialogues: List[Dialogue], **kwargs: Any
+    ) -> Dict[str, Dict[str, Dict[str, float]]]:
+        """Computes full per-dialogue utility metrics, grouped by agent.
+
+        Returns:
+            agent_id -> conversation_id -> metrics dict with keys: success,
+            successful_recommendation_round_ratio, reward_per_dialogue_length.
+        """
+        result = super().evaluate_agents(dialogues, **kwargs)
+        return cast(Dict[str, Dict[str, Dict[str, float]]], result)
+
+    def _get_utility_metrics(
+        self, dialogue: Dialogue, **kwargs: Any
+    ) -> Dict[str, float]:
+        """Returns full utility dict for one dialogue."""
+        user_nlu, agent_nlu = self._load_nlus()
+        self._annotate_dialogue(dialogue, user_nlu, agent_nlu)
+        (
+            recommendation_intents,
+            acceptance_intents,
+            rejection_intents,
+        ) = self._get_intent_lists(**kwargs)
+        (
+            nb_accepted_recommendations,
+            successful_rounds,
+            total_rounds,
+        ) = self._assess_dialogue(
+            dialogue,
+            recommendation_intents,
+            acceptance_intents,
+            rejection_intents,
+        )
+        return {
+            "success": float(successful_rounds > 0),
+            "successful_recommendation_round_ratio": (
+                successful_rounds / total_rounds if total_rounds > 0 else 0.0
+            ),
+            "reward_per_dialogue_length": (
+                nb_accepted_recommendations / len(dialogue.utterances)
+                if dialogue.utterances
+                else 0.0
+            ),
+        }
+
+    def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float:
+        """Computes one utility metric for a single dialogue.
+
+        Args:
+            dialogue: Dialogue to evaluate.
+            metric: One of "success", "successful_recommendation_round_ratio",
+                "reward_per_dialogue_length". Default "success".
+
+        Returns:
+            The selected metric value as float.
+        """
+        metrics = self._get_utility_metrics(dialogue, **kwargs)
+        metric = kwargs.get("metric", "success")
+        if metric not in metrics:
+            raise ValueError(
+                f"Unknown metric '{metric}'. "
+                f"Expected one of {list(metrics.keys())}"
             )
-        return results
+        return metrics[metric]
diff --git a/tests/evaluation/test_quality_metric.py b/tests/evaluation/test_quality_metric.py
new file mode 100644
index 00000000..2c409806
--- /dev/null
+++ b/tests/evaluation/test_quality_metric.py
@@ -0,0 +1,83 @@
+"""Tests for QualityMetric."""
+
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from dialoguekit.utils.dialogue_reader import json_to_dialogues
+
+from scripts.evaluation.quality_metric import QualityMetric
+
+
+@pytest.fixture
+def dialogues():
+    """Load test dialogues."""
+    return json_to_dialogues(
+        "tests/data/annotated_dialogues.json",
+        agent_ids=["Agent"],
+        user_ids=["User"],
+    )
+
+
+@pytest.fixture
+def mock_ollama():
+    """Mock Ollama LLM interface that returns fixed score JSON."""
+    interface = MagicMock()
+    interface.get_llm_api_response.return_value = (
+        '{"score": 4, "score_explanation": "good"}'
+    )
+    return interface
+
+
+@pytest.fixture
+def metric(mock_ollama):
+    """QualityMetric with mocked Ollama interface."""
+    with patch.object(
+        QualityMetric, "_get_ollama_interface", return_value=mock_ollama
+    ):
+        yield QualityMetric(ollama_config_path="dummy_config.json")
+
+
+def test_evaluate_dialogue(
+    metric: QualityMetric, mock_ollama, dialogues
+) -> None:
+    """Test evaluate_dialogue returns mean of aspect scores for a dialogue."""
+    dialogue = dialogues[0]
+    score = metric.evaluate_dialogue(dialogue)
+    assert score == 4.0
+    assert mock_ollama.get_llm_api_response.call_count == len(metric.rubrics)
+
+
+def test_evaluate_dialogue_with_aspects(
+    metric: QualityMetric, mock_ollama, dialogues
+) -> None:
+    """Test evaluate_dialogue with aspects kwarg calls LLM only for aspects."""
+    dialogue = dialogues[0]
+    aspects = ["REC_RELEVANCE", "FLUENCY"]
+    score = metric.evaluate_dialogue(dialogue, aspects=aspects)
+    assert score == 4.0
+    assert mock_ollama.get_llm_api_response.call_count == 2
+
+
+def test_evaluate_dialogues(
+    metric: QualityMetric, mock_ollama, dialogues
+) -> None:
+    """Test evaluate_dialogues returns conversation_id -> score."""
+    result = metric.evaluate_dialogues(dialogues)
+    assert len(result) == len(dialogues)
+    for dialogue in dialogues:
+        assert dialogue.conversation_id in result
+        assert result[dialogue.conversation_id] == 4.0
+    expected_calls = len(dialogues) * len(metric.rubrics)
+    assert mock_ollama.get_llm_api_response.call_count == expected_calls
+
+
+def test_evaluate_agents(metric: QualityMetric, dialogues) -> None:
+    """Test evaluate_agents returns agent_id -> {conversation_id -> score}."""
+    result = metric.evaluate_agents(dialogues)
+    assert "Agent" in result
+    agent_scores = result["Agent"]
+    assert len(agent_scores) == len(dialogues)
+    for dialogue in dialogues:
+        assert dialogue.conversation_id in agent_scores
+        assert agent_scores[dialogue.conversation_id] == 4.0
diff --git a/tests/evaluation/test_satisfaction_metric.py b/tests/evaluation/test_satisfaction_metric.py
new file mode 100644
index 00000000..71231505
--- /dev/null
+++ b/tests/evaluation/test_satisfaction_metric.py
@@ -0,0 +1,68 @@
+"""Tests for SatisfactionMetric."""
+
+from unittest.mock import MagicMock
+
+import pytest
+
+from dialoguekit.utils.dialogue_reader import json_to_dialogues
+
+from scripts.evaluation.satisfaction_metric import SatisfactionMetric
+
+
+@pytest.fixture
+def dialogues():
+    """Load test dialogues."""
+    return json_to_dialogues(
+        "tests/data/annotated_dialogues.json",
+        agent_ids=["Agent"],
+        user_ids=["User"],
+    )
+
+
+@pytest.fixture
+def mock_classifier():
+    """Mock satisfaction classifier that returns fixed scores."""
+    classifier = MagicMock()
+    classifier.classify_last_n_dialogue = MagicMock(return_value=3.5)
+    return classifier
+
+
+@pytest.fixture
+def metric(mock_classifier):
+    """SatisfactionMetric with mocked classifier."""
+    return SatisfactionMetric(classifier=mock_classifier)
+
+
+def test_evaluate_dialogue(
+    metric: SatisfactionMetric, mock_classifier, dialogues
+) -> None:
+    """Test evaluate_dialogue returns classifier score for a single dialogue."""
+    dialogue = dialogues[0]
+    score = metric.evaluate_dialogue(dialogue)
+    assert score == 3.5
+    mock_classifier.classify_last_n_dialogue.assert_called_once_with(
+        dialogue, last_n=None
+    )
+
+
+def test_evaluate_dialogues(
+    metric: SatisfactionMetric, mock_classifier, dialogues
+) -> None:
+    """Test evaluate_dialogues returns conversation_id -> score."""
+    result = metric.evaluate_dialogues(dialogues)
+    assert len(result) == len(dialogues)
+    for dialogue in dialogues:
+        assert dialogue.conversation_id in result
+        assert result[dialogue.conversation_id] == 3.5
+    assert mock_classifier.classify_last_n_dialogue.call_count == len(dialogues)
+
+
+def test_evaluate_agents(metric: SatisfactionMetric, dialogues) -> None:
+    """Test evaluate_agents returns agent_id -> {conversation_id -> score}."""
+    result = metric.evaluate_agents(dialogues)
+    assert "Agent" in result
+    agent_scores = result["Agent"]
+    assert len(agent_scores) == len(dialogues)
+    for dialogue in dialogues:
+        assert dialogue.conversation_id in agent_scores
+        assert agent_scores[dialogue.conversation_id] == 3.5
diff --git a/tests/evaluation/test_utility_metric.py b/tests/evaluation/test_utility_metric.py
new file mode 100644
index 00000000..e12b13dc
--- /dev/null
+++ b/tests/evaluation/test_utility_metric.py
@@ -0,0 +1,77 @@
+"""Tests for UtilityMetric."""
+
+from unittest.mock import patch
+
+import pytest
+
+from dialoguekit.utils.dialogue_reader import json_to_dialogues
+
+from scripts.evaluation.utility_metric import UtilityMetric
+
+
+@pytest.fixture
+def dialogues():
+    """Load test dialogues."""
+    return json_to_dialogues(
+        "tests/data/annotated_dialogues.json",
+        agent_ids=["Agent"],
+        user_ids=["User"],
+    )
+
+
+FIXED_UTILITY = {
+    "success": 1.0,
+    "successful_recommendation_round_ratio": 0.5,
+    "reward_per_dialogue_length": 0.1,
+}
+
+
+@pytest.fixture
+def metric(dialogues):
+    """UtilityMetric returning fixed metrics."""
+    with patch.object(
+        UtilityMetric, "_get_utility_metrics", return_value=FIXED_UTILITY
+    ):
+        yield UtilityMetric(
+            user_nlu_config_path="dummy_user_nlu.yaml",
+            agent_nlu_config_path="dummy_agent_nlu.yaml",
+        )
+
+
+def test_evaluate_dialogue(metric: UtilityMetric, dialogues) -> None:
+    """Test evaluate_dialogue returns selected metric as float."""
+    dialogue = dialogues[0]
+    assert metric.evaluate_dialogue(dialogue) == 1.0
+    assert metric.evaluate_dialogue(dialogue, metric="success") == 1.0
+    assert (
+        metric.evaluate_dialogue(
+            dialogue, metric="successful_recommendation_round_ratio"
+        )
+        == 0.5
+    )
+    assert (
+        metric.evaluate_dialogue(dialogue, metric="reward_per_dialogue_length")
+        == 0.1
+    )
+
+
+def test_evaluate_dialogues(metric: UtilityMetric, dialogues) -> None:
+    """Test evaluate_dialogues returns conversation_id -> full metrics dict."""
+    result = metric.evaluate_dialogues(dialogues)
+    assert len(result) == len(dialogues)
+    for dialogue in dialogues:
+        assert dialogue.conversation_id in result
+        assert result[dialogue.conversation_id] == FIXED_UTILITY
+
+
+def test_evaluate_agents(metric: UtilityMetric, dialogues) -> None:
+    """Test evaluate_agents returns agent_id -> {conversation_id -> metrics
+    dict}."""
+    result = metric.evaluate_agents(dialogues)
+    assert "Agent" in result
+    agent_scores = result["Agent"]
+    assert len(agent_scores) == len(dialogues)
+    for dialogue in dialogues:
+        assert dialogue.conversation_id in agent_scores
+        conv_metrics = agent_scores[dialogue.conversation_id]
+        assert conv_metrics == FIXED_UTILITY

From 800f8c0c22d307506dea8fcecbb8d16a3f6a5ec1 Mon Sep 17 00:00:00 2001
From: Ksenia Blokhina <kseniablokhina@MacBook-Pro-Ksenia.local>
Date: Tue, 3 Mar 2026 11:37:04 +0100
Subject: [PATCH 10/38] improvement/232-create-abstract-class-for-metric fixes

---
 scripts/evaluation/base_metric.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/scripts/evaluation/base_metric.py b/scripts/evaluation/base_metric.py
index f08a3a05..e43e9fc6 100644
--- a/scripts/evaluation/base_metric.py
+++ b/scripts/evaluation/base_metric.py
@@ -6,7 +6,7 @@
 from dialoguekit.core.dialogue import Dialogue
 
 
-class Metric(ABC):
+class BaseMetric(ABC):
     def __init__(self, name: str) -> None:
         """Initializes the metric.
 
@@ -41,7 +41,7 @@ def evaluate_dialogues(
             **kwargs: Additional arguments specific to the metric.
 
         Returns:
-            Dictionary with result per dialogue.
+            Dictionary with result per dialogue. Keys are conversation IDs.
         """
         return {
             dialogue.conversation_id: self.evaluate_dialogue(dialogue, **kwargs)
@@ -58,7 +58,8 @@ def evaluate_agents(
             **kwargs: Additional arguments specific to the metric.
 
         Returns:
-            Dictionary with result per agent.
+            Dictionary with result per agent. Outer keys are agent IDs;
+            inner dict keys are conversation IDs.
         """
         dialogues_by_agent: Dict[str, List[Dialogue]] = defaultdict(list)
         for dialogue in dialogues:

From 2014e222c5ef3e43ffa54082fdbd9cb245f2a15a Mon Sep 17 00:00:00 2001
From: Ksenia Blokhina <kseniablokhina@MacBook-Pro-Ksenia.local>
Date: Tue, 3 Mar 2026 11:45:32 +0100
Subject: [PATCH 11/38] improvement/232-create-abstract-class-for-metric fixes

---
 scripts/evaluation/base_metric.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/evaluation/base_metric.py b/scripts/evaluation/base_metric.py
index e43e9fc6..9141972b 100644
--- a/scripts/evaluation/base_metric.py
+++ b/scripts/evaluation/base_metric.py
@@ -58,8 +58,8 @@ def evaluate_agents(
             **kwargs: Additional arguments specific to the metric.
 
         Returns:
-            Dictionary with result per agent. Outer keys are agent IDs;
-            inner dict keys are conversation IDs.
+            Dictionary with result per agent.
+            Keys are agent IDs and conversation IDs.
         """
         dialogues_by_agent: Dict[str, List[Dialogue]] = defaultdict(list)
         for dialogue in dialogues:

From 363551c268a28acc5cb2235cc24bbc223a33fc78 Mon Sep 17 00:00:00 2001
From: Ksenia Blokhina <kseniablokhina@MacBook-Pro-Ksenia.local>
Date: Tue, 3 Mar 2026 12:16:34 +0100
Subject: [PATCH 12/38] improvement/232-create-abstract-class-for-metric remove
 agent evaluation

---
 scripts/evaluation/base_metric.py | 22 ----------------------
 1 file changed, 22 deletions(-)

diff --git a/scripts/evaluation/base_metric.py b/scripts/evaluation/base_metric.py
index 9141972b..c99399a2 100644
--- a/scripts/evaluation/base_metric.py
+++ b/scripts/evaluation/base_metric.py
@@ -1,7 +1,6 @@
 """Abstract base class for dialogue evaluation metrics."""
 
 from abc import ABC, abstractmethod
-from collections import defaultdict
 from typing import Any, Dict, List
 from dialoguekit.core.dialogue import Dialogue
 
@@ -47,24 +46,3 @@ def evaluate_dialogues(
             dialogue.conversation_id: self.evaluate_dialogue(dialogue, **kwargs)
             for dialogue in dialogues
         }
-
-    def evaluate_agents(
-        self, dialogues: List[Dialogue], **kwargs: Any
-    ) -> Dict[str, Dict[str, float]]:
-        """Computes the metric for every agent over their dialogues.
-
-        Args:
-            dialogues: Dialogues.
-            **kwargs: Additional arguments specific to the metric.
-
-        Returns:
-            Dictionary with result per agent.
-            Keys are agent IDs and conversation IDs.
-        """
-        dialogues_by_agent: Dict[str, List[Dialogue]] = defaultdict(list)
-        for dialogue in dialogues:
-            dialogues_by_agent[dialogue.agent_id].append(dialogue)
-        return {
-            agent_id: self.evaluate_dialogues(agent_dialogues, **kwargs)
-            for agent_id, agent_dialogues in dialogues_by_agent.items()
-        }

From a253f74dc053e0ea257443b8f8883a6f20dfb6bb Mon Sep 17 00:00:00 2001
From: Ksenia Blokhina <kseniablokhina@MacBook-Pro-Ksenia.local>
Date: Tue, 3 Mar 2026 14:21:51 +0100
Subject: [PATCH 13/38] improvement/233-create-classes-for-metrics edit classes

---
 scripts/evaluation/base_metric.py            |  25 +----
 scripts/evaluation/quality_metric.py         | 108 ++++++++++++-------
 scripts/evaluation/satisfaction_metric.py    |  50 ++++-----
 scripts/evaluation/utility_metric.py         |  90 +++++++++-------
 tests/evaluation/test_quality_metric.py      |  39 ++-----
 tests/evaluation/test_satisfaction_metric.py |  28 +----
 tests/evaluation/test_utility_metric.py      |  26 +++--
 7 files changed, 175 insertions(+), 191 deletions(-)

diff --git a/scripts/evaluation/base_metric.py b/scripts/evaluation/base_metric.py
index f08a3a05..c99399a2 100644
--- a/scripts/evaluation/base_metric.py
+++ b/scripts/evaluation/base_metric.py
@@ -1,12 +1,11 @@
 """Abstract base class for dialogue evaluation metrics."""
 
 from abc import ABC, abstractmethod
-from collections import defaultdict
 from typing import Any, Dict, List
 from dialoguekit.core.dialogue import Dialogue
 
 
-class Metric(ABC):
+class BaseMetric(ABC):
     def __init__(self, name: str) -> None:
         """Initializes the metric.
 
@@ -41,29 +40,9 @@ def evaluate_dialogues(
             **kwargs: Additional arguments specific to the metric.
 
         Returns:
-            Dictionary with result per dialogue.
+            Dictionary with result per dialogue. Keys are conversation IDs.
         """
         return {
             dialogue.conversation_id: self.evaluate_dialogue(dialogue, **kwargs)
             for dialogue in dialogues
         }
-
-    def evaluate_agents(
-        self, dialogues: List[Dialogue], **kwargs: Any
-    ) -> Dict[str, Dict[str, float]]:
-        """Computes the metric for every agent over their dialogues.
-
-        Args:
-            dialogues: Dialogues.
-            **kwargs: Additional arguments specific to the metric.
-
-        Returns:
-            Dictionary with result per agent.
-        """
-        dialogues_by_agent: Dict[str, List[Dialogue]] = defaultdict(list)
-        for dialogue in dialogues:
-            dialogues_by_agent[dialogue.agent_id].append(dialogue)
-        return {
-            agent_id: self.evaluate_dialogues(agent_dialogues, **kwargs)
-            for agent_id, agent_dialogues in dialogues_by_agent.items()
-        }
diff --git a/scripts/evaluation/quality_metric.py b/scripts/evaluation/quality_metric.py
index 44b78ed5..a6e0475f 100644
--- a/scripts/evaluation/quality_metric.py
+++ b/scripts/evaluation/quality_metric.py
@@ -1,16 +1,27 @@
-"""Quality metric class implementation.
+"""Script to evaluate dialogue quality using an LLM.
 
-Extracted from the original CLI script in `quality_evaluation.py`.
+The script evaluates dialogue quality with regard to five aspects:
+- Recommendation relevance
+- Communication style
+- Fluency
+- Conversational flow
+- Overall satisfaction
+
+Each aspect is scored between 1 and 5, where the scores are described in a
+dedicated rubric. The scoring is done using a large language model.
 """
 
+import argparse
 import json
-from statistics import mean
-from typing import Any, List, Optional
+from typing import Any, Optional, TYPE_CHECKING
+
+if TYPE_CHECKING:
+    pass
 
 from dialoguekit.core.dialogue import Dialogue
 from dialoguekit.participant.participant import DialogueParticipant
 
-from scripts.evaluation.base_metric import Metric
+from scripts.evaluation.base_metric import BaseMetric
 from scripts.evaluation.rubrics.quality_rubrics import QualityRubrics
 from usersimcrs.llm_interfaces.ollama_interface import OllamaLLMInterface
 
@@ -28,27 +39,47 @@
 )
 
 
-class QualityMetric(Metric):
-    """Quality evaluation metric using an LLM backend.
-
-    Returns scores as floats (average across aspects per dialogue).
-    """
-
+class QualityMetric(BaseMetric):
     def __init__(
         self,
         ollama_config_path: str,
         default_response: str = "",
-        rubrics: Optional[List[QualityRubrics]] = None,
         name: str = "quality",
     ) -> None:
         super().__init__(name)
         self.ollama_config_path = ollama_config_path
         self.default_response = default_response
-        self.rubrics = rubrics or list(QualityRubrics)
         self._ollama_interface: Optional[OllamaLLMInterface] = None
 
+    @staticmethod
+    def parse_args() -> argparse.Namespace:
+        """Parse command-line arguments.
+
+        Returns:
+            Parsed arguments.
+        """
+        parser = argparse.ArgumentParser()
+        parser.add_argument(
+            "--dialogues",
+            type=str,
+            required=True,
+            help="Path to the dialogues.",
+        )
+        parser.add_argument(
+            "--ollama_config",
+            type=str,
+            required=True,
+            help="Path to the Ollama config file.",
+        )
+        parser.add_argument(
+            "--output",
+            type=str,
+            help="(optional) Path to the output file.",
+        )
+        return parser.parse_args()
+
     def _get_ollama_interface(self) -> OllamaLLMInterface:
-        """Returns (cached) Ollama LLM interface."""
+        """Returns Ollama LLM interface."""
         if self._ollama_interface is None:
             self._ollama_interface = OllamaLLMInterface(
                 self.ollama_config_path,
@@ -81,28 +112,31 @@ def _get_prompt(
         prompt += _PROMPT_EVAL_OUTPUT_FORMAT
         return prompt
 
-    def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float:
-        """Returns average score across aspects for a single dialogue (1–5)."""
-        aspects = kwargs.get("aspects")
-        if aspects:
-            aspect_enums = [QualityRubrics[asp] for asp in aspects]
-        else:
-            aspect_enums = self.rubrics
+    def evaluate_dialogue(
+        self, dialogue: Dialogue, aspect: str, **kwargs: Any
+    ) -> float:
+        """Returns score for a single aspect of a dialogue.
+
+        Args:
+            dialogue: Dialogue to evaluate.
+            aspect: Aspect to evaluate. Must be a QualityRubrics enum name.
+
+        Returns:
+            Score (1-5) for the specified aspect.
 
+        Raises:
+            ValueError: When the LLM response cannot be parsed.
+        """
+        aspect_enum = QualityRubrics[aspect]
         ollama_interface = self._get_ollama_interface()
-        scores: List[float] = []
-
-        for aspect in aspect_enums:
-            prompt = self._get_prompt(aspect, dialogue)
-            response = ollama_interface.get_llm_api_response(prompt)
-            try:
-                response = response.replace("\\", "\\\\")
-                response_dict = json.loads(response)
-                scores.append(int(response_dict["score"]))
-            except Exception:
-                print(
-                    f"Failed to get score for {aspect} dialogue "
-                    f"{dialogue.conversation_id}: {response}"
-                )
-
-        return mean(scores) if scores else 0.0
+        prompt = self._get_prompt(aspect_enum, dialogue)
+        response = ollama_interface.get_llm_api_response(prompt)
+        try:
+            response = response.replace("\\", "\\\\")
+            response_dict = json.loads(response)
+            return float(response_dict["score"])
+        except Exception as e:
+            raise ValueError(
+                f"Failed to get score for {aspect} dialogue "
+                f"{dialogue.conversation_id}: {response}"
+            ) from e
diff --git a/scripts/evaluation/satisfaction_metric.py b/scripts/evaluation/satisfaction_metric.py
index e7125696..ac5915e0 100644
--- a/scripts/evaluation/satisfaction_metric.py
+++ b/scripts/evaluation/satisfaction_metric.py
@@ -1,22 +1,20 @@
 """Satisfaction metric class implementation.
 
-Wraps DialogueKit's satisfaction classifier into a `Metric` class.
+Wraps DialogueKit's satisfaction classifier into a `BaseMetric` class.
 """
 
-from statistics import mean, stdev
-from typing import Any, Dict, Optional
+from typing import Any, Optional
 
-from dialoguekit.core.dialogue import Dialogue  # type: ignore
+from dialoguekit.core.dialogue import Dialogue
 from dialoguekit.nlu.models.satisfaction_classifier import (
     SatisfactionClassifierSVM,
 )
+import argparse
 
-from scripts.evaluation.base_metric import Metric
+from scripts.evaluation.base_metric import BaseMetric
 
 
-class SatisfactionMetric(Metric):
-    """Wraps the `SatisfactionClassifierSVM` to compute satisfaction scores."""
-
+class SatisfactionMetric(BaseMetric):
     def __init__(
         self,
         classifier: Optional[SatisfactionClassifierSVM] = None,
@@ -29,24 +27,18 @@ def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float:
         """Computes the satisfaction score for a single dialogue."""
         return self.classifier.classify_last_n_dialogue(dialogue, last_n=None)
 
-    @staticmethod
-    def get_average(agent_scores: Dict[str, float]) -> float:
-        """Returns the average score for an agent's dialogues."""
-        return mean(agent_scores.values()) if agent_scores else 0.0
-
-    @staticmethod
-    def get_stdev(agent_scores: Dict[str, float]) -> float:
-        """Returns the standard deviation of scores for an agent's dialogues."""
-        if len(agent_scores) < 2:
-            return 0.0
-        return stdev(agent_scores.values())
-
-    @staticmethod
-    def get_max(agent_scores: Dict[str, float]) -> float:
-        """Returns the maximum score for an agent's dialogues."""
-        return max(agent_scores.values()) if agent_scores else 0.0
-
-    @staticmethod
-    def get_min(agent_scores: Dict[str, float]) -> float:
-        """Returns the minimum score for an agent's dialogues."""
-        return min(agent_scores.values()) if agent_scores else 0.0
+    @classmethod
+    def parse_args(self) -> argparse.Namespace:
+        """Parses command-line arguments.
+
+        Returns:
+            Parsed arguments.
+        """
+        parser = argparse.ArgumentParser()
+        parser.add_argument(
+            "--dialogues",
+            type=str,
+            required=True,
+            help="Path to the dialogues.",
+        )
+        return parser.parse_args()
diff --git a/scripts/evaluation/utility_metric.py b/scripts/evaluation/utility_metric.py
index cf17604e..88617683 100644
--- a/scripts/evaluation/utility_metric.py
+++ b/scripts/evaluation/utility_metric.py
@@ -1,11 +1,12 @@
 """Utility metric class implementation.
 
-Encapsulates the logic from `utility_evaluation.py` into a `Metric`.
+Encapsulates the logic from `utility_evaluation.py` into a `BaseMetric`.
 """
 
-from typing import Any, Dict, List, Optional, Tuple, cast
+from typing import Any, Dict, List, Optional, Tuple
 
 from confuse import Configuration
+import argparse
 
 from dialoguekit.core.annotated_utterance import AnnotatedUtterance
 from dialoguekit.core.dialogue import Dialogue
@@ -13,10 +14,10 @@
 from dialoguekit.nlu.nlu import NLU
 from dialoguekit.participant.participant import DialogueParticipant
 from usersimcrs.utils.simulation_utils import get_NLU
-from scripts.evaluation.base_metric import Metric
+from scripts.evaluation.base_metric import BaseMetric
 
 
-class UtilityMetric(Metric):
+class UtilityMetric(BaseMetric):
     """Computes utility metrics for dialogues.
 
     Constructor takes paths to user and agent NLU configuration files.
@@ -34,6 +35,54 @@ def __init__(
         self._user_nlu: Optional[NLU] = None
         self._agent_nlu: Optional[NLU] = None
 
+    @classmethod
+    def parse_args(self) -> argparse.Namespace:
+        """Parses command-line arguments.
+
+        Returns:
+            Parsed command-line arguments.
+        """
+        parser = argparse.ArgumentParser(prog="utility_evaluation.py")
+        parser.add_argument(
+            "annotated_dialogues",
+            type=str,
+            help="Annotated dialogues JSON file.",
+        )
+        parser.add_argument(
+            "user_nlu_config",
+            type=str,
+            help="User NLU configuration file.",
+        )
+        parser.add_argument(
+            "agent_nlu_config",
+            type=str,
+            help="Agent NLU configuration file.",
+        )
+        parser.add_argument(
+            "--reject_intent_labels",
+            nargs="+",
+            default=["REJ"],
+            help="Intent labels corresponding to rejection.",
+        )
+        parser.add_argument(
+            "--accept_intent_labels",
+            nargs="+",
+            default=["ACC"],
+            help="Intent labels corresponding to acceptance.",
+        )
+        parser.add_argument(
+            "--recommendation_intent_labels",
+            nargs="+",
+            default=["REC-S", "REC-E"],
+            help="Intent labels corresponding to recommendation.",
+        )
+        parser.add_argument(
+            "--output",
+            type=str,
+            help="Output file to save annotated dialogues with utility metrics",
+        )
+        return parser.parse_args()
+
     def _annotate_dialogue(
         self, dialogue: Dialogue, user_nlu: NLU, agent_nlu: NLU
     ) -> Dialogue:
@@ -211,37 +260,6 @@ def _get_intent_lists(self, **kwargs: Any) -> Tuple[List[Intent], ...]:
             [Intent(label) for label in rej_labels],
         )
 
-    def evaluate_dialogues(
-        self, dialogues: List[Dialogue], **kwargs: Any
-    ) -> Dict[str, Dict[str, float]]:
-        """Computes all utility metrics for every dialogue.
-
-        Overrides base to return full metrics dict per dialogue rather than
-        a single float, since utility evaluation aggregates SR, SRRR, and RDL.
-
-        Returns:
-            conversation_id -> metrics dict with keys: success,
-            successful_recommendation_round_ratio, reward_per_dialogue_length.
-        """
-        return {
-            dialogue.conversation_id: self._get_utility_metrics(
-                dialogue, **kwargs
-            )
-            for dialogue in dialogues
-        }
-
-    def evaluate_agents(
-        self, dialogues: List[Dialogue], **kwargs: Any
-    ) -> Dict[str, Dict[str, Dict[str, float]]]:
-        """Computes utility metrics per agent, returning full metrics per
-        dialogue.
-
-        Returns:
-            agent_id -> conversation_id -> metrics dict (success, srrr, rdl).
-        """
-        result = super().evaluate_agents(dialogues, **kwargs)
-        return cast(Dict[str, Dict[str, Dict[str, float]]], result)
-
     def _get_utility_metrics(
         self, dialogue: Dialogue, **kwargs: Any
     ) -> Dict[str, float]:
@@ -276,7 +294,7 @@ def _get_utility_metrics(
         }
 
     def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float:
-        """Computes one utility metric for a single dialogue.
+        """Computes one utility metric for a dialogue.
 
         Args:
             dialogue: Dialogue to evaluate.
diff --git a/tests/evaluation/test_quality_metric.py b/tests/evaluation/test_quality_metric.py
index 2c409806..d9882577 100644
--- a/tests/evaluation/test_quality_metric.py
+++ b/tests/evaluation/test_quality_metric.py
@@ -1,11 +1,8 @@
 """Tests for QualityMetric."""
 
 from unittest.mock import MagicMock, patch
-
 import pytest
-
 from dialoguekit.utils.dialogue_reader import json_to_dialogues
-
 from scripts.evaluation.quality_metric import QualityMetric
 
 
@@ -21,7 +18,7 @@ def dialogues():
 
 @pytest.fixture
 def mock_ollama():
-    """Mock Ollama LLM interface that returns fixed score JSON."""
+    """Mock Ollama LLM interface."""
     interface = MagicMock()
     interface.get_llm_api_response.return_value = (
         '{"score": 4, "score_explanation": "good"}'
@@ -31,7 +28,6 @@ def mock_ollama():
 
 @pytest.fixture
 def metric(mock_ollama):
-    """QualityMetric with mocked Ollama interface."""
     with patch.object(
         QualityMetric, "_get_ollama_interface", return_value=mock_ollama
     ):
@@ -41,43 +37,30 @@ def metric(mock_ollama):
 def test_evaluate_dialogue(
     metric: QualityMetric, mock_ollama, dialogues
 ) -> None:
-    """Test evaluate_dialogue returns mean of aspect scores for a dialogue."""
+    """Test evaluate_dialogue returns score for REC_RELEVANCE aspect."""
     dialogue = dialogues[0]
-    score = metric.evaluate_dialogue(dialogue)
+    score = metric.evaluate_dialogue(dialogue, aspect="REC_RELEVANCE")
     assert score == 4.0
-    assert mock_ollama.get_llm_api_response.call_count == len(metric.rubrics)
+    assert mock_ollama.get_llm_api_response.call_count == 1
 
 
-def test_evaluate_dialogue_with_aspects(
+def test_evaluate_dialogue_different_aspect(
     metric: QualityMetric, mock_ollama, dialogues
 ) -> None:
-    """Test evaluate_dialogue with aspects kwarg calls LLM only for aspects."""
+    """Test evaluate_dialogue with FLUENCY aspect."""
     dialogue = dialogues[0]
-    aspects = ["REC_RELEVANCE", "FLUENCY"]
-    score = metric.evaluate_dialogue(dialogue, aspects=aspects)
+    score = metric.evaluate_dialogue(dialogue, aspect="FLUENCY")
     assert score == 4.0
-    assert mock_ollama.get_llm_api_response.call_count == 2
+    assert mock_ollama.get_llm_api_response.call_count == 1
 
 
 def test_evaluate_dialogues(
     metric: QualityMetric, mock_ollama, dialogues
 ) -> None:
-    """Test evaluate_dialogues returns conversation_id -> score."""
-    result = metric.evaluate_dialogues(dialogues)
+    """Test evaluate_dialogues for the COM_STYLE aspect."""
+    result = metric.evaluate_dialogues(dialogues, aspect="COM_STYLE")
     assert len(result) == len(dialogues)
     for dialogue in dialogues:
         assert dialogue.conversation_id in result
         assert result[dialogue.conversation_id] == 4.0
-    expected_calls = len(dialogues) * len(metric.rubrics)
-    assert mock_ollama.get_llm_api_response.call_count == expected_calls
-
-
-def test_evaluate_agents(metric: QualityMetric, dialogues) -> None:
-    """Test evaluate_agents returns agent_id -> {conversation_id -> score}."""
-    result = metric.evaluate_agents(dialogues)
-    assert "Agent" in result
-    agent_scores = result["Agent"]
-    assert len(agent_scores) == len(dialogues)
-    for dialogue in dialogues:
-        assert dialogue.conversation_id in agent_scores
-        assert agent_scores[dialogue.conversation_id] == 4.0
+    assert mock_ollama.get_llm_api_response.call_count == len(dialogues)
diff --git a/tests/evaluation/test_satisfaction_metric.py b/tests/evaluation/test_satisfaction_metric.py
index 71231505..4d48c3dc 100644
--- a/tests/evaluation/test_satisfaction_metric.py
+++ b/tests/evaluation/test_satisfaction_metric.py
@@ -1,11 +1,8 @@
 """Tests for SatisfactionMetric."""
 
 from unittest.mock import MagicMock
-
 import pytest
-
 from dialoguekit.utils.dialogue_reader import json_to_dialogues
-
 from scripts.evaluation.satisfaction_metric import SatisfactionMetric
 
 
@@ -21,7 +18,7 @@ def dialogues():
 
 @pytest.fixture
 def mock_classifier():
-    """Mock satisfaction classifier that returns fixed scores."""
+    """Mock satisfaction classifier."""
     classifier = MagicMock()
     classifier.classify_last_n_dialogue = MagicMock(return_value=3.5)
     return classifier
@@ -29,40 +26,23 @@ def mock_classifier():
 
 @pytest.fixture
 def metric(mock_classifier):
-    """SatisfactionMetric with mocked classifier."""
     return SatisfactionMetric(classifier=mock_classifier)
 
 
-def test_evaluate_dialogue(
-    metric: SatisfactionMetric, mock_classifier, dialogues
-) -> None:
-    """Test evaluate_dialogue returns classifier score for a single dialogue."""
+def test_evaluate_dialogue(metric: SatisfactionMetric, dialogues) -> None:
+    """Test evaluate_dialogue for a single dialogue."""
     dialogue = dialogues[0]
     score = metric.evaluate_dialogue(dialogue)
     assert score == 3.5
-    mock_classifier.classify_last_n_dialogue.assert_called_once_with(
-        dialogue, last_n=None
-    )
 
 
 def test_evaluate_dialogues(
     metric: SatisfactionMetric, mock_classifier, dialogues
 ) -> None:
-    """Test evaluate_dialogues returns conversation_id -> score."""
+    """Test evaluate_dialogues for list of dialogues."""
     result = metric.evaluate_dialogues(dialogues)
     assert len(result) == len(dialogues)
     for dialogue in dialogues:
         assert dialogue.conversation_id in result
         assert result[dialogue.conversation_id] == 3.5
     assert mock_classifier.classify_last_n_dialogue.call_count == len(dialogues)
-
-
-def test_evaluate_agents(metric: SatisfactionMetric, dialogues) -> None:
-    """Test evaluate_agents returns agent_id -> {conversation_id -> score}."""
-    result = metric.evaluate_agents(dialogues)
-    assert "Agent" in result
-    agent_scores = result["Agent"]
-    assert len(agent_scores) == len(dialogues)
-    for dialogue in dialogues:
-        assert dialogue.conversation_id in agent_scores
-        assert agent_scores[dialogue.conversation_id] == 3.5
diff --git a/tests/evaluation/test_utility_metric.py b/tests/evaluation/test_utility_metric.py
index e12b13dc..ec16d66c 100644
--- a/tests/evaluation/test_utility_metric.py
+++ b/tests/evaluation/test_utility_metric.py
@@ -27,7 +27,7 @@ def dialogues():
 
 
 @pytest.fixture
-def metric(dialogues):
+def metric():
     """UtilityMetric returning fixed metrics."""
     with patch.object(
         UtilityMetric, "_get_utility_metrics", return_value=FIXED_UTILITY
@@ -39,7 +39,7 @@ def metric(dialogues):
 
 
 def test_evaluate_dialogue(metric: UtilityMetric, dialogues) -> None:
-    """Test evaluate_dialogue returns selected metric as float."""
+    """Test evaluate_dialogue returns selected metric."""
     dialogue = dialogues[0]
     assert metric.evaluate_dialogue(dialogue) == 1.0
     assert metric.evaluate_dialogue(dialogue, metric="success") == 1.0
@@ -56,22 +56,20 @@ def test_evaluate_dialogue(metric: UtilityMetric, dialogues) -> None:
 
 
 def test_evaluate_dialogues(metric: UtilityMetric, dialogues) -> None:
-    """Test evaluate_dialogues returns conversation_id -> full metrics dict."""
+    """Test evaluate_dialogues returns conversation_id -> metric value."""
     result = metric.evaluate_dialogues(dialogues)
     assert len(result) == len(dialogues)
     for dialogue in dialogues:
         assert dialogue.conversation_id in result
-        assert result[dialogue.conversation_id] == FIXED_UTILITY
+        assert result[dialogue.conversation_id] == 1.0
 
 
-def test_evaluate_agents(metric: UtilityMetric, dialogues) -> None:
-    """Test evaluate_agents returns agent_id -> {conversation_id -> metrics
-    dict}."""
-    result = metric.evaluate_agents(dialogues)
-    assert "Agent" in result
-    agent_scores = result["Agent"]
-    assert len(agent_scores) == len(dialogues)
+def test_evaluate_dialogues_with_specified_metric(
+    metric: UtilityMetric, dialogues
+) -> None:
+    """Test evaluate_dialogues with specified metric."""
+    result = metric.evaluate_dialogues(
+        dialogues, metric="successful_recommendation_round_ratio"
+    )
     for dialogue in dialogues:
-        assert dialogue.conversation_id in agent_scores
-        conv_metrics = agent_scores[dialogue.conversation_id]
-        assert conv_metrics == FIXED_UTILITY
+        assert result[dialogue.conversation_id] == 0.5

From 777e00e32a64ae8496ca1abc488d5faacd6f09cc Mon Sep 17 00:00:00 2001
From: Ksenia Blokhina <kseniablokhina@MacBook-Pro-Ksenia.local>
Date: Tue, 3 Mar 2026 14:24:48 +0100
Subject: [PATCH 14/38] improvement/233-create-classes-for-metrics remove old
 files

---
 scripts/evaluation/base_metric.py             |  48 -------
 scripts/evaluation/quality_evaluation.py      |  72 ----------
 scripts/evaluation/satisfaction_evaluation.py |  48 -------
 scripts/evaluation/satisfaction_metric.py     |   4 +-
 scripts/evaluation/utility_evaluation.py      | 126 ------------------
 5 files changed, 2 insertions(+), 296 deletions(-)
 delete mode 100644 scripts/evaluation/base_metric.py
 delete mode 100644 scripts/evaluation/quality_evaluation.py
 delete mode 100644 scripts/evaluation/satisfaction_evaluation.py
 delete mode 100644 scripts/evaluation/utility_evaluation.py

diff --git a/scripts/evaluation/base_metric.py b/scripts/evaluation/base_metric.py
deleted file mode 100644
index c99399a2..00000000
--- a/scripts/evaluation/base_metric.py
+++ /dev/null
@@ -1,48 +0,0 @@
-"""Abstract base class for dialogue evaluation metrics."""
-
-from abc import ABC, abstractmethod
-from typing import Any, Dict, List
-from dialoguekit.core.dialogue import Dialogue
-
-
-class BaseMetric(ABC):
-    def __init__(self, name: str) -> None:
-        """Initializes the metric.
-
-        Args:
-            name: Metric name.
-        """
-        self.name = name
-
-    @abstractmethod
-    def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float:
-        """Computes the metric for a single dialogue.
-
-        Args:
-            dialogue: Single dialogue to score.
-            **kwargs: Additional arguments specific to the metric.
-
-        Raises:
-            NotImplementedError: When not implemented by a subclass.
-
-        Returns:
-            Score for the dialogue.
-        """
-        raise NotImplementedError()
-
-    def evaluate_dialogues(
-        self, dialogues: List[Dialogue], **kwargs: Any
-    ) -> Dict[str, float]:
-        """Computes the metric for every dialogue in a given list.
-
-        Args:
-            dialogues: Dialogues.
-            **kwargs: Additional arguments specific to the metric.
-
-        Returns:
-            Dictionary with result per dialogue. Keys are conversation IDs.
-        """
-        return {
-            dialogue.conversation_id: self.evaluate_dialogue(dialogue, **kwargs)
-            for dialogue in dialogues
-        }
diff --git a/scripts/evaluation/quality_evaluation.py b/scripts/evaluation/quality_evaluation.py
deleted file mode 100644
index 082adde3..00000000
--- a/scripts/evaluation/quality_evaluation.py
+++ /dev/null
@@ -1,72 +0,0 @@
-"""Script to evaluate dialogue quality using an LLM.
-
-The script evaluates dialogue quality with regards to five aspects:
-- Recommendation relevance
-- Communication style
-- Fluency
-- Conversational flow
-- Overall satisfaction
-
-Each aspect is scored between 1 and 5, where the scores are described in a
-dedicated rubric. The scoring is done using a large language model.
-"""
-
-import argparse
-import json
-import os
-from statistics import mean, stdev
-
-from dialoguekit.utils.dialogue_reader import json_to_dialogues
-
-from scripts.evaluation.quality_metric import QualityMetric
-
-
-def parse_args() -> argparse.Namespace:
-    """Parse command-line arguments.
-
-    Returns:
-        Parsed arguments.
-    """
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--dialogues",
-        type=str,
-        required=True,
-        help="Path to the dialogues.",
-    )
-    parser.add_argument(
-        "--ollama_config",
-        type=str,
-        required=True,
-        help="Path to the Ollama config file.",
-    )
-    parser.add_argument(
-        "--output",
-        type=str,
-        help="(optional) Path to the output file.",
-    )
-    return parser.parse_args()
-
-
-if __name__ == "__main__":
-    args = parse_args()
-
-    # Load dialogues
-    dialogues = json_to_dialogues(args.dialogues)
-
-    metric = QualityMetric(args.ollama_config)
-    scores = metric.evaluate_agents(dialogues)
-
-    # Save scores (agent_id -> conversation_id -> score)
-    if args.output:
-        os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
-        with open(args.output, "w") as f:
-            json.dump(scores, f, indent=2)
-
-    # Summary
-    for agent_id, agent_scores in scores.items():
-        score_values = list(agent_scores.values())
-        print(f"Scores for agent {agent_id}:")
-        avg_score = mean(score_values)
-        std_dev = stdev(score_values) if len(score_values) >= 2 else 0.0
-        print(f"Average score: {avg_score:.2f} (std dev: {std_dev:.2f})")
diff --git a/scripts/evaluation/satisfaction_evaluation.py b/scripts/evaluation/satisfaction_evaluation.py
deleted file mode 100644
index 4c2d1890..00000000
--- a/scripts/evaluation/satisfaction_evaluation.py
+++ /dev/null
@@ -1,48 +0,0 @@
-"""Automatic evaluation of dialogues.
-
-This script evaluates dialogues with regards to user satisfaction. It uses
-DialogueKit's satisfaction classifier, which assigns a score between 1 and 5.
-"""
-
-import argparse
-
-from dialoguekit.utils.dialogue_reader import json_to_dialogues
-from scripts.evaluation.satisfaction_metric import SatisfactionMetric
-
-
-def parse_args() -> argparse.Namespace:
-    """Parse command-line arguments.
-
-    Returns:
-        Parsed arguments.
-    """
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--dialogues",
-        type=str,
-        required=True,
-        help="Path to the dialogues.",
-    )
-    return parser.parse_args()
-
-
-if __name__ == "__main__":
-    args = parse_args()
-
-    # Load dialogues
-    dialogues = json_to_dialogues(args.dialogues)
-    print(f"Loaded {len(dialogues)} dialogues.")
-
-    metric = SatisfactionMetric()
-    scores = metric.evaluate_agents(dialogues)
-
-    # Summary
-    for agent, agent_scores in scores.items():
-        avg_score = metric.get_average(agent_scores)
-        stdev_score = metric.get_stdev(agent_scores)
-        max_score = metric.get_max(agent_scores)
-        min_score = metric.get_min(agent_scores)
-        print(f"Agent: {agent} / Num. dialogues: {len(agent_scores)}")
-        print(f"Min score: {min_score}")
-        print(f"Max score: {max_score}")
-        print(f"Average score: {avg_score:.3f} (stdev: {stdev_score:.3f})")
diff --git a/scripts/evaluation/satisfaction_metric.py b/scripts/evaluation/satisfaction_metric.py
index ac5915e0..e774ae11 100644
--- a/scripts/evaluation/satisfaction_metric.py
+++ b/scripts/evaluation/satisfaction_metric.py
@@ -27,8 +27,8 @@ def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float:
         """Computes the satisfaction score for a single dialogue."""
         return self.classifier.classify_last_n_dialogue(dialogue, last_n=None)
 
-    @classmethod
-    def parse_args(self) -> argparse.Namespace:
+    @staticmethod
+    def parse_args() -> argparse.Namespace:
         """Parses command-line arguments.
 
         Returns:
diff --git a/scripts/evaluation/utility_evaluation.py b/scripts/evaluation/utility_evaluation.py
deleted file mode 100644
index 4106624c..00000000
--- a/scripts/evaluation/utility_evaluation.py
+++ /dev/null
@@ -1,126 +0,0 @@
-"""Automatic evaluation of dialogues with regards to utility.
-
-The script computes three user-centric utility metrics proposed by Bernard and
-Balog (2025):
-
-- Success Rate (SR)
-- Successful Recommendation Round Ratio (SRRR)
-- Reward-per-Dialogue-Length (RDL)
-
-Reference:
-Bernard, Nolwenn, and Krisztian Balog. "Limitations of Current Evaluation
-Practices for Conversational Recommender Systems and the Potential of User
-Simulation." arXiv preprint arXiv:2510.05624 (2025).
-https://arxiv.org/abs/2510.05624
-"""
-
-import argparse
-import json
-from collections import defaultdict
-from typing import Dict
-
-from dialoguekit.utils.dialogue_reader import json_to_dialogues
-from scripts.evaluation.utility_metric import UtilityMetric
-
-
-def get_summary(
-    scores: Dict[str, Dict[str, Dict[str, float]]],
-) -> None:
-    """Displays a summary of the utility evaluation.
-
-    Args:
-        scores: Agent_id -> conversation_id -> utility metrics dict.
-    """
-    summary: dict = defaultdict(
-        lambda: {"total_dialogues": 0, "success_rate": 0, "srrr": 0, "rdl": 0}
-    )
-    for agent_id, agent_scores in scores.items():
-        for conv_metrics in agent_scores.values():
-            summary[agent_id]["total_dialogues"] += 1
-            summary[agent_id]["success_rate"] += conv_metrics["success"]
-            summary[agent_id]["srrr"] += conv_metrics[
-                "successful_recommendation_round_ratio"
-            ]
-            summary[agent_id]["rdl"] += conv_metrics[
-                "reward_per_dialogue_length"
-            ]
-
-    for agent_id, stats in summary.items():
-        total = stats["total_dialogues"]
-        print(f"Agent: {agent_id}")
-        print(f"\tTotal Dialogues: {total}")
-        print(f"\tSuccess Rate: {stats['success_rate'] / total:.4f}")
-        print(
-            "\tSuccessful Recommendation Round Ratio: "
-            f"{stats['srrr'] / total:.4f}"
-        )
-        print(f"\tReward-per-Dialogue-Length: {stats['rdl'] / total:.4f}")
-        print()
-
-
-def parse_args() -> argparse.Namespace:
-    """Parses command-line arguments.
-
-    Returns:
-        Parsed command-line arguments.
-    """
-    parser = argparse.ArgumentParser(prog="utility_evaluation.py")
-    parser.add_argument(
-        "annotated_dialogues",
-        type=str,
-        help="Annotated dialogues JSON file.",
-    )
-    parser.add_argument(
-        "user_nlu_config",
-        type=str,
-        help="User NLU configuration file.",
-    )
-    parser.add_argument(
-        "agent_nlu_config",
-        type=str,
-        help="Agent NLU configuration file.",
-    )
-    parser.add_argument(
-        "--reject_intent_labels",
-        nargs="+",
-        default=["REJ"],
-        help="Intent labels corresponding to rejection.",
-    )
-    parser.add_argument(
-        "--accept_intent_labels",
-        nargs="+",
-        default=["ACC"],
-        help="Intent labels corresponding to acceptance.",
-    )
-    parser.add_argument(
-        "--recommendation_intent_labels",
-        nargs="+",
-        default=["REC-S", "REC-E"],
-        help="Intent labels corresponding to recommendation.",
-    )
-    parser.add_argument(
-        "--output",
-        type=str,
-        help="Output file to save annotated dialogues with utility metrics.",
-    )
-    return parser.parse_args()
-
-
-if __name__ == "__main__":
-    args = parse_args()
-
-    dialogues = json_to_dialogues(args.annotated_dialogues)
-
-    metric = UtilityMetric(args.user_nlu_config, args.agent_nlu_config)
-    scores = metric.evaluate_agents(
-        dialogues,
-        recommendation_intent_labels=args.recommendation_intent_labels,
-        acceptance_intent_labels=args.accept_intent_labels,
-        rejection_intent_labels=args.reject_intent_labels,
-    )
-
-    if args.output:
-        with open(args.output, "w") as f:
-            json.dump(scores, f, indent=2)
-
-    get_summary(scores)

From 5a55b43be7a33958abd42098b7da5c8eb29e1c6c Mon Sep 17 00:00:00 2001
From: Ksenia Blokhina <kseniablokhina@MacBook-Pro-Ksenia.local>
Date: Tue, 3 Mar 2026 14:33:54 +0100
Subject: [PATCH 15/38] improvement/233-create-classes-for-metrics remove extra
 files

---
 scripts/__init__.py            | 1 -
 scripts/evaluation/__init__.py | 3 ---
 2 files changed, 4 deletions(-)
 delete mode 100644 scripts/__init__.py
 delete mode 100644 scripts/evaluation/__init__.py

diff --git a/scripts/__init__.py b/scripts/__init__.py
deleted file mode 100644
index 5100bd2d..00000000
--- a/scripts/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-"""Scripts package marker to avoid namespace package ambiguity for mypy."""
diff --git a/scripts/evaluation/__init__.py b/scripts/evaluation/__init__.py
deleted file mode 100644
index ad40101c..00000000
--- a/scripts/evaluation/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-"""Evaluation helpers package to make imports explicit for type checking."""
-
-__all__: list[str] = []

From c5bb1002177fad0fab58b4956f246964f482fae0 Mon Sep 17 00:00:00 2001
From: Ksenia Blokhina <kseniablokhina@MacBook-Pro-Ksenia.local>
Date: Tue, 3 Mar 2026 14:38:46 +0100
Subject: [PATCH 16/38] improvement/233-create-classes-for-metrics remove
 changes

---
 usersimcrs/nlu/llm/__init__.py       | 12 ++++++------
 usersimcrs/utils/simulation_utils.py |  9 +++------
 2 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/usersimcrs/nlu/llm/__init__.py b/usersimcrs/nlu/llm/__init__.py
index 3c608547..be592d99 100644
--- a/usersimcrs/nlu/llm/__init__.py
+++ b/usersimcrs/nlu/llm/__init__.py
@@ -1,9 +1,9 @@
 """Module level init for LLM-based NLU components."""
 
-"""Module level init for LLM-based NLU components.
+from usersimcrs.nlu.llm.llm_dialogue_act_extractor import (
+    LLMDialogueActsExtractor,
+)
 
-Avoid importing heavy submodules at package import time to keep test
-collection lightweight; import submodules explicitly when needed.
-"""
-
-__all__ = ["LLMDialogueActsExtractor"]
+__all__ = [
+    "LLMDialogueActsExtractor",
+]
diff --git a/usersimcrs/utils/simulation_utils.py b/usersimcrs/utils/simulation_utils.py
index b0ed0c9f..6121723e 100644
--- a/usersimcrs/utils/simulation_utils.py
+++ b/usersimcrs/utils/simulation_utils.py
@@ -142,12 +142,9 @@ def _get_agenda_based_simulator_config(
 
     ratings = Ratings(item_collection)
     ratings.load_ratings_csv(file_path=config["ratings"].get())
-    raw = config["historical_ratings_ratio"].get()
-    if raw is None:
-        historical_ratio = 0.8
-    else:
-        historical_ratio = float(raw)
-    historical_ratings, _ = ratings.create_split(historical_ratio)
+    historical_ratings, _ = ratings.create_split(
+        config["historical_ratings_ratio"].get(0.8)
+    )
 
     preference_model = SimplePreferenceModel(
         domain,

From a1fac9f593f508df97d8f20cea900c5f8878c081 Mon Sep 17 00:00:00 2001
From: Ksenia Blokhina <kseniablokhina@MacBook-Pro-Ksenia.local>
Date: Tue, 3 Mar 2026 14:59:49 +0100
Subject: [PATCH 17/38] improvement/233-create-classes-for-metrics remove
 changes

---
 scripts/evaluation/quality_metric.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/evaluation/quality_metric.py b/scripts/evaluation/quality_metric.py
index a6e0475f..5a2323fd 100644
--- a/scripts/evaluation/quality_metric.py
+++ b/scripts/evaluation/quality_metric.py
@@ -135,8 +135,8 @@ def evaluate_dialogue(
             response = response.replace("\\", "\\\\")
             response_dict = json.loads(response)
             return float(response_dict["score"])
-        except Exception as e:
+        except Exception:
             raise ValueError(
                 f"Failed to get score for {aspect} dialogue "
                 f"{dialogue.conversation_id}: {response}"
-            ) from e
+            )

From 4ed5727280def14d42ddd615d3196d7e6d872879 Mon Sep 17 00:00:00 2001
From: Ksenia Blokhina <kseniablokhina@MacBook-Pro-Ksenia.local>
Date: Tue, 3 Mar 2026 15:32:21 +0100
Subject: [PATCH 18/38] move file

---
 scripts/evaluation/satisfaction_metric.py     | 44 +++++++++++++++++++
 usersimcrs/evaluation/__init__.py             |  5 +++
 .../evaluation/base_metric.py                 |  0
 3 files changed, 49 insertions(+)
 create mode 100644 scripts/evaluation/satisfaction_metric.py
 create mode 100644 usersimcrs/evaluation/__init__.py
 rename {scripts => usersimcrs}/evaluation/base_metric.py (100%)

diff --git a/scripts/evaluation/satisfaction_metric.py b/scripts/evaluation/satisfaction_metric.py
new file mode 100644
index 00000000..78bb6754
--- /dev/null
+++ b/scripts/evaluation/satisfaction_metric.py
@@ -0,0 +1,44 @@
+"""Satisfaction metric class implementation.
+
+Wraps DialogueKit's satisfaction classifier into a `BaseMetric` class.
+"""
+
+from typing import Any, Optional
+
+from dialoguekit.core.dialogue import Dialogue
+from dialoguekit.nlu.models.satisfaction_classifier import (
+    SatisfactionClassifierSVM,
+)
+import argparse
+
+from evaluation.base_metric import BaseMetric
+
+
+class SatisfactionMetric(BaseMetric):
+    def __init__(
+        self,
+        classifier: Optional[SatisfactionClassifierSVM] = None,
+        name: str = "satisfaction",
+    ):
+        super().__init__(name)
+        self.classifier = classifier or SatisfactionClassifierSVM()
+
+    def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float:
+        """Computes the satisfaction score for a single dialogue."""
+        return self.classifier.classify_last_n_dialogue(dialogue, last_n=None)
+
+    @staticmethod
+    def parse_args() -> argparse.Namespace:
+        """Parses command-line arguments.
+
+        Returns:
+            Parsed arguments.
+        """
+        parser = argparse.ArgumentParser()
+        parser.add_argument(
+            "--dialogues",
+            type=str,
+            required=True,
+            help="Path to the dialogues.",
+        )
+        return parser.parse_args()
diff --git a/usersimcrs/evaluation/__init__.py b/usersimcrs/evaluation/__init__.py
new file mode 100644
index 00000000..c55a4339
--- /dev/null
+++ b/usersimcrs/evaluation/__init__.py
@@ -0,0 +1,5 @@
+"""Evaluation metrics for dialogue systems."""
+
+from usersimcrs.evaluation.base_metric import BaseMetric
+
+__all__ = ["BaseMetric"]
diff --git a/scripts/evaluation/base_metric.py b/usersimcrs/evaluation/base_metric.py
similarity index 100%
rename from scripts/evaluation/base_metric.py
rename to usersimcrs/evaluation/base_metric.py

From e6f0ef37e7309c248f4d315da958b111dae97db8 Mon Sep 17 00:00:00 2001
From: Ksenia Blokhina <kseniablokhina@MacBook-Pro-Ksenia.local>
Date: Tue, 3 Mar 2026 15:33:47 +0100
Subject: [PATCH 19/38] move file

---
 scripts/evaluation/satisfaction_metric.py | 44 -----------------------
 1 file changed, 44 deletions(-)
 delete mode 100644 scripts/evaluation/satisfaction_metric.py

diff --git a/scripts/evaluation/satisfaction_metric.py b/scripts/evaluation/satisfaction_metric.py
deleted file mode 100644
index 78bb6754..00000000
--- a/scripts/evaluation/satisfaction_metric.py
+++ /dev/null
@@ -1,44 +0,0 @@
-"""Satisfaction metric class implementation.
-
-Wraps DialogueKit's satisfaction classifier into a `BaseMetric` class.
-"""
-
-from typing import Any, Optional
-
-from dialoguekit.core.dialogue import Dialogue
-from dialoguekit.nlu.models.satisfaction_classifier import (
-    SatisfactionClassifierSVM,
-)
-import argparse
-
-from evaluation.base_metric import BaseMetric
-
-
-class SatisfactionMetric(BaseMetric):
-    def __init__(
-        self,
-        classifier: Optional[SatisfactionClassifierSVM] = None,
-        name: str = "satisfaction",
-    ):
-        super().__init__(name)
-        self.classifier = classifier or SatisfactionClassifierSVM()
-
-    def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float:
-        """Computes the satisfaction score for a single dialogue."""
-        return self.classifier.classify_last_n_dialogue(dialogue, last_n=None)
-
-    @staticmethod
-    def parse_args() -> argparse.Namespace:
-        """Parses command-line arguments.
-
-        Returns:
-            Parsed arguments.
-        """
-        parser = argparse.ArgumentParser()
-        parser.add_argument(
-            "--dialogues",
-            type=str,
-            required=True,
-            help="Path to the dialogues.",
-        )
-        return parser.parse_args()

From 431ae4041630683b38bd8a989b1d380b80e74c04 Mon Sep 17 00:00:00 2001
From: Ksenia Blokhina <kseniablokhina@MacBook-Pro-Ksenia.local>
Date: Tue, 3 Mar 2026 19:33:56 +0100
Subject: [PATCH 20/38] refactoring

---
 scripts/evaluation/satisfaction_metric.py     |  44 ---
 tests/evaluation/test_quality_metric.py       |  27 +-
 tests/evaluation/test_satisfaction_metric.py  |   2 +-
 tests/evaluation/test_utility_metric.py       | 131 +++++---
 .../evaluation/quality_metric.py              |  80 ++---
 .../evaluation}/quality_rubrics.py            |   0
 usersimcrs/evaluation/satisfaction_metric.py  |  27 ++
 .../evaluation/utility_metric.py              | 287 ++++++++----------
 8 files changed, 273 insertions(+), 325 deletions(-)
 delete mode 100644 scripts/evaluation/satisfaction_metric.py
 rename {scripts => usersimcrs}/evaluation/quality_metric.py (56%)
 rename {scripts/evaluation/rubrics => usersimcrs/evaluation}/quality_rubrics.py (100%)
 create mode 100644 usersimcrs/evaluation/satisfaction_metric.py
 rename {scripts => usersimcrs}/evaluation/utility_metric.py (55%)

diff --git a/scripts/evaluation/satisfaction_metric.py b/scripts/evaluation/satisfaction_metric.py
deleted file mode 100644
index e774ae11..00000000
--- a/scripts/evaluation/satisfaction_metric.py
+++ /dev/null
@@ -1,44 +0,0 @@
-"""Satisfaction metric class implementation.
-
-Wraps DialogueKit's satisfaction classifier into a `BaseMetric` class.
-"""
-
-from typing import Any, Optional
-
-from dialoguekit.core.dialogue import Dialogue
-from dialoguekit.nlu.models.satisfaction_classifier import (
-    SatisfactionClassifierSVM,
-)
-import argparse
-
-from scripts.evaluation.base_metric import BaseMetric
-
-
-class SatisfactionMetric(BaseMetric):
-    def __init__(
-        self,
-        classifier: Optional[SatisfactionClassifierSVM] = None,
-        name: str = "satisfaction",
-    ):
-        super().__init__(name)
-        self.classifier = classifier or SatisfactionClassifierSVM()
-
-    def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float:
-        """Computes the satisfaction score for a single dialogue."""
-        return self.classifier.classify_last_n_dialogue(dialogue, last_n=None)
-
-    @staticmethod
-    def parse_args() -> argparse.Namespace:
-        """Parses command-line arguments.
-
-        Returns:
-            Parsed arguments.
-        """
-        parser = argparse.ArgumentParser()
-        parser.add_argument(
-            "--dialogues",
-            type=str,
-            required=True,
-            help="Path to the dialogues.",
-        )
-        return parser.parse_args()
diff --git a/tests/evaluation/test_quality_metric.py b/tests/evaluation/test_quality_metric.py
index d9882577..c6dac1b3 100644
--- a/tests/evaluation/test_quality_metric.py
+++ b/tests/evaluation/test_quality_metric.py
@@ -1,9 +1,9 @@
 """Tests for QualityMetric."""
 
-from unittest.mock import MagicMock, patch
+from unittest.mock import MagicMock
 import pytest
 from dialoguekit.utils.dialogue_reader import json_to_dialogues
-from scripts.evaluation.quality_metric import QualityMetric
+from usersimcrs.evaluation.quality_metric import QualityMetric
 
 
 @pytest.fixture
@@ -17,8 +17,8 @@ def dialogues():
 
 
 @pytest.fixture
-def mock_ollama():
-    """Mock Ollama LLM interface."""
+def mock_llm_interface():
+    """Mock LLM interface."""
     interface = MagicMock()
     interface.get_llm_api_response.return_value = (
         '{"score": 4, "score_explanation": "good"}'
@@ -27,35 +27,32 @@ def mock_ollama():
 
 
 @pytest.fixture
-def metric(mock_ollama):
-    with patch.object(
-        QualityMetric, "_get_ollama_interface", return_value=mock_ollama
-    ):
-        yield QualityMetric(ollama_config_path="dummy_config.json")
+def metric(mock_llm_interface):
+    return QualityMetric(llm_interface=mock_llm_interface)
 
 
 def test_evaluate_dialogue(
-    metric: QualityMetric, mock_ollama, dialogues
+    metric: QualityMetric, mock_llm_interface, dialogues
 ) -> None:
     """Test evaluate_dialogue returns score for REC_RELEVANCE aspect."""
     dialogue = dialogues[0]
     score = metric.evaluate_dialogue(dialogue, aspect="REC_RELEVANCE")
     assert score == 4.0
-    assert mock_ollama.get_llm_api_response.call_count == 1
+    assert mock_llm_interface.get_llm_api_response.call_count == 1
 
 
 def test_evaluate_dialogue_different_aspect(
-    metric: QualityMetric, mock_ollama, dialogues
+    metric: QualityMetric, mock_llm_interface, dialogues
 ) -> None:
     """Test evaluate_dialogue with FLUENCY aspect."""
     dialogue = dialogues[0]
     score = metric.evaluate_dialogue(dialogue, aspect="FLUENCY")
     assert score == 4.0
-    assert mock_ollama.get_llm_api_response.call_count == 1
+    assert mock_llm_interface.get_llm_api_response.call_count == 1
 
 
 def test_evaluate_dialogues(
-    metric: QualityMetric, mock_ollama, dialogues
+    metric: QualityMetric, mock_llm_interface, dialogues
 ) -> None:
     """Test evaluate_dialogues with for COM_STYLE aspect."""
     result = metric.evaluate_dialogues(dialogues, aspect="COM_STYLE")
@@ -63,4 +60,4 @@ def test_evaluate_dialogues(
     for dialogue in dialogues:
         assert dialogue.conversation_id in result
         assert result[dialogue.conversation_id] == 4.0
-    assert mock_ollama.get_llm_api_response.call_count == len(dialogues)
+    assert mock_llm_interface.get_llm_api_response.call_count == len(dialogues)
diff --git a/tests/evaluation/test_satisfaction_metric.py b/tests/evaluation/test_satisfaction_metric.py
index 4d48c3dc..787c175b 100644
--- a/tests/evaluation/test_satisfaction_metric.py
+++ b/tests/evaluation/test_satisfaction_metric.py
@@ -3,7 +3,7 @@
 from unittest.mock import MagicMock
 import pytest
 from dialoguekit.utils.dialogue_reader import json_to_dialogues
-from scripts.evaluation.satisfaction_metric import SatisfactionMetric
+from usersimcrs.evaluation.satisfaction_metric import SatisfactionMetric
 
 
 @pytest.fixture
diff --git a/tests/evaluation/test_utility_metric.py b/tests/evaluation/test_utility_metric.py
index ec16d66c..37eedb25 100644
--- a/tests/evaluation/test_utility_metric.py
+++ b/tests/evaluation/test_utility_metric.py
@@ -1,4 +1,4 @@
-"""Tests for UtilityMetric."""
+"""Tests for utility metric classes."""
 
 from unittest.mock import patch
 
@@ -6,7 +6,11 @@
 
 from dialoguekit.utils.dialogue_reader import json_to_dialogues
 
-from scripts.evaluation.utility_metric import UtilityMetric
+from usersimcrs.evaluation.utility_metric import (
+    RewardPerDialogueLengthMetric,
+    SuccessRateMetric,
+    SuccessfulRecommendationRoundRatioMetric,
+)
 
 
 @pytest.fixture
@@ -19,57 +23,96 @@ def dialogues():
     )
 
 
-FIXED_UTILITY = {
-    "success": 1.0,
-    "successful_recommendation_round_ratio": 0.5,
-    "reward_per_dialogue_length": 0.1,
-}
+@pytest.fixture
+def success_rate_metric():
+    return SuccessRateMetric(
+        user_nlu_config_path="dummy_user_nlu.yaml",
+        agent_nlu_config_path="dummy_agent_nlu.yaml",
+    )
+
+
+@pytest.fixture
+def successful_round_ratio_metric():
+    return SuccessfulRecommendationRoundRatioMetric(
+        user_nlu_config_path="dummy_user_nlu.yaml",
+        agent_nlu_config_path="dummy_agent_nlu.yaml",
+    )
 
 
 @pytest.fixture
-def metric():
-    """UtilityMetric returning fixed metrics."""
-    with patch.object(
-        UtilityMetric, "_get_utility_metrics", return_value=FIXED_UTILITY
+def reward_per_dialogue_length_metric():
+    return RewardPerDialogueLengthMetric(
+        user_nlu_config_path="dummy_user_nlu.yaml",
+        agent_nlu_config_path="dummy_agent_nlu.yaml",
+    )
+
+
+def test_success_rate_evaluate_dialogue(
+    success_rate_metric: SuccessRateMetric, dialogues
+) -> None:
+    """Test SuccessRateMetric.evaluate_dialogue."""
+    dialogue = dialogues[0]
+    with (
+        patch.object(
+            SuccessRateMetric, "_prepare", return_value=(dialogue, [], [], [])
+        ),
+        patch.object(SuccessRateMetric, "_assess_dialogue", return_value=1),
     ):
-        yield UtilityMetric(
-            user_nlu_config_path="dummy_user_nlu.yaml",
-            agent_nlu_config_path="dummy_agent_nlu.yaml",
-        )
+        assert success_rate_metric.evaluate_dialogue(dialogue) == 1.0
 
 
-def test_evaluate_dialogue(metric: UtilityMetric, dialogues) -> None:
-    """Test evaluate_dialogue returns selected metric."""
+def test_success_rate_evaluate_dialogue_unsuccessful(
+    success_rate_metric: SuccessRateMetric, dialogues
+) -> None:
+    """Test SuccessRateMetric.evaluate_dialogue for failed dialogue."""
     dialogue = dialogues[0]
-    assert metric.evaluate_dialogue(dialogue) == 1.0
-    assert metric.evaluate_dialogue(dialogue, metric="success") == 1.0
-    assert (
-        metric.evaluate_dialogue(
-            dialogue, metric="successful_recommendation_round_ratio"
-        )
-        == 0.5
-    )
-    assert (
-        metric.evaluate_dialogue(dialogue, metric="reward_per_dialogue_length")
-        == 0.1
-    )
+    with (
+        patch.object(
+            SuccessRateMetric, "_prepare", return_value=(dialogue, [], [], [])
+        ),
+        patch.object(SuccessRateMetric, "_assess_dialogue", return_value=0),
+    ):
+        assert success_rate_metric.evaluate_dialogue(dialogue) == 0.0
 
 
-def test_evaluate_dialogues(metric: UtilityMetric, dialogues) -> None:
-    """Test evaluate_dialogues returns conversation_id -> metric value."""
-    result = metric.evaluate_dialogues(dialogues)
-    assert len(result) == len(dialogues)
-    for dialogue in dialogues:
-        assert dialogue.conversation_id in result
-        assert result[dialogue.conversation_id] == 1.0
+def test_successful_recommendation_round_ratio_evaluate_dialogue(
+    successful_round_ratio_metric: SuccessfulRecommendationRoundRatioMetric,
+    dialogues,
+) -> None:
+    """Test SuccessfulRecommendationRoundRatioMetric.evaluate_dialogue."""
+    dialogue = dialogues[0]
+    with (
+        patch.object(
+            SuccessfulRecommendationRoundRatioMetric,
+            "_prepare",
+            return_value=(dialogue, [], [], []),
+        ),
+        patch.object(
+            SuccessfulRecommendationRoundRatioMetric,
+            "_assess_dialogue",
+            return_value=(1, 2),
+        ),
+    ):
+        assert successful_round_ratio_metric.evaluate_dialogue(dialogue) == 0.5
 
 
-def test_evaluate_dialogues_with_specified_metric(
-    metric: UtilityMetric, dialogues
+def test_reward_per_dialogue_length_evaluate_dialogue(
+    reward_per_dialogue_length_metric: RewardPerDialogueLengthMetric, dialogues
 ) -> None:
-    """Test evaluate_dialogues with specified metric."""
-    result = metric.evaluate_dialogues(
-        dialogues, metric="successful_recommendation_round_ratio"
-    )
-    for dialogue in dialogues:
-        assert result[dialogue.conversation_id] == 0.5
+    """Test RewardPerDialogueLengthMetric.evaluate_dialogue."""
+    dialogue = dialogues[0]
+    with (
+        patch.object(
+            RewardPerDialogueLengthMetric,
+            "_prepare",
+            return_value=(dialogue, [], [], []),
+        ),
+        patch.object(
+            RewardPerDialogueLengthMetric,
+            "_assess_dialogue",
+            return_value=(1, 10),
+        ),
+    ):
+        assert (
+            reward_per_dialogue_length_metric.evaluate_dialogue(dialogue) == 0.1
+        )
diff --git a/scripts/evaluation/quality_metric.py b/usersimcrs/evaluation/quality_metric.py
similarity index 56%
rename from scripts/evaluation/quality_metric.py
rename to usersimcrs/evaluation/quality_metric.py
index 5a2323fd..04e7bf94 100644
--- a/scripts/evaluation/quality_metric.py
+++ b/usersimcrs/evaluation/quality_metric.py
@@ -1,4 +1,4 @@
-"""Script to evaluate dialogue quality using an LLM.
+"""LLM-based dialogue quality evaluation.
 
 The script evaluates dialogue quality with regards to five aspects:
 - Recommendation relevance
@@ -11,19 +11,16 @@
 dedicated rubric. The scoring is done using a large language model.
 """
 
-import argparse
 import json
-from typing import Any, Optional, TYPE_CHECKING
-
-if TYPE_CHECKING:
-    pass
+import logging
+from typing import Any
 
 from dialoguekit.core.dialogue import Dialogue
 from dialoguekit.participant.participant import DialogueParticipant
 
-from scripts.evaluation.base_metric import BaseMetric
-from scripts.evaluation.rubrics.quality_rubrics import QualityRubrics
-from usersimcrs.llm_interfaces.ollama_interface import OllamaLLMInterface
+from usersimcrs.evaluation.base_metric import BaseMetric
+from usersimcrs.evaluation.quality_rubrics import QualityRubrics
+from usersimcrs.llm_interfaces.llm_interface import LLMInterface
 
 
 _PROMPT_EVAL_INTRO = (
@@ -42,50 +39,11 @@
 class QualityMetric(BaseMetric):
     def __init__(
         self,
-        ollama_config_path: str,
-        default_response: str = "",
+        llm_interface: LLMInterface,
         name: str = "quality",
     ) -> None:
         super().__init__(name)
-        self.ollama_config_path = ollama_config_path
-        self.default_response = default_response
-        self._ollama_interface: Optional[OllamaLLMInterface] = None
-
-    @staticmethod
-    def parse_args() -> argparse.Namespace:
-        """Parse command-line arguments.
-
-        Returns:
-            Parsed arguments.
-        """
-        parser = argparse.ArgumentParser()
-        parser.add_argument(
-            "--dialogues",
-            type=str,
-            required=True,
-            help="Path to the dialogues.",
-        )
-        parser.add_argument(
-            "--ollama_config",
-            type=str,
-            required=True,
-            help="Path to the Ollama config file.",
-        )
-        parser.add_argument(
-            "--output",
-            type=str,
-            help="(optional) Path to the output file.",
-        )
-        return parser.parse_args()
-
-    def _get_ollama_interface(self) -> OllamaLLMInterface:
-        """Returns Ollama LLM interface."""
-        if self._ollama_interface is None:
-            self._ollama_interface = OllamaLLMInterface(
-                self.ollama_config_path,
-                default_response=self.default_response,
-            )
-        return self._ollama_interface
+        self.llm_interface = llm_interface
 
     def _get_prompt(
         self, grading_rubric: QualityRubrics, dialogue: Dialogue
@@ -125,18 +83,26 @@ def evaluate_dialogue(
             Score (1-5) for the specified aspect.
 
         Raises:
-            ValueError: When the LLM response cannot be parsed.
+            KeyError: When the aspect does not exist in QualityRubrics.
         """
-        aspect_enum = QualityRubrics[aspect]
-        ollama_interface = self._get_ollama_interface()
+        try:
+            aspect_enum = QualityRubrics[aspect]
+        except KeyError:
+            supported = [e.name for e in QualityRubrics]
+            raise KeyError(
+                f"Unknown aspect '{aspect}'. Supported aspects: {supported}"
+            )
         prompt = self._get_prompt(aspect_enum, dialogue)
-        response = ollama_interface.get_llm_api_response(prompt)
+        response = self.llm_interface.get_llm_api_response(prompt)
         try:
             response = response.replace("\\", "\\\\")
             response_dict = json.loads(response)
             return float(response_dict["score"])
         except Exception:
-            raise ValueError(
-                f"Failed to get score for {aspect} dialogue "
-                f"{dialogue.conversation_id}: {response}"
+            logging.warning(
+                "Failed to parse LLM response for %s dialogue %s: %s",
+                aspect,
+                dialogue.conversation_id,
+                response,
             )
+            return 0.0
diff --git a/scripts/evaluation/rubrics/quality_rubrics.py b/usersimcrs/evaluation/quality_rubrics.py
similarity index 100%
rename from scripts/evaluation/rubrics/quality_rubrics.py
rename to usersimcrs/evaluation/quality_rubrics.py
diff --git a/usersimcrs/evaluation/satisfaction_metric.py b/usersimcrs/evaluation/satisfaction_metric.py
new file mode 100644
index 00000000..d05fdbbb
--- /dev/null
+++ b/usersimcrs/evaluation/satisfaction_metric.py
@@ -0,0 +1,27 @@
+"""Satisfaction metric class implementation.
+
+Satisfaction assessment based on DialogueKit classifier.
+"""
+
+from typing import Any
+
+from dialoguekit.core.dialogue import Dialogue
+from dialoguekit.nlu.models.satisfaction_classifier import (
+    SatisfactionClassifier,
+)
+
+from usersimcrs.evaluation.base_metric import BaseMetric
+
+
+class SatisfactionMetric(BaseMetric):
+    def __init__(
+        self,
+        classifier: SatisfactionClassifier,
+        name: str = "satisfaction",
+    ):
+        super().__init__(name)
+        self.classifier = classifier
+
+    def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float:
+        """Computes the satisfaction score for a single dialogue."""
+        return self.classifier.classify_last_n_dialogue(dialogue, last_n=None)
diff --git a/scripts/evaluation/utility_metric.py b/usersimcrs/evaluation/utility_metric.py
similarity index 55%
rename from scripts/evaluation/utility_metric.py
rename to usersimcrs/evaluation/utility_metric.py
index 88617683..c9932d50 100644
--- a/scripts/evaluation/utility_metric.py
+++ b/usersimcrs/evaluation/utility_metric.py
@@ -1,33 +1,32 @@
 """Utility metric class implementation.
 
-Encapsulates the logic from `utility_evaluation.py` into a `BaseMetric`.
+Computes three  utility metrics:
+
+- Success Rate (SR)
+- Successful Recommendation Round Ratio (SRRR)
+- Reward-per-Dialogue-Length (RDL)
 """
 
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, List, Optional, Tuple
 
 from confuse import Configuration
-import argparse
 
 from dialoguekit.core.annotated_utterance import AnnotatedUtterance
 from dialoguekit.core.dialogue import Dialogue
 from dialoguekit.core.intent import Intent
 from dialoguekit.nlu.nlu import NLU
 from dialoguekit.participant.participant import DialogueParticipant
-from usersimcrs.utils.simulation_utils import get_NLU
-from scripts.evaluation.base_metric import BaseMetric
 
+from usersimcrs.evaluation.base_metric import BaseMetric
+from usersimcrs.utils.simulation_utils import get_NLU
 
-class UtilityMetric(BaseMetric):
-    """Computes utility metrics for dialogues.
-
-    Constructor takes paths to user and agent NLU configuration files.
-    """
 
+class UtilityMetricBase(BaseMetric):
     def __init__(
         self,
         user_nlu_config_path: str,
         agent_nlu_config_path: str,
-        name: str = "utility",
+        name: str,
     ):
         super().__init__(name)
         self.user_nlu_config_path = user_nlu_config_path
@@ -35,54 +34,6 @@ def __init__(
         self._user_nlu: Optional[NLU] = None
         self._agent_nlu: Optional[NLU] = None
 
-    @classmethod
-    def parse_args(self) -> argparse.Namespace:
-        """Parses command-line arguments.
-
-        Returns:
-            Parsed command-line arguments.
-        """
-        parser = argparse.ArgumentParser(prog="utility_evaluation.py")
-        parser.add_argument(
-            "annotated_dialogues",
-            type=str,
-            help="Annotated dialogues JSON file.",
-        )
-        parser.add_argument(
-            "user_nlu_config",
-            type=str,
-            help="User NLU configuration file.",
-        )
-        parser.add_argument(
-            "agent_nlu_config",
-            type=str,
-            help="Agent NLU configuration file.",
-        )
-        parser.add_argument(
-            "--reject_intent_labels",
-            nargs="+",
-            default=["REJ"],
-            help="Intent labels corresponding to rejection.",
-        )
-        parser.add_argument(
-            "--accept_intent_labels",
-            nargs="+",
-            default=["ACC"],
-            help="Intent labels corresponding to acceptance.",
-        )
-        parser.add_argument(
-            "--recommendation_intent_labels",
-            nargs="+",
-            default=["REC-S", "REC-E"],
-            help="Intent labels corresponding to recommendation.",
-        )
-        parser.add_argument(
-            "--output",
-            type=str,
-            help="Output file to save annotated dialogues with utility metrics",
-        )
-        return parser.parse_args()
-
     def _annotate_dialogue(
         self, dialogue: Dialogue, user_nlu: NLU, agent_nlu: NLU
     ) -> Dialogue:
@@ -119,26 +70,6 @@ def _annotate_dialogue(
                 )
         return dialogue
 
-    def _annotate_dialogues(
-        self, dialogues: List[Dialogue], user_nlu: NLU, agent_nlu: NLU
-    ) -> List[Dialogue]:
-        """Annotates dialogues with dialogue acts.
-
-        Args:
-            dialogues: Dialogues.
-            user_nlu: User NLU module.
-            agent_nlu: Agent NLU module.
-
-        Returns:
-            Annotated dialogues.
-        """
-        # TODO: Move this to DialogueKit
-        # See: https://github.com/iai-group/UserSimCRS/issues/219
-        return [
-            self._annotate_dialogue(dialogue, user_nlu, agent_nlu)
-            for dialogue in dialogues
-        ]
-
     def _get_recommendation_rounds(
         self, dialogue: Dialogue, recommendation_intents: List[Intent]
     ) -> List[List[AnnotatedUtterance]]:
@@ -191,48 +122,6 @@ def _is_recommendation_accepted(
                     return False
         return b_accepted
 
-    def _assess_dialogue(
-        self,
-        dialogue: Dialogue,
-        recommendation_intents: List[Intent],
-        acceptance_intents: List[Intent],
-        rejection_intents: List[Intent],
-    ) -> Tuple[int, int, int]:
-        """Assesses the utility of the dialogue.
-
-        Args:
-            dialogue: Dialogue.
-            recommendation_intents: Intents corresponding to recommendation.
-            acceptance_intents: Intents corresponding to acceptance.
-            rejection_intents: Intents corresponding to rejection.
-
-        Returns:
-            Tuple of number of accepted recommendations, successful
-                recommendation rounds and total recommendation rounds.
-        """
-        # TODO: Optimize overall assessment to avoid multiple iterations over
-        # utterances.
-        rounds = self._get_recommendation_rounds(
-            dialogue, recommendation_intents
-        )
-        successful_rounds = 0
-        for round in rounds:
-            if self._is_recommendation_accepted(
-                round, acceptance_intents, rejection_intents
-            ):
-                successful_rounds += 1
-
-        nb_accepted_recommendations = sum(
-            1
-            for utterance in dialogue.utterances
-            if utterance.participant == DialogueParticipant.USER
-            and any(
-                intent in acceptance_intents
-                for intent in utterance.get_intents()
-            )
-        )
-        return nb_accepted_recommendations, successful_rounds, len(rounds)
-
     def _load_nlus(self) -> Tuple[NLU, NLU]:
         """Returns (cached) user and agent NLU modules."""
         if self._user_nlu is None:
@@ -260,55 +149,125 @@ def _get_intent_lists(self, **kwargs: Any) -> Tuple[List[Intent], ...]:
             [Intent(label) for label in rej_labels],
         )
 
-    def _get_utility_metrics(
+    def _prepare(
         self, dialogue: Dialogue, **kwargs: Any
-    ) -> Dict[str, float]:
-        """Returns full utility dict for one dialogue."""
+    ) -> Tuple[Dialogue, List[Intent], List[Intent], List[Intent]]:
+        """Annotates dialogue.
+
+        Returns:
+            dialogue
+            rec_intents: Recommendation intents.
+            acc_intents: Acceptance intents.
+            rej_intents: Rejection intents.
+        """
         user_nlu, agent_nlu = self._load_nlus()
         self._annotate_dialogue(dialogue, user_nlu, agent_nlu)
-        (
-            recommendation_intents,
-            acceptance_intents,
-            rejection_intents,
-        ) = self._get_intent_lists(**kwargs)
-        (
-            nb_accepted_recommendations,
-            successful_rounds,
-            total_rounds,
-        ) = self._assess_dialogue(
-            dialogue,
-            recommendation_intents,
-            acceptance_intents,
-            rejection_intents,
+        rec, acc, rej = self._get_intent_lists(**kwargs)
+        return dialogue, rec, acc, rej
+
+    def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float:
+        """Computes the metric for a single dialogue."""
+        raise NotImplementedError()
+
+
+class SuccessRateMetric(UtilityMetricBase):
+    def __init__(
+        self,
+        user_nlu_config_path: str,
+        agent_nlu_config_path: str,
+        name: str = "success_rate",
+    ):
+        super().__init__(user_nlu_config_path, agent_nlu_config_path, name)
+
+    def _assess_dialogue(
+        self,
+        dialogue: Dialogue,
+        recommendation_intents: List[Intent],
+        acceptance_intents: List[Intent],
+        rejection_intents: List[Intent],
+    ) -> int:
+        """Returns number of successful recommendation rounds."""
+        rounds = self._get_recommendation_rounds(
+            dialogue, recommendation_intents
+        )
+        return sum(
+            1
+            for round_utterances in rounds
+            if self._is_recommendation_accepted(
+                round_utterances, acceptance_intents, rejection_intents
+            )
         )
-        return {
-            "success": float(successful_rounds > 0),
-            "successful_recommendation_round_ratio": (
-                successful_rounds / total_rounds if total_rounds > 0 else 0.0
-            ),
-            "reward_per_dialogue_length": (
-                nb_accepted_recommendations / len(dialogue.utterances)
-                if dialogue.utterances
-                else 0.0
-            ),
-        }
 
     def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float:
-        """Computes one utility metric for a dialogue.
+        dlg, rec, acc, rej = self._prepare(dialogue, **kwargs)
+        successful_rounds = self._assess_dialogue(dlg, rec, acc, rej)
+        return float(successful_rounds > 0)
 
-        Args:
-            dialogue: Dialogue to evaluate.
-            metric: One of "success", "successful_recommendation_round_ratio",
-                "reward_per_dialogue_length". Default "success".
 
-        Returns:
-            The selected metric value as float.
-        """
-        metrics = self._get_utility_metrics(dialogue, **kwargs)
-        metric = kwargs.get("metric", "success")
-        if metric not in metrics:
-            raise ValueError(
-                f"Unknown metric '{metric}'. "
-                f"Expected one of {list(metrics.keys())}"
+class SuccessfulRecommendationRoundRatioMetric(UtilityMetricBase):
+    def __init__(
+        self,
+        user_nlu_config_path: str,
+        agent_nlu_config_path: str,
+        name: str = "successful_recommendation_round_ratio",
+    ):
+        super().__init__(user_nlu_config_path, agent_nlu_config_path, name)
+
+    def _assess_dialogue(
+        self,
+        dialogue: Dialogue,
+        recommendation_intents: List[Intent],
+        acceptance_intents: List[Intent],
+        rejection_intents: List[Intent],
+    ) -> Tuple[int, int]:
+        """Returns successful rounds and total rounds."""
+        rounds = self._get_recommendation_rounds(
+            dialogue, recommendation_intents
+        )
+        successful_rounds = sum(
+            1
+            for round_utterances in rounds
+            if self._is_recommendation_accepted(
+                round_utterances, acceptance_intents, rejection_intents
             )
-        return metrics[metric]
+        )
+        return successful_rounds, len(rounds)
+
+    def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float:
+        dlg, rec, acc, rej = self._prepare(dialogue, **kwargs)
+        successful_rounds, total_rounds = self._assess_dialogue(
+            dlg, rec, acc, rej
+        )
+        return successful_rounds / total_rounds if total_rounds > 0 else 0.0
+
+
+class RewardPerDialogueLengthMetric(UtilityMetricBase):
+    def __init__(
+        self,
+        user_nlu_config_path: str,
+        agent_nlu_config_path: str,
+        name: str = "reward_per_dialogue_length",
+    ):
+        super().__init__(user_nlu_config_path, agent_nlu_config_path, name)
+
+    def _assess_dialogue(
+        self, dialogue: Dialogue, acceptance_intents: List[Intent]
+    ) -> Tuple[int, int]:
+        """Returns accepted recommendations and dialogue length."""
+        nb_accepted_recommendations = sum(
+            1
+            for utterance in dialogue.utterances
+            if utterance.participant == DialogueParticipant.USER
+            and any(
+                intent in acceptance_intents
+                for intent in utterance.get_intents()
+            )
+        )
+        return nb_accepted_recommendations, len(dialogue.utterances)
+
+    def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float:
+        dlg, _, acc, _ = self._prepare(dialogue, **kwargs)
+        nb_accepted_recommendations, dialogue_length = self._assess_dialogue(
+            dlg, acc
+        )
+        return nb_accepted_recommendations / dialogue_length

From 773822d8d3a4c7fe9ca6ea46c05c2edd18832ddc Mon Sep 17 00:00:00 2001
From: Ksenia Blokhina <kseniablokhina@MacBook-Pro-Ksenia.local>
Date: Tue, 10 Mar 2026 13:42:41 +0100
Subject: [PATCH 21/38] Split utility metrics into modules; add dialogue annotation helpers

---
 tests/evaluation/test_utility_metric.py       |  36 ++-
 usersimcrs/evaluation/dialogue_annotation.py  | 211 ++++++++++++++
 usersimcrs/evaluation/quality_metric.py       |  33 ++-
 .../reward_per_dialogue_length_metric.py      |  87 ++++++
 usersimcrs/evaluation/satisfaction_metric.py  |  12 +-
 usersimcrs/evaluation/success_rate_metric.py  |  93 ++++++
 ...ssful_recommendation_round_ratio_metric.py |  97 +++++++
 usersimcrs/evaluation/utility_metric.py       | 273 ------------------
 8 files changed, 542 insertions(+), 300 deletions(-)
 create mode 100644 usersimcrs/evaluation/dialogue_annotation.py
 create mode 100644 usersimcrs/evaluation/reward_per_dialogue_length_metric.py
 create mode 100644 usersimcrs/evaluation/success_rate_metric.py
 create mode 100644 usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py
 delete mode 100644 usersimcrs/evaluation/utility_metric.py

diff --git a/tests/evaluation/test_utility_metric.py b/tests/evaluation/test_utility_metric.py
index 37eedb25..4862ba6c 100644
--- a/tests/evaluation/test_utility_metric.py
+++ b/tests/evaluation/test_utility_metric.py
@@ -1,17 +1,21 @@
 """Tests for utility metric classes."""
 
-from unittest.mock import patch
+from unittest.mock import MagicMock, patch
 
 import pytest
 
 from dialoguekit.utils.dialogue_reader import json_to_dialogues
 
-from usersimcrs.evaluation.utility_metric import (
+from usersimcrs.evaluation.reward_per_dialogue_length_metric import (
     RewardPerDialogueLengthMetric,
-    SuccessRateMetric,
+)
+from usersimcrs.evaluation.success_rate_metric import SuccessRateMetric
+from usersimcrs.evaluation.successful_recommendation_round_ratio_metric import (
     SuccessfulRecommendationRoundRatioMetric,
 )
 
+_MOCK_NLU = MagicMock()
+
 
 @pytest.fixture
 def dialogues():
@@ -53,8 +57,9 @@ def test_success_rate_evaluate_dialogue(
     """Test SuccessRateMetric.evaluate_dialogue."""
     dialogue = dialogues[0]
     with (
-        patch.object(
-            SuccessRateMetric, "_prepare", return_value=(dialogue, [], [], [])
+        patch(
+            "usersimcrs.evaluation.success_rate_metric.prepare_dialogue",
+            return_value=(dialogue, [], [], [], _MOCK_NLU, _MOCK_NLU),
         ),
         patch.object(SuccessRateMetric, "_assess_dialogue", return_value=1),
     ):
@@ -67,8 +72,9 @@ def test_success_rate_evaluate_dialogue_unsuccessful(
     """Test SuccessRateMetric.evaluate_dialogue for failed dialogue."""
     dialogue = dialogues[0]
     with (
-        patch.object(
-            SuccessRateMetric, "_prepare", return_value=(dialogue, [], [], [])
+        patch(
+            "usersimcrs.evaluation.success_rate_metric.prepare_dialogue",
+            return_value=(dialogue, [], [], [], _MOCK_NLU, _MOCK_NLU),
         ),
         patch.object(SuccessRateMetric, "_assess_dialogue", return_value=0),
     ):
@@ -82,10 +88,10 @@ def test_successful_recommendation_round_ratio_evaluate_dialogue(
     """Test SuccessfulRecommendationRoundRatioMetric.evaluate_dialogue."""
     dialogue = dialogues[0]
     with (
-        patch.object(
-            SuccessfulRecommendationRoundRatioMetric,
-            "_prepare",
-            return_value=(dialogue, [], [], []),
+        patch(
+            "usersimcrs.evaluation.successful_recommendation_round_ratio_metric"
+            ".prepare_dialogue",
+            return_value=(dialogue, [], [], [], _MOCK_NLU, _MOCK_NLU),
         ),
         patch.object(
             SuccessfulRecommendationRoundRatioMetric,
@@ -102,10 +108,10 @@ def test_reward_per_dialogue_length_evaluate_dialogue(
     """Test RewardPerDialogueLengthMetric.evaluate_dialogue."""
     dialogue = dialogues[0]
     with (
-        patch.object(
-            RewardPerDialogueLengthMetric,
-            "_prepare",
-            return_value=(dialogue, [], [], []),
+        patch(
+            "usersimcrs.evaluation.reward_per_dialogue_length_metric"
+            ".prepare_dialogue",
+            return_value=(dialogue, [], [], [], _MOCK_NLU, _MOCK_NLU),
         ),
         patch.object(
             RewardPerDialogueLengthMetric,
diff --git a/usersimcrs/evaluation/dialogue_annotation.py b/usersimcrs/evaluation/dialogue_annotation.py
new file mode 100644
index 00000000..06bc2572
--- /dev/null
+++ b/usersimcrs/evaluation/dialogue_annotation.py
@@ -0,0 +1,211 @@
+"""Dialogue annotation and recommendation round utilities.
+
+Provides functions for annotating dialogues with dialogue acts using NLU
+modules, parsing intent labels, and extracting recommendation rounds from
+annotated dialogues.
+"""
+
+from typing import Any, List, Optional, Tuple
+
+from confuse import Configuration
+
+from dialoguekit.core.annotated_utterance import AnnotatedUtterance
+from dialoguekit.core.dialogue import Dialogue
+from dialoguekit.core.intent import Intent
+from dialoguekit.nlu.nlu import NLU
+from dialoguekit.participant.participant import DialogueParticipant
+
+from usersimcrs.utils.simulation_utils import get_NLU
+
+
+def annotate_dialogue(
+    dialogue: Dialogue, user_nlu: NLU, agent_nlu: NLU
+) -> Dialogue:
+    """Annotates utterances with dialogue acts, in place.
+
+    Each utterance that is not already an AnnotatedUtterance is converted to
+    one. Utterances that already carry dialogue acts are left untouched.
+
+    Args:
+        dialogue: Dialogue to be annotated.
+        user_nlu: NLU module for user utterances.
+        agent_nlu: NLU module for agent utterances.
+
+    Raises:
+        ValueError: If an utterance has an unknown participant.
+
+    Returns:
+        The same dialogue object with annotated utterances.
+    """
+    for i, utterance in enumerate(dialogue.utterances):
+        if not isinstance(utterance, AnnotatedUtterance):
+            # Rebind the loop variable so that the checks below read the
+            # converted utterance rather than the original plain one.
+            utterance = AnnotatedUtterance.from_utterance(utterance)
+            dialogue.utterances[i] = utterance
+
+        # Existing annotations take precedence over the NLU output.
+        if utterance.dialogue_acts:
+            continue
+
+        if utterance.participant == DialogueParticipant.USER:
+            nlu = user_nlu
+        elif utterance.participant == DialogueParticipant.AGENT:
+            nlu = agent_nlu
+        else:
+            raise ValueError(f"Unknown participant: {utterance.participant}")
+        # `utterance` is the object stored in the dialogue at index i.
+        utterance.dialogue_acts = nlu.extract_dialogue_acts(utterance)
+    return dialogue
+
+
+def load_nlus(
+    user_nlu_config_path: str,
+    agent_nlu_config_path: str,
+    cached_user_nlu: Optional[NLU] = None,
+    cached_agent_nlu: Optional[NLU] = None,
+) -> Tuple[NLU, NLU]:
+    """Returns the user and agent NLU modules, building missing ones.
+
+    Args:
+        user_nlu_config_path: Path to user NLU configuration file.
+        agent_nlu_config_path: Path to agent NLU configuration file.
+        cached_user_nlu: User NLU module to reuse; built if None.
+        cached_agent_nlu: Agent NLU module to reuse; built if None.
+
+    Returns:
+        Tuple of (user_nlu, agent_nlu) modules.
+    """
+    modules: List[NLU] = []
+    # The user module is resolved first, then the agent module.
+    for app_name, config_path, nlu in (
+        ("User NLU Configuration", user_nlu_config_path, cached_user_nlu),
+        ("Agent NLU Configuration", agent_nlu_config_path, cached_agent_nlu),
+    ):
+        if nlu is None:
+            config = Configuration(app_name)
+            config.set_file(config_path)
+            nlu = get_NLU(config)
+        modules.append(nlu)
+    return modules[0], modules[1]
+
+
+def get_intent_lists(
+    **kwargs: Any,
+) -> Tuple[List[Intent], List[Intent], List[Intent]]:
+    """Converts intent labels into Intent objects, grouped by role.
+
+    Args:
+        **kwargs: Optional label overrides; other keys are ignored.
+            - recommendation_intent_labels: defaults to ``["REC-S", "REC-E"]``.
+            - acceptance_intent_labels: defaults to ``["ACC"]``.
+            - rejection_intent_labels: defaults to ``["REJ"]``.
+
+    Returns:
+        Recommendation, acceptance, and rejection intents, in that order.
+    """
+    # Keyword argument name and default labels for each role, in the order
+    # in which the intent lists are returned.
+    defaults = (
+        ("recommendation_intent_labels", ["REC-S", "REC-E"]),
+        ("acceptance_intent_labels", ["ACC"]),
+        ("rejection_intent_labels", ["REJ"]),
+    )
+    rec_intents, acc_intents, rej_intents = (
+        [Intent(label) for label in kwargs.get(key, fallback)]
+        for key, fallback in defaults
+    )
+    return rec_intents, acc_intents, rej_intents
+
+
+def get_recommendation_rounds(
+    dialogue: Dialogue, recommendation_intents: List[Intent]
+) -> List[List[AnnotatedUtterance]]:
+    """Splits a dialogue into recommendation rounds.
+
+    A new round begins each time an utterance contains a recommendation
+    intent. Utterances preceding the first recommendation form a group too.
+
+    Args:
+        dialogue: Annotated dialogue.
+        recommendation_intents: Intents that signal a recommendation.
+
+    Returns:
+        List of non-empty utterance groups in dialogue order.
+    """
+    rounds: List[List[AnnotatedUtterance]] = []
+    current_round: List[AnnotatedUtterance] = []
+    for utterance in dialogue.utterances:
+        intents = utterance.get_intents()
+        if any(intent in intents for intent in recommendation_intents):
+            # A recommendation closes the running group and opens a new one.
+            if current_round:
+                rounds.append(current_round)
+            current_round = []
+        current_round.append(utterance)
+    if current_round:  # Flush the trailing round; it is not closed otherwise.
+        rounds.append(current_round)
+    return rounds
+
+
+def prepare_dialogue(
+    dialogue: Dialogue,
+    user_nlu_config_path: str,
+    agent_nlu_config_path: str,
+    cached_user_nlu: Optional[NLU] = None,
+    cached_agent_nlu: Optional[NLU] = None,
+    **kwargs: Any,
+) -> Tuple[Dialogue, List[Intent], List[Intent], List[Intent], NLU, NLU]:
+    """Runs the full preparation pipeline for a single dialogue.
+
+    Chains :func:`load_nlus`, :func:`annotate_dialogue`, and
+    :func:`get_intent_lists`, in that order.
+
+    Args:
+        dialogue: Dialogue to prepare; it is annotated in place.
+        user_nlu_config_path: Path to user NLU configuration file.
+        agent_nlu_config_path: Path to agent NLU configuration file.
+        cached_user_nlu: User NLU module to reuse instead of loading one.
+        cached_agent_nlu: Agent NLU module to reuse instead of loading one.
+        **kwargs: Optional intent label overrides forwarded to
+            :func:`get_intent_lists`.
+
+    Returns:
+        Tuple of (annotated dialogue, recommendation intents,
+        acceptance intents, rejection intents, user NLU, agent NLU).
+    """
+    user_nlu, agent_nlu = load_nlus(
+        user_nlu_config_path,
+        agent_nlu_config_path,
+        cached_user_nlu=cached_user_nlu,
+        cached_agent_nlu=cached_agent_nlu,
+    )
+    annotate_dialogue(dialogue, user_nlu=user_nlu, agent_nlu=agent_nlu)
+    recommendation, acceptance, rejection = get_intent_lists(**kwargs)
+    return dialogue, recommendation, acceptance, rejection, user_nlu, agent_nlu
+
+
+def is_recommendation_accepted(
+    round_utterances: List[AnnotatedUtterance],
+    acceptance_intents: List[Intent],
+    rejection_intents: List[Intent],
+) -> bool:
+    """Checks whether the user accepted the recommendation in a round.
+
+    Args:
+        round_utterances: Utterances in the recommendation round.
+        acceptance_intents: Intents corresponding to acceptance.
+        rejection_intents: Intents corresponding to rejection.
+
+    Returns:
+        True if a user utterance accepts and none rejects without accepting.
+    """
+    accepted = False
+    for utterance in round_utterances:
+        is_user = utterance.participant == DialogueParticipant.USER
+        intents = utterance.get_intents() if is_user else []
+        if any(intent in acceptance_intents for intent in intents):
+            accepted = True
+        elif any(intent in rejection_intents for intent in intents):
+            return False
+    return accepted
diff --git a/usersimcrs/evaluation/quality_metric.py b/usersimcrs/evaluation/quality_metric.py
index 04e7bf94..3198c77b 100644
--- a/usersimcrs/evaluation/quality_metric.py
+++ b/usersimcrs/evaluation/quality_metric.py
@@ -13,7 +13,7 @@
 
 import json
 import logging
-from typing import Any
+from typing import Any, Literal
 
 from dialoguekit.core.dialogue import Dialogue
 from dialoguekit.participant.participant import DialogueParticipant
@@ -42,6 +42,12 @@ def __init__(
         llm_interface: LLMInterface,
         name: str = "quality",
     ) -> None:
+        """Initializes the quality metric.
+
+        Args:
+            llm_interface: LLM interface used for scoring.
+            name: Metric name.
+        """
         super().__init__(name)
         self.llm_interface = llm_interface
 
@@ -71,19 +77,28 @@ def _get_prompt(
         return prompt
 
     def evaluate_dialogue(
-        self, dialogue: Dialogue, aspect: str, **kwargs: Any
+        self,
+        dialogue: Dialogue,
+        aspect: Literal[
+            "REC_RELEVANCE",
+            "COM_STYLE",
+            "FLUENCY",
+            "CONV_FLOW",
+            "OVERALL_SAT",
+        ],
+        **kwargs: Any,
     ) -> float:
         """Returns score for a single aspect of a dialogue.
 
         Args:
             dialogue: Dialogue to evaluate.
-            aspect: Aspect to evaluate. Must be one of QualityRubrics enum names
-
-        Returns:
-            Score (1-5) for the specified aspect.
+            aspect: Aspect to evaluate. One of QualityRubrics enum names.
 
         Raises:
             KeyError: When the aspect does not exist in QualityRubrics.
+
+        Returns:
+            Score (1-5) for the specified aspect.
         """
         try:
             aspect_enum = QualityRubrics[aspect]
@@ -100,9 +115,7 @@ def evaluate_dialogue(
             return float(response_dict["score"])
         except Exception:
             logging.warning(
-                "Failed to parse LLM response for %s dialogue %s: %s",
-                aspect,
-                dialogue.conversation_id,
-                response,
+                f"Failed to parse LLM response for {aspect} dialogue "
+                f"{dialogue.conversation_id}: {response}",
             )
             return 0.0
diff --git a/usersimcrs/evaluation/reward_per_dialogue_length_metric.py b/usersimcrs/evaluation/reward_per_dialogue_length_metric.py
new file mode 100644
index 00000000..c094e7a7
--- /dev/null
+++ b/usersimcrs/evaluation/reward_per_dialogue_length_metric.py
@@ -0,0 +1,87 @@
+"""Reward-per-Dialogue-Length (RDL) metric implementation.
+
+Evaluates the ratio of accepted recommendations to total dialogue length.
+"""
+
+from typing import Any, List, Optional, Tuple
+
+from dialoguekit.core.dialogue import Dialogue
+from dialoguekit.core.intent import Intent
+from dialoguekit.nlu.nlu import NLU
+from dialoguekit.participant.participant import DialogueParticipant
+
+from usersimcrs.evaluation.base_metric import BaseMetric
+from usersimcrs.evaluation.dialogue_annotation import prepare_dialogue
+
+
+class RewardPerDialogueLengthMetric(BaseMetric):
+    """Measures accepted recommendations relative to dialogue length.
+
+    Returns the number of accepted recommendations divided by the total number
+    of utterances.
+    """
+
+    def __init__(
+        self,
+        user_nlu_config_path: str,
+        agent_nlu_config_path: str,
+        name: str = "reward_per_dialogue_length",
+    ) -> None:
+        """Initializes the reward-per-dialogue-length metric.
+
+        Args:
+            user_nlu_config_path: Path to user NLU configuration.
+            agent_nlu_config_path: Path to agent NLU configuration.
+            name: Metric name.
+        """
+        super().__init__(name)
+        self._user_nlu_config_path = user_nlu_config_path
+        self._agent_nlu_config_path = agent_nlu_config_path
+        self._user_nlu: Optional[NLU] = None
+        self._agent_nlu: Optional[NLU] = None
+
+    def _assess_dialogue(
+        self, dialogue: Dialogue, acceptance_intents: List[Intent]
+    ) -> Tuple[int, int]:
+        """Counts accepting user turns and measures dialogue length.
+
+        Args:
+            dialogue: Annotated dialogue.
+            acceptance_intents: Intents treated as acceptance.
+
+        Returns:
+            Tuple of (accepting user utterances, total utterances).
+        """
+        accepted_count = 0
+        for turn in dialogue.utterances:
+            if turn.participant != DialogueParticipant.USER:
+                continue
+            # A user turn counts once, however many acceptance intents it has.
+            turn_intents = turn.get_intents()
+            if any(label in acceptance_intents for label in turn_intents):
+                accepted_count += 1
+
+        return accepted_count, len(dialogue.utterances)
+
+    def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float:
+        """Computes the reward-per-dialogue-length score.
+
+        Args:
+            dialogue: Dialogue to evaluate.
+            **kwargs: Optional intent label overrides.
+
+        Returns:
+            Ratio of accepted recommendations to total utterances.
+        """
+        dlg, _, acc, _, self._user_nlu, self._agent_nlu = prepare_dialogue(
+            dialogue,
+            self._user_nlu_config_path,
+            self._agent_nlu_config_path,
+            self._user_nlu,
+            self._agent_nlu,
+            **kwargs,
+        )
+        nb_accepted_recommendations, dialogue_length = self._assess_dialogue(
+            dlg, acc
+        )
+        return nb_accepted_recommendations / dialogue_length
diff --git a/usersimcrs/evaluation/satisfaction_metric.py b/usersimcrs/evaluation/satisfaction_metric.py
index d05fdbbb..664f91c0 100644
--- a/usersimcrs/evaluation/satisfaction_metric.py
+++ b/usersimcrs/evaluation/satisfaction_metric.py
@@ -18,10 +18,18 @@ def __init__(
         self,
         classifier: SatisfactionClassifier,
         name: str = "satisfaction",
-    ):
+    ) -> None:
+        """Initializes the satisfaction metric.
+
+        Args:
+            classifier: Satisfaction classifier instance.
+            name: Metric name.
+        """
         super().__init__(name)
         self.classifier = classifier
 
     def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float:
         """Computes the satisfaction score for a single dialogue."""
-        return self.classifier.classify_last_n_dialogue(dialogue, last_n=None)
+        return float(
+            self.classifier.classify_last_n_dialogue(dialogue, last_n=None)
+        )
diff --git a/usersimcrs/evaluation/success_rate_metric.py b/usersimcrs/evaluation/success_rate_metric.py
new file mode 100644
index 00000000..3c689c71
--- /dev/null
+++ b/usersimcrs/evaluation/success_rate_metric.py
@@ -0,0 +1,93 @@
+"""Success Rate (SR) metric implementation.
+
+Evaluates whether at least one recommendation was accepted during a dialogue.
+"""
+
+from typing import Any, List, Optional
+
+from dialoguekit.core.dialogue import Dialogue
+from dialoguekit.core.intent import Intent
+from dialoguekit.nlu.nlu import NLU
+
+from usersimcrs.evaluation.base_metric import BaseMetric
+from usersimcrs.evaluation.dialogue_annotation import (
+    get_recommendation_rounds,
+    is_recommendation_accepted,
+    prepare_dialogue,
+)
+
+
+class SuccessRateMetric(BaseMetric):
+    """Measures whether a dialogue contains at least one accepted
+    recommendation.
+
+    Returns 1.0 if at least one recommendation round was successful,
+    0.0 otherwise.
+    """
+
+    def __init__(
+        self,
+        user_nlu_config_path: str,
+        agent_nlu_config_path: str,
+        name: str = "success_rate",
+    ) -> None:
+        """Initializes the success rate metric.
+
+        Args:
+            user_nlu_config_path: Path to user NLU configuration.
+            agent_nlu_config_path: Path to agent NLU configuration.
+            name: Metric name.
+        """
+        super().__init__(name)
+        self._user_nlu_config_path = user_nlu_config_path
+        self._agent_nlu_config_path = agent_nlu_config_path
+        self._user_nlu: Optional[NLU] = None
+        self._agent_nlu: Optional[NLU] = None
+
+    def _assess_dialogue(
+        self,
+        dialogue: Dialogue,
+        recommendation_intents: List[Intent],
+        acceptance_intents: List[Intent],
+        rejection_intents: List[Intent],
+    ) -> int:
+        """Returns number of successful recommendation rounds.
+
+        Args:
+            dialogue: Annotated dialogue.
+            recommendation_intents: Intents that signal a recommendation.
+            acceptance_intents: Intents that signal acceptance.
+            rejection_intents: Intents that signal rejection.
+
+        Returns:
+            Number of recommendation rounds that were accepted.
+        """
+        rounds = get_recommendation_rounds(dialogue, recommendation_intents)
+        return sum(
+            1
+            for round_utterances in rounds
+            if is_recommendation_accepted(
+                round_utterances, acceptance_intents, rejection_intents
+            )
+        )
+
+    def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float:
+        """Computes the success rate for a single dialogue.
+
+        Args:
+            dialogue: Dialogue to evaluate.
+            **kwargs: Optional intent label overrides.
+
+        Returns:
+            1.0 if at least one recommendation was accepted, 0.0 otherwise.
+        """
+        dlg, rec, acc, rej, self._user_nlu, self._agent_nlu = prepare_dialogue(
+            dialogue,
+            self._user_nlu_config_path,
+            self._agent_nlu_config_path,
+            self._user_nlu,
+            self._agent_nlu,
+            **kwargs,
+        )
+        successful_rounds = self._assess_dialogue(dlg, rec, acc, rej)
+        return float(successful_rounds > 0)
diff --git a/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py b/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py
new file mode 100644
index 00000000..b8de7013
--- /dev/null
+++ b/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py
@@ -0,0 +1,97 @@
+"""Successful Recommendation Round Ratio (SRRR) metric implementation.
+
+Evaluates the ratio of accepted recommendation rounds to total recommendation
+rounds in a dialogue.
+"""
+
+from typing import Any, List, Optional, Tuple
+
+from dialoguekit.core.dialogue import Dialogue
+from dialoguekit.core.intent import Intent
+from dialoguekit.nlu.nlu import NLU
+
+from usersimcrs.evaluation.base_metric import BaseMetric
+from usersimcrs.evaluation.dialogue_annotation import (
+    get_recommendation_rounds,
+    is_recommendation_accepted,
+    prepare_dialogue,
+)
+
+
+class SuccessfulRecommendationRoundRatioMetric(BaseMetric):
+    """Measures the fraction of recommendation rounds that were accepted.
+
+    Returns a value between 0.0 and 1.0 (or 0.0 when there are no recommendation
+    rounds).
+    """
+
+    def __init__(
+        self,
+        user_nlu_config_path: str,
+        agent_nlu_config_path: str,
+        name: str = "successful_recommendation_round_ratio",
+    ) -> None:
+        """Initializes the successful recommendation round ratio metric.
+
+        Args:
+            user_nlu_config_path: Path to user NLU configuration.
+            agent_nlu_config_path: Path to agent NLU configuration.
+            name: Metric name.
+        """
+        super().__init__(name)
+        self._user_nlu_config_path = user_nlu_config_path
+        self._agent_nlu_config_path = agent_nlu_config_path
+        self._user_nlu: Optional[NLU] = None
+        self._agent_nlu: Optional[NLU] = None
+
+    def _assess_dialogue(
+        self,
+        dialogue: Dialogue,
+        recommendation_intents: List[Intent],
+        acceptance_intents: List[Intent],
+        rejection_intents: List[Intent],
+    ) -> Tuple[int, int]:
+        """Counts accepted rounds alongside all recommendation rounds.
+
+        Args:
+            dialogue: Annotated dialogue.
+            recommendation_intents: Intents that signal a recommendation.
+            acceptance_intents: Intents treated as acceptance.
+            rejection_intents: Intents treated as rejection.
+
+        Returns:
+            Tuple of (successful_rounds, total_rounds).
+        """
+        rounds = get_recommendation_rounds(dialogue, recommendation_intents)
+        accepted_count = 0
+        for turns in rounds:
+            if is_recommendation_accepted(
+                turns, acceptance_intents, rejection_intents
+            ):
+                accepted_count += 1
+
+        return accepted_count, len(rounds)
+
+    def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float:
+        """Computes the successful recommendation round ratio.
+
+        Args:
+            dialogue: Dialogue to evaluate.
+            **kwargs: Optional intent label overrides.
+
+        Returns:
+            Ratio of accepted recommendation rounds to total rounds,
+            or 0.0 if there are no recommendation rounds.
+        """
+        dlg, rec, acc, rej, self._user_nlu, self._agent_nlu = prepare_dialogue(
+            dialogue,
+            self._user_nlu_config_path,
+            self._agent_nlu_config_path,
+            self._user_nlu,
+            self._agent_nlu,
+            **kwargs,
+        )
+        successful_rounds, total_rounds = self._assess_dialogue(
+            dlg, rec, acc, rej
+        )
+        return successful_rounds / total_rounds if total_rounds > 0 else 0.0
diff --git a/usersimcrs/evaluation/utility_metric.py b/usersimcrs/evaluation/utility_metric.py
deleted file mode 100644
index c9932d50..00000000
--- a/usersimcrs/evaluation/utility_metric.py
+++ /dev/null
@@ -1,273 +0,0 @@
-"""Utility metric class implementation.
-
-Computes three  utility metrics:
-
-- Success Rate (SR)
-- Successful Recommendation Round Ratio (SRRR)
-- Reward-per-Dialogue-Length (RDL)
-"""
-
-from typing import Any, List, Optional, Tuple
-
-from confuse import Configuration
-
-from dialoguekit.core.annotated_utterance import AnnotatedUtterance
-from dialoguekit.core.dialogue import Dialogue
-from dialoguekit.core.intent import Intent
-from dialoguekit.nlu.nlu import NLU
-from dialoguekit.participant.participant import DialogueParticipant
-
-from usersimcrs.evaluation.base_metric import BaseMetric
-from usersimcrs.utils.simulation_utils import get_NLU
-
-
-class UtilityMetricBase(BaseMetric):
-    def __init__(
-        self,
-        user_nlu_config_path: str,
-        agent_nlu_config_path: str,
-        name: str,
-    ):
-        super().__init__(name)
-        self.user_nlu_config_path = user_nlu_config_path
-        self.agent_nlu_config_path = agent_nlu_config_path
-        self._user_nlu: Optional[NLU] = None
-        self._agent_nlu: Optional[NLU] = None
-
-    def _annotate_dialogue(
-        self, dialogue: Dialogue, user_nlu: NLU, agent_nlu: NLU
-    ) -> Dialogue:
-        """Annotates utterances with dialogue acts.
-
-        Args:
-            dialogue: Dialogue to be annotated.
-            user_nlu: User NLU module.
-            agent_nlu: Agent NLU module.
-
-        Returns:
-            Annotated dialogue.
-        """
-        for i, utterance in enumerate(dialogue.utterances):
-            if not isinstance(utterance, AnnotatedUtterance):
-                dialogue.utterances[i] = AnnotatedUtterance.from_utterance(
-                    utterance
-                )
-
-            if len(utterance.dialogue_acts) > 0:
-                continue
-
-            if utterance.participant == DialogueParticipant.USER:
-                dialogue.utterances[
-                    i
-                ].dialogue_acts = user_nlu.extract_dialogue_acts(utterance)
-            elif utterance.participant == DialogueParticipant.AGENT:
-                dialogue.utterances[
-                    i
-                ].dialogue_acts = agent_nlu.extract_dialogue_acts(utterance)
-            else:
-                raise ValueError(
-                    f"Unknown participant: {utterance.participant}"
-                )
-        return dialogue
-
-    def _get_recommendation_rounds(
-        self, dialogue: Dialogue, recommendation_intents: List[Intent]
-    ) -> List[List[AnnotatedUtterance]]:
-        """Gets utterances per recommendation round.
-
-        Args:
-            dialogue: Dialogue.
-            recommendation_intents: Intents corresponding to recommendation.
-
-        Returns:
-            Utterances per recommendation round.
-        """
-        rounds: List[List[AnnotatedUtterance]] = []
-        current_round: List[AnnotatedUtterance] = []
-        for utterance in dialogue.utterances:
-            if any(
-                intent in utterance.get_intents()
-                for intent in recommendation_intents
-            ):
-                if current_round:
-                    rounds.append(current_round)
-                current_round = [utterance]
-            else:
-                current_round.append(utterance)
-        return rounds
-
-    def _is_recommendation_accepted(
-        self,
-        round: List[AnnotatedUtterance],
-        acceptance_intents: List[Intent],
-        rejection_intents: List[Intent],
-    ) -> bool:
-        """Assesses whether the recommendation was accepted.
-
-        Args:
-            round: Utterances in recommendation round.
-            acceptance_intents: Intents corresponding to acceptance.
-            rejection_intents: Intents corresponding to rejection.
-
-        Returns:
-            True if the recommendation was accepted, False otherwise.
-        """
-        b_accepted = False
-        for utterance in round:
-            if utterance.participant == DialogueParticipant.USER:
-                intents = utterance.get_intents()
-                if any(intent in acceptance_intents for intent in intents):
-                    b_accepted = True
-                elif any(intent in rejection_intents for intent in intents):
-                    return False
-        return b_accepted
-
-    def _load_nlus(self) -> Tuple[NLU, NLU]:
-        """Returns (cached) user and agent NLU modules."""
-        if self._user_nlu is None:
-            # NLU module for user utterances
-            user_nlu_config = Configuration("User NLU Configuration")
-            user_nlu_config.set_file(self.user_nlu_config_path)
-            self._user_nlu = get_NLU(user_nlu_config)
-        if self._agent_nlu is None:
-            # NLU module for agent utterances
-            agent_nlu_config = Configuration("Agent NLU Configuration")
-            agent_nlu_config.set_file(self.agent_nlu_config_path)
-            self._agent_nlu = get_NLU(agent_nlu_config)
-        return self._user_nlu, self._agent_nlu
-
-    def _get_intent_lists(self, **kwargs: Any) -> Tuple[List[Intent], ...]:
-        """Builds intent lists from kwargs."""
-        rec_labels = kwargs.get(
-            "recommendation_intent_labels", ["REC-S", "REC-E"]
-        )
-        acc_labels = kwargs.get("acceptance_intent_labels", ["ACC"])
-        rej_labels = kwargs.get("rejection_intent_labels", ["REJ"])
-        return (
-            [Intent(label) for label in rec_labels],
-            [Intent(label) for label in acc_labels],
-            [Intent(label) for label in rej_labels],
-        )
-
-    def _prepare(
-        self, dialogue: Dialogue, **kwargs: Any
-    ) -> Tuple[Dialogue, List[Intent], List[Intent], List[Intent]]:
-        """Annotates dialogue.
-
-        Returns:
-            dialogue
-            rec_intents: Recommendation intents.
-            acc_intents: Acceptance intents.
-            rej_intents: Rejection intents.
-        """
-        user_nlu, agent_nlu = self._load_nlus()
-        self._annotate_dialogue(dialogue, user_nlu, agent_nlu)
-        rec, acc, rej = self._get_intent_lists(**kwargs)
-        return dialogue, rec, acc, rej
-
-    def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float:
-        """Computes the metric for a single dialogue."""
-        raise NotImplementedError()
-
-
-class SuccessRateMetric(UtilityMetricBase):
-    def __init__(
-        self,
-        user_nlu_config_path: str,
-        agent_nlu_config_path: str,
-        name: str = "success_rate",
-    ):
-        super().__init__(user_nlu_config_path, agent_nlu_config_path, name)
-
-    def _assess_dialogue(
-        self,
-        dialogue: Dialogue,
-        recommendation_intents: List[Intent],
-        acceptance_intents: List[Intent],
-        rejection_intents: List[Intent],
-    ) -> int:
-        """Returns number of successful recommendation rounds."""
-        rounds = self._get_recommendation_rounds(
-            dialogue, recommendation_intents
-        )
-        return sum(
-            1
-            for round_utterances in rounds
-            if self._is_recommendation_accepted(
-                round_utterances, acceptance_intents, rejection_intents
-            )
-        )
-
-    def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float:
-        dlg, rec, acc, rej = self._prepare(dialogue, **kwargs)
-        successful_rounds = self._assess_dialogue(dlg, rec, acc, rej)
-        return float(successful_rounds > 0)
-
-
-class SuccessfulRecommendationRoundRatioMetric(UtilityMetricBase):
-    def __init__(
-        self,
-        user_nlu_config_path: str,
-        agent_nlu_config_path: str,
-        name: str = "successful_recommendation_round_ratio",
-    ):
-        super().__init__(user_nlu_config_path, agent_nlu_config_path, name)
-
-    def _assess_dialogue(
-        self,
-        dialogue: Dialogue,
-        recommendation_intents: List[Intent],
-        acceptance_intents: List[Intent],
-        rejection_intents: List[Intent],
-    ) -> Tuple[int, int]:
-        """Returns successful rounds and total rounds."""
-        rounds = self._get_recommendation_rounds(
-            dialogue, recommendation_intents
-        )
-        successful_rounds = sum(
-            1
-            for round_utterances in rounds
-            if self._is_recommendation_accepted(
-                round_utterances, acceptance_intents, rejection_intents
-            )
-        )
-        return successful_rounds, len(rounds)
-
-    def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float:
-        dlg, rec, acc, rej = self._prepare(dialogue, **kwargs)
-        successful_rounds, total_rounds = self._assess_dialogue(
-            dlg, rec, acc, rej
-        )
-        return successful_rounds / total_rounds if total_rounds > 0 else 0.0
-
-
-class RewardPerDialogueLengthMetric(UtilityMetricBase):
-    def __init__(
-        self,
-        user_nlu_config_path: str,
-        agent_nlu_config_path: str,
-        name: str = "reward_per_dialogue_length",
-    ):
-        super().__init__(user_nlu_config_path, agent_nlu_config_path, name)
-
-    def _assess_dialogue(
-        self, dialogue: Dialogue, acceptance_intents: List[Intent]
-    ) -> Tuple[int, int]:
-        """Returns accepted recommendations and dialogue length."""
-        nb_accepted_recommendations = sum(
-            1
-            for utterance in dialogue.utterances
-            if utterance.participant == DialogueParticipant.USER
-            and any(
-                intent in acceptance_intents
-                for intent in utterance.get_intents()
-            )
-        )
-        return nb_accepted_recommendations, len(dialogue.utterances)
-
-    def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float:
-        dlg, _, acc, _ = self._prepare(dialogue, **kwargs)
-        nb_accepted_recommendations, dialogue_length = self._assess_dialogue(
-            dlg, acc
-        )
-        return nb_accepted_recommendations / dialogue_length

From 5e1b3a6c4d25e0530f5cc13b44d10bcf63efbed6 Mon Sep 17 00:00:00 2001
From: Ksenia Blokhina <kseniablokhina@MacBook-Pro-Ksenia.local>
Date: Tue, 10 Mar 2026 13:47:38 +0100
Subject: [PATCH 22/38] Remove class docstrings and metric abbreviations

---
 .../evaluation/reward_per_dialogue_length_metric.py      | 8 +-------
 usersimcrs/evaluation/success_rate_metric.py             | 9 +--------
 .../successful_recommendation_round_ratio_metric.py      | 8 +-------
 3 files changed, 3 insertions(+), 22 deletions(-)

diff --git a/usersimcrs/evaluation/reward_per_dialogue_length_metric.py b/usersimcrs/evaluation/reward_per_dialogue_length_metric.py
index c094e7a7..463a5508 100644
--- a/usersimcrs/evaluation/reward_per_dialogue_length_metric.py
+++ b/usersimcrs/evaluation/reward_per_dialogue_length_metric.py
@@ -1,4 +1,4 @@
-"""Reward-per-Dialogue-Length (RDL) metric implementation.
+"""Reward-per-Dialogue-Length metric implementation.
 
 Evaluates the ratio of accepted recommendations to total dialogue length.
 """
@@ -15,12 +15,6 @@
 
 
 class RewardPerDialogueLengthMetric(BaseMetric):
-    """Measures accepted recommendations relative to dialogue length.
-
-    Returns the number of accepted recommendations divided by the total number
-    of utterances.
-    """
-
     def __init__(
         self,
         user_nlu_config_path: str,
diff --git a/usersimcrs/evaluation/success_rate_metric.py b/usersimcrs/evaluation/success_rate_metric.py
index 3c689c71..478a7afa 100644
--- a/usersimcrs/evaluation/success_rate_metric.py
+++ b/usersimcrs/evaluation/success_rate_metric.py
@@ -1,4 +1,4 @@
-"""Success Rate (SR) metric implementation.
+"""Success Rate metric implementation.
 
 Evaluates whether at least one recommendation was accepted during a dialogue.
 """
@@ -18,13 +18,6 @@
 
 
 class SuccessRateMetric(BaseMetric):
-    """Measures whether a dialogue contains at least one accepted
-    recommendation.
-
-    Returns 1.0 if at least one recommendation round was successful,
-    0.0 otherwise.
-    """
-
     def __init__(
         self,
         user_nlu_config_path: str,
diff --git a/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py b/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py
index b8de7013..9d2156c0 100644
--- a/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py
+++ b/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py
@@ -1,4 +1,4 @@
-"""Successful Recommendation Round Ratio (SRRR) metric implementation.
+"""Successful Recommendation Round Ratio metric implementation.
 
 Evaluates the ratio of accepted recommendation rounds to total recommendation
 rounds in a dialogue.
@@ -19,12 +19,6 @@
 
 
 class SuccessfulRecommendationRoundRatioMetric(BaseMetric):
-    """Measures the fraction of recommendation rounds that were accepted.
-
-    Returns a value between 0.0 and 1.0 (or 0.0 when there are no recommendation
-    rounds).
-    """
-
     def __init__(
         self,
         user_nlu_config_path: str,

From 558b5885907908b8957df473d38e944f17583075 Mon Sep 17 00:00:00 2001
From: Ksenia Blokhina <kseniablokhina@MacBook-Pro-Ksenia.local>
Date: Tue, 10 Mar 2026 15:09:58 +0100
Subject: [PATCH 23/38] Support pre-annotated dialogues in utility metrics

---
 tests/evaluation/test_utility_metric.py       |  4 +-
 usersimcrs/evaluation/dialogue_annotation.py  | 21 ++++++++
 .../reward_per_dialogue_length_metric.py      | 46 ++++++++++------
 usersimcrs/evaluation/success_rate_metric.py  | 52 ++++++++++++-------
 ...ssful_recommendation_round_ratio_metric.py | 38 ++++++++++----
 5 files changed, 114 insertions(+), 47 deletions(-)

diff --git a/tests/evaluation/test_utility_metric.py b/tests/evaluation/test_utility_metric.py
index 4862ba6c..e15f7bc3 100644
--- a/tests/evaluation/test_utility_metric.py
+++ b/tests/evaluation/test_utility_metric.py
@@ -61,7 +61,7 @@ def test_success_rate_evaluate_dialogue(
             "usersimcrs.evaluation.success_rate_metric.prepare_dialogue",
             return_value=(dialogue, [], [], [], _MOCK_NLU, _MOCK_NLU),
         ),
-        patch.object(SuccessRateMetric, "_assess_dialogue", return_value=1),
+        patch.object(SuccessRateMetric, "_assess_dialogue", return_value=True),
     ):
         assert success_rate_metric.evaluate_dialogue(dialogue) == 1.0
 
@@ -76,7 +76,7 @@ def test_success_rate_evaluate_dialogue_unsuccessful(
             "usersimcrs.evaluation.success_rate_metric.prepare_dialogue",
             return_value=(dialogue, [], [], [], _MOCK_NLU, _MOCK_NLU),
         ),
-        patch.object(SuccessRateMetric, "_assess_dialogue", return_value=0),
+        patch.object(SuccessRateMetric, "_assess_dialogue", return_value=False),
     ):
         assert success_rate_metric.evaluate_dialogue(dialogue) == 0.0
 
diff --git a/usersimcrs/evaluation/dialogue_annotation.py b/usersimcrs/evaluation/dialogue_annotation.py
index 06bc2572..e431beab 100644
--- a/usersimcrs/evaluation/dialogue_annotation.py
+++ b/usersimcrs/evaluation/dialogue_annotation.py
@@ -118,6 +118,27 @@ def get_intent_lists(
     )
 
 
+def annotate_dialogues(
+    dialogues: List[Dialogue],
+    user_nlu_config_path: str,
+    agent_nlu_config_path: str,
+) -> List[Dialogue]:
+    """Annotates every dialogue in a batch with one shared pair of NLUs.
+
+    Args:
+        dialogues: Dialogues to annotate.
+        user_nlu_config_path: Path to user NLU configuration file.
+        agent_nlu_config_path: Path to agent NLU configuration file.
+
+    Returns:
+        The same list that was passed in, after annotating each dialogue.
+    """
+    nlus = load_nlus(user_nlu_config_path, agent_nlu_config_path)
+    for dlg in dialogues:
+        annotate_dialogue(dlg, *nlus)
+    return dialogues
+
+
 def get_recommendation_rounds(
     dialogue: Dialogue, recommendation_intents: List[Intent]
 ) -> List[List[AnnotatedUtterance]]:
diff --git a/usersimcrs/evaluation/reward_per_dialogue_length_metric.py b/usersimcrs/evaluation/reward_per_dialogue_length_metric.py
index 463a5508..5d6eab26 100644
--- a/usersimcrs/evaluation/reward_per_dialogue_length_metric.py
+++ b/usersimcrs/evaluation/reward_per_dialogue_length_metric.py
@@ -11,18 +11,25 @@
 from dialoguekit.participant.participant import DialogueParticipant
 
 from usersimcrs.evaluation.base_metric import BaseMetric
-from usersimcrs.evaluation.dialogue_annotation import prepare_dialogue
+from usersimcrs.evaluation.dialogue_annotation import (
+    get_intent_lists,
+    prepare_dialogue,
+)
 
 
 class RewardPerDialogueLengthMetric(BaseMetric):
     def __init__(
         self,
-        user_nlu_config_path: str,
-        agent_nlu_config_path: str,
+        user_nlu_config_path: Optional[str] = None,
+        agent_nlu_config_path: Optional[str] = None,
         name: str = "reward_per_dialogue_length",
     ) -> None:
         """Initializes the reward-per-dialogue-length metric.
 
+        When NLU config paths are provided, dialogues are annotated
+        automatically. When omitted, dialogues must be pre-annotated
+        (e.g., via :func:`annotate_dialogues`).
+
         Args:
             user_nlu_config_path: Path to user NLU configuration.
             agent_nlu_config_path: Path to agent NLU configuration.
@@ -67,15 +74,24 @@ def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float:
         Returns:
             Ratio of accepted recommendations to total utterances.
         """
-        dlg, _, acc, _, self._user_nlu, self._agent_nlu = prepare_dialogue(
-            dialogue,
-            self._user_nlu_config_path,
-            self._agent_nlu_config_path,
-            self._user_nlu,
-            self._agent_nlu,
-            **kwargs,
-        )
-        nb_accepted_recommendations, dialogue_length = self._assess_dialogue(
-            dlg, acc
-        )
-        return nb_accepted_recommendations / dialogue_length
+        if self._user_nlu_config_path and self._agent_nlu_config_path:
+            (
+                dialogue,
+                _,
+                acc,
+                _,
+                self._user_nlu,
+                self._agent_nlu,
+            ) = prepare_dialogue(
+                dialogue,
+                self._user_nlu_config_path,
+                self._agent_nlu_config_path,
+                self._user_nlu,
+                self._agent_nlu,
+                **kwargs,
+            )
+        else:
+            _, acc, _ = get_intent_lists(**kwargs)
+
+        accepted, length = self._assess_dialogue(dialogue, acc)
+        return accepted / length if length else 0.0  # No utterances -> 0.0
diff --git a/usersimcrs/evaluation/success_rate_metric.py b/usersimcrs/evaluation/success_rate_metric.py
index 478a7afa..cc9c192c 100644
--- a/usersimcrs/evaluation/success_rate_metric.py
+++ b/usersimcrs/evaluation/success_rate_metric.py
@@ -11,6 +11,7 @@
 
 from usersimcrs.evaluation.base_metric import BaseMetric
 from usersimcrs.evaluation.dialogue_annotation import (
+    get_intent_lists,
     get_recommendation_rounds,
     is_recommendation_accepted,
     prepare_dialogue,
@@ -20,12 +21,16 @@
 class SuccessRateMetric(BaseMetric):
     def __init__(
         self,
-        user_nlu_config_path: str,
-        agent_nlu_config_path: str,
+        user_nlu_config_path: Optional[str] = None,
+        agent_nlu_config_path: Optional[str] = None,
         name: str = "success_rate",
     ) -> None:
         """Initializes the success rate metric.
 
+        When both NLU config paths are provided, dialogues are annotated
+        automatically. When either is omitted, dialogues must be pre-annotated
+        (e.g., via :func:`annotate_dialogues`).
+
         Args:
             user_nlu_config_path: Path to user NLU configuration.
             agent_nlu_config_path: Path to agent NLU configuration.
@@ -43,8 +48,8 @@ def _assess_dialogue(
         recommendation_intents: List[Intent],
         acceptance_intents: List[Intent],
         rejection_intents: List[Intent],
-    ) -> int:
-        """Returns number of successful recommendation rounds.
+    ) -> bool:
+        """Checks whether at least one recommendation round was accepted.
 
         Args:
             dialogue: Annotated dialogue.
@@ -53,15 +58,14 @@ def _assess_dialogue(
             rejection_intents: Intents that signal rejection.
 
         Returns:
-            Number of recommendation rounds that were accepted.
+            True if at least one round was accepted, False otherwise.
         """
         rounds = get_recommendation_rounds(dialogue, recommendation_intents)
-        return sum(
-            1
-            for round_utterances in rounds
-            if is_recommendation_accepted(
+        return any(
+            is_recommendation_accepted(
                 round_utterances, acceptance_intents, rejection_intents
             )
+            for round_utterances in rounds
         )
 
     def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float:
@@ -74,13 +78,23 @@ def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float:
         Returns:
             1.0 if at least one recommendation was accepted, 0.0 otherwise.
         """
-        dlg, rec, acc, rej, self._user_nlu, self._agent_nlu = prepare_dialogue(
-            dialogue,
-            self._user_nlu_config_path,
-            self._agent_nlu_config_path,
-            self._user_nlu,
-            self._agent_nlu,
-            **kwargs,
-        )
-        successful_rounds = self._assess_dialogue(dlg, rec, acc, rej)
-        return float(successful_rounds > 0)
+        if self._user_nlu_config_path and self._agent_nlu_config_path:
+            (
+                dialogue,
+                rec,
+                acc,
+                rej,
+                self._user_nlu,
+                self._agent_nlu,
+            ) = prepare_dialogue(
+                dialogue,
+                self._user_nlu_config_path,
+                self._agent_nlu_config_path,
+                self._user_nlu,
+                self._agent_nlu,
+                **kwargs,
+            )
+        else:
+            rec, acc, rej = get_intent_lists(**kwargs)
+
+        return float(self._assess_dialogue(dialogue, rec, acc, rej))
diff --git a/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py b/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py
index 9d2156c0..00fa5c0b 100644
--- a/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py
+++ b/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py
@@ -12,6 +12,7 @@
 
 from usersimcrs.evaluation.base_metric import BaseMetric
 from usersimcrs.evaluation.dialogue_annotation import (
+    get_intent_lists,
     get_recommendation_rounds,
     is_recommendation_accepted,
     prepare_dialogue,
@@ -21,12 +22,16 @@
 class SuccessfulRecommendationRoundRatioMetric(BaseMetric):
     def __init__(
         self,
-        user_nlu_config_path: str,
-        agent_nlu_config_path: str,
+        user_nlu_config_path: Optional[str] = None,
+        agent_nlu_config_path: Optional[str] = None,
         name: str = "successful_recommendation_round_ratio",
     ) -> None:
         """Initializes the successful recommendation round ratio metric.
 
+        When both NLU config paths are provided, dialogues are annotated
+        automatically. When either is omitted, dialogues must be pre-annotated
+        (e.g., via :func:`annotate_dialogues`).
+
         Args:
             user_nlu_config_path: Path to user NLU configuration.
             agent_nlu_config_path: Path to agent NLU configuration.
@@ -77,15 +82,26 @@ def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float:
             Ratio of accepted recommendation rounds to total rounds,
             or 0.0 if there are no recommendation rounds.
         """
-        dlg, rec, acc, rej, self._user_nlu, self._agent_nlu = prepare_dialogue(
-            dialogue,
-            self._user_nlu_config_path,
-            self._agent_nlu_config_path,
-            self._user_nlu,
-            self._agent_nlu,
-            **kwargs,
-        )
+        if self._user_nlu_config_path and self._agent_nlu_config_path:
+            (
+                dialogue,
+                rec,
+                acc,
+                rej,
+                self._user_nlu,
+                self._agent_nlu,
+            ) = prepare_dialogue(
+                dialogue,
+                self._user_nlu_config_path,
+                self._agent_nlu_config_path,
+                self._user_nlu,
+                self._agent_nlu,
+                **kwargs,
+            )
+        else:
+            rec, acc, rej = get_intent_lists(**kwargs)
+
         successful_rounds, total_rounds = self._assess_dialogue(
-            dlg, rec, acc, rej
+            dialogue, rec, acc, rej
         )
         return successful_rounds / total_rounds if total_rounds > 0 else 0.0

From 9146421211be4931af36588230e55c38c644051d Mon Sep 17 00:00:00 2001
From: Ksenia Blokhina <kseniablokhina@MacBook-Pro-Ksenia.local>
Date: Tue, 10 Mar 2026 15:57:18 +0100
Subject: [PATCH 24/38] remove class from other pr

---
 usersimcrs/evaluation/__init__.py    |  5 ---
 usersimcrs/evaluation/base_metric.py | 48 ----------------------------
 2 files changed, 53 deletions(-)
 delete mode 100644 usersimcrs/evaluation/__init__.py
 delete mode 100644 usersimcrs/evaluation/base_metric.py

diff --git a/usersimcrs/evaluation/__init__.py b/usersimcrs/evaluation/__init__.py
deleted file mode 100644
index c55a4339..00000000
--- a/usersimcrs/evaluation/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-"""Evaluation metrics for dialogue systems."""
-
-from usersimcrs.evaluation.base_metric import BaseMetric
-
-__all__ = ["BaseMetric"]
diff --git a/usersimcrs/evaluation/base_metric.py b/usersimcrs/evaluation/base_metric.py
deleted file mode 100644
index c99399a2..00000000
--- a/usersimcrs/evaluation/base_metric.py
+++ /dev/null
@@ -1,48 +0,0 @@
-"""Abstract base class for dialogue evaluation metrics."""
-
-from abc import ABC, abstractmethod
-from typing import Any, Dict, List
-from dialoguekit.core.dialogue import Dialogue
-
-
-class BaseMetric(ABC):
-    def __init__(self, name: str) -> None:
-        """Initializes the metric.
-
-        Args:
-            name: Metric name.
-        """
-        self.name = name
-
-    @abstractmethod
-    def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float:
-        """Computes the metric for a single dialogue.
-
-        Args:
-            dialogue: Single dialogue to score.
-            **kwargs: Additional arguments specific to the metric.
-
-        Raises:
-            NotImplementedError: When not implemented by a subclass.
-
-        Returns:
-            Score for the dialogue.
-        """
-        raise NotImplementedError()
-
-    def evaluate_dialogues(
-        self, dialogues: List[Dialogue], **kwargs: Any
-    ) -> Dict[str, float]:
-        """Computes the metric for every dialogue in a given list.
-
-        Args:
-            dialogues: Dialogues.
-            **kwargs: Additional arguments specific to the metric.
-
-        Returns:
-            Dictionary with result per dialogue. Keys are conversation IDs.
-        """
-        return {
-            dialogue.conversation_id: self.evaluate_dialogue(dialogue, **kwargs)
-            for dialogue in dialogues
-        }

From fe8ea9c30343a29c55558df6d62c13619b2bd288 Mon Sep 17 00:00:00 2001
From: Ksenia Blokhina <kseniablokhina@MacBook-Pro-Ksenia.local>
Date: Tue, 10 Mar 2026 16:23:33 +0100
Subject: [PATCH 25/38] changes after new structure

---
 usersimcrs/evaluation/main.py | 58 +++++++++++++++++------------------
 1 file changed, 28 insertions(+), 30 deletions(-)

diff --git a/usersimcrs/evaluation/main.py b/usersimcrs/evaluation/main.py
index 3696b1f9..fa1cf048 100644
--- a/usersimcrs/evaluation/main.py
+++ b/usersimcrs/evaluation/main.py
@@ -5,7 +5,7 @@
 import os
 from collections import defaultdict
 from statistics import mean, stdev
-from typing import Dict, List, Mapping, Sequence
+from typing import Any, Dict, List, Mapping, Sequence
 
 from dialoguekit.core.dialogue import Dialogue
 from dialoguekit.nlu.models.satisfaction_classifier import (
@@ -14,16 +14,25 @@
 from dialoguekit.utils.dialogue_reader import json_to_dialogues
 
 from usersimcrs.evaluation.base_metric import BaseMetric
+from usersimcrs.evaluation.dialogue_annotation import annotate_dialogues
 from usersimcrs.evaluation.quality_metric import QualityMetric
 from usersimcrs.evaluation.quality_rubrics import QualityRubrics
 from usersimcrs.evaluation.satisfaction_metric import SatisfactionMetric
-from usersimcrs.evaluation.utility_metric import (
+from usersimcrs.evaluation.reward_per_dialogue_length_metric import (
     RewardPerDialogueLengthMetric,
-    SuccessRateMetric,
+)
+from usersimcrs.evaluation.success_rate_metric import SuccessRateMetric
+from usersimcrs.evaluation.successful_recommendation_round_ratio_metric import (
     SuccessfulRecommendationRoundRatioMetric,
 )
 from usersimcrs.llm_interfaces.ollama_interface import OllamaLLMInterface
 
+UTILITY_METRICS = {
+    "success_rate",
+    "successful_recommendation_round_ratio",
+    "reward_per_dialogue_length",
+}
+
 SUPPORTED_METRICS = [
     "quality",
     "satisfaction",
@@ -115,27 +124,23 @@ def _validate_args(args: argparse.Namespace) -> None:
             "The --ollama_config argument is required when using quality."
         )
 
-    utility_metrics = {
-        "success_rate",
-        "successful_recommendation_round_ratio",
-        "reward_per_dialogue_length",
-    }
-    if utility_metrics.intersection(set(args.metrics)):
+    if UTILITY_METRICS.intersection(set(args.metrics)):
         if not args.user_nlu_config or not args.agent_nlu_config:
             raise ValueError(
                 "Both --user_nlu_config and --agent_nlu_config are required "
                 "for utility metrics."
             )
 
+    supported_aspect_names = [aspect.name for aspect in QualityRubrics]
     invalid_aspects = [
         aspect
         for aspect in args.quality_aspects
-        if aspect not in [enum_aspect.name for enum_aspect in QualityRubrics]
+        if aspect not in supported_aspect_names
     ]
     if invalid_aspects:
         raise ValueError(
             f"Unknown quality aspect(s): {invalid_aspects}. "
-            f"Supported aspects: {[aspect.name for aspect in QualityRubrics]}"
+            f"Supported aspects: {supported_aspect_names}"
         )
 
 
@@ -153,22 +158,13 @@ def _build_metric_registry(args: argparse.Namespace) -> Dict[str, BaseMetric]:
             classifier=SatisfactionClassifierSVM()
         )
     if "success_rate" in args.metrics:
-        registry["success_rate"] = SuccessRateMetric(
-            user_nlu_config_path=args.user_nlu_config,
-            agent_nlu_config_path=args.agent_nlu_config,
-        )
+        registry["success_rate"] = SuccessRateMetric()
     if "successful_recommendation_round_ratio" in args.metrics:
         registry[
             "successful_recommendation_round_ratio"
-        ] = SuccessfulRecommendationRoundRatioMetric(
-            user_nlu_config_path=args.user_nlu_config,
-            agent_nlu_config_path=args.agent_nlu_config,
-        )
+        ] = SuccessfulRecommendationRoundRatioMetric()
     if "reward_per_dialogue_length" in args.metrics:
-        registry["reward_per_dialogue_length"] = RewardPerDialogueLengthMetric(
-            user_nlu_config_path=args.user_nlu_config,
-            agent_nlu_config_path=args.agent_nlu_config,
-        )
+        registry["reward_per_dialogue_length"] = RewardPerDialogueLengthMetric()
     return registry
 
 
@@ -204,7 +200,7 @@ def _evaluate_metric(
 ) -> Dict[str, object]:
     """Runs one metric and returns per-dialogue scores and summary."""
     if metric_name == "quality":
-        per_aspect: Dict[str, Dict[str, Dict[str, float]]] = {}
+        per_aspect: Dict[str, Dict[str, Any]] = {}
         for aspect in args.quality_aspects:
             per_dialogue = metric.evaluate_dialogues(
                 list(dialogues),
@@ -219,11 +215,7 @@ def _evaluate_metric(
         return {"aspects": per_aspect}
 
     eval_kwargs = {}
-    if metric_name in {
-        "success_rate",
-        "successful_recommendation_round_ratio",
-        "reward_per_dialogue_length",
-    }:
+    if metric_name in UTILITY_METRICS:
         eval_kwargs = {
             "recommendation_intent_labels": args.recommendation_intent_labels,
             "acceptance_intent_labels": args.accept_intent_labels,
@@ -271,9 +263,15 @@ def main() -> None:
     _validate_args(args)
 
     dialogues = json_to_dialogues(args.dialogues)
+
+    if UTILITY_METRICS.intersection(set(args.metrics)):
+        annotate_dialogues(
+            dialogues, args.user_nlu_config, args.agent_nlu_config
+        )
+
     metric_registry = _build_metric_registry(args)
 
-    results: Dict[str, object] = {
+    results: Dict[str, Any] = {
         "dialogues_path": args.dialogues,
         "metrics_requested": args.metrics,
         "metrics": {},

From d749156c0a225b039452792698f47ef53d6e475a Mon Sep 17 00:00:00 2001
From: Ksenia Blokhina <kseniablokhina@MacBook-Pro-Ksenia.local>
Date: Tue, 10 Mar 2026 16:25:07 +0100
Subject: [PATCH 26/38] remove main

---
 usersimcrs/evaluation/main.py | 303 ----------------------------------
 1 file changed, 303 deletions(-)
 delete mode 100644 usersimcrs/evaluation/main.py

diff --git a/usersimcrs/evaluation/main.py b/usersimcrs/evaluation/main.py
deleted file mode 100644
index 3372093e..00000000
--- a/usersimcrs/evaluation/main.py
+++ /dev/null
@@ -1,303 +0,0 @@
-"""Unified script for evaluating dialogues with selected metrics."""
-
-import argparse
-import json
-import os
-from collections import defaultdict
-from statistics import mean, stdev
-from typing import Dict, List, Mapping, Sequence
-
-from dialoguekit.core.dialogue import Dialogue
-from dialoguekit.nlu.models.satisfaction_classifier import (
-    SatisfactionClassifierSVM,
-)
-from dialoguekit.utils.dialogue_reader import json_to_dialogues
-
-from usersimcrs.evaluation.base_metric import BaseMetric
-from usersimcrs.evaluation.quality_metric import QualityMetric
-from usersimcrs.evaluation.quality_rubrics import QualityRubrics
-from usersimcrs.evaluation.satisfaction_metric import SatisfactionMetric
-from usersimcrs.evaluation.reward_per_dialogue_length_metric import (
-    RewardPerDialogueLengthMetric,
-)
-from usersimcrs.evaluation.success_rate_metric import SuccessRateMetric
-from usersimcrs.evaluation.successful_recommendation_round_ratio_metric import (
-    SuccessfulRecommendationRoundRatioMetric,
-)
-from usersimcrs.llm_interfaces.ollama_interface import OllamaLLMInterface
-
-SUPPORTED_METRICS = [
-    "quality",
-    "satisfaction",
-    "success_rate",
-    "successful_recommendation_round_ratio",
-    "reward_per_dialogue_length",
-]
-
-
-def parse_args() -> argparse.Namespace:
-    """Parses command-line arguments."""
-    parser = argparse.ArgumentParser(prog="usersimcrs.evaluation.main")
-    parser.add_argument(
-        "--dialogues",
-        type=str,
-        required=True,
-        help="Path to the dialogues JSON file.",
-    )
-    parser.add_argument(
-        "--metrics",
-        nargs="+",
-        required=True,
-        choices=SUPPORTED_METRICS,
-        help="List of metrics to compute.",
-    )
-    parser.add_argument(
-        "--output",
-        type=str,
-        required=True,
-        help="Path to save evaluation results as JSON.",
-    )
-    parser.add_argument(
-        "--ollama_config",
-        type=str,
-        help="Path to Ollama config file (required when quality is selected).",
-    )
-    parser.add_argument(
-        "--quality_aspects",
-        nargs="+",
-        default=[aspect.name for aspect in QualityRubrics],
-        help=(
-            "Quality aspects to evaluate. "
-            "Defaults to all aspects in QualityRubrics."
-        ),
-    )
-    parser.add_argument(
-        "--user_nlu_config",
-        type=str,
-        help=(
-            "Path to user NLU config (required for utility metrics: "
-            "success_rate, successful_recommendation_round_ratio, "
-            "reward_per_dialogue_length)."
-        ),
-    )
-    parser.add_argument(
-        "--agent_nlu_config",
-        type=str,
-        help=(
-            "Path to agent NLU config (required for utility metrics: "
-            "success_rate, successful_recommendation_round_ratio, "
-            "reward_per_dialogue_length)."
-        ),
-    )
-    parser.add_argument(
-        "--reject_intent_labels",
-        nargs="+",
-        default=["REJ"],
-        help="Intent labels corresponding to rejection.",
-    )
-    parser.add_argument(
-        "--accept_intent_labels",
-        nargs="+",
-        default=["ACC"],
-        help="Intent labels corresponding to acceptance.",
-    )
-    parser.add_argument(
-        "--recommendation_intent_labels",
-        nargs="+",
-        default=["REC-S", "REC-E"],
-        help="Intent labels corresponding to recommendation.",
-    )
-    return parser.parse_args()
-
-
-def _validate_args(args: argparse.Namespace) -> None:
-    """Validates metric-specific CLI requirements."""
-    if "quality" in args.metrics and not args.ollama_config:
-        raise ValueError(
-            "The --ollama_config argument is required when using quality."
-        )
-
-    utility_metrics = {
-        "success_rate",
-        "successful_recommendation_round_ratio",
-        "reward_per_dialogue_length",
-    }
-    if utility_metrics.intersection(set(args.metrics)):
-        if not args.user_nlu_config or not args.agent_nlu_config:
-            raise ValueError(
-                "Both --user_nlu_config and --agent_nlu_config are required "
-                "for utility metrics."
-            )
-
-    invalid_aspects = [
-        aspect
-        for aspect in args.quality_aspects
-        if aspect not in [enum_aspect.name for enum_aspect in QualityRubrics]
-    ]
-    if invalid_aspects:
-        raise ValueError(
-            f"Unknown quality aspect(s): {invalid_aspects}. "
-            f"Supported aspects: {[aspect.name for aspect in QualityRubrics]}"
-        )
-
-
-def _build_metric_registry(args: argparse.Namespace) -> Dict[str, BaseMetric]:
-    """Builds metric instances keyed by metric name."""
-    registry: Dict[str, BaseMetric] = {}
-    if "quality" in args.metrics:
-        llm_interface = OllamaLLMInterface(
-            configuration_path=args.ollama_config,
-            default_response="",
-        )
-        registry["quality"] = QualityMetric(llm_interface=llm_interface)
-    if "satisfaction" in args.metrics:
-        registry["satisfaction"] = SatisfactionMetric(
-            classifier=SatisfactionClassifierSVM()
-        )
-    if "success_rate" in args.metrics:
-        registry["success_rate"] = SuccessRateMetric(
-            user_nlu_config_path=args.user_nlu_config,
-            agent_nlu_config_path=args.agent_nlu_config,
-        )
-    if "successful_recommendation_round_ratio" in args.metrics:
-        registry[
-            "successful_recommendation_round_ratio"
-        ] = SuccessfulRecommendationRoundRatioMetric(
-            user_nlu_config_path=args.user_nlu_config,
-            agent_nlu_config_path=args.agent_nlu_config,
-        )
-    if "reward_per_dialogue_length" in args.metrics:
-        registry["reward_per_dialogue_length"] = RewardPerDialogueLengthMetric(
-            user_nlu_config_path=args.user_nlu_config,
-            agent_nlu_config_path=args.agent_nlu_config,
-        )
-    return registry
-
-
-def _summarize_by_agent(
-    dialogues: Sequence[Dialogue], scores: Mapping[str, float]
-) -> Dict[str, Dict[str, float]]:
-    """Returns aggregate statistics by agent."""
-    conversation_to_agent = {
-        dialogue.conversation_id: dialogue.agent_id for dialogue in dialogues
-    }
-    grouped_scores: Dict[str, List[float]] = defaultdict(list)
-    for conversation_id, score in scores.items():
-        agent_id = conversation_to_agent.get(conversation_id, "unknown")
-        grouped_scores[agent_id].append(score)
-
-    summary: Dict[str, Dict[str, float]] = {}
-    for agent_id, agent_scores in grouped_scores.items():
-        summary[agent_id] = {
-            "count": float(len(agent_scores)),
-            "min": min(agent_scores),
-            "max": max(agent_scores),
-            "mean": mean(agent_scores),
-            "stdev": stdev(agent_scores) if len(agent_scores) > 1 else 0.0,
-        }
-    return summary
-
-
-def _evaluate_metric(
-    metric_name: str,
-    metric: BaseMetric,
-    dialogues: Sequence[Dialogue],
-    args: argparse.Namespace,
-) -> Dict[str, object]:
-    """Runs one metric and returns per-dialogue scores and summary."""
-    if metric_name == "quality":
-        per_aspect: Dict[str, Dict[str, Dict[str, float]]] = {}
-        for aspect in args.quality_aspects:
-            per_dialogue = metric.evaluate_dialogues(
-                list(dialogues),
-                aspect=aspect,
-            )
-            per_aspect[aspect] = {
-                "per_dialogue": per_dialogue,
-                "summary_by_agent": _summarize_by_agent(
-                    dialogues, per_dialogue
-                ),
-            }
-        return {"aspects": per_aspect}
-
-    eval_kwargs = {}
-    if metric_name in {
-        "success_rate",
-        "successful_recommendation_round_ratio",
-        "reward_per_dialogue_length",
-    }:
-        eval_kwargs = {
-            "recommendation_intent_labels": args.recommendation_intent_labels,
-            "acceptance_intent_labels": args.accept_intent_labels,
-            "rejection_intent_labels": args.reject_intent_labels,
-        }
-
-    per_dialogue_scores = metric.evaluate_dialogues(
-        list(dialogues), **eval_kwargs
-    )
-    return {
-        "per_dialogue": per_dialogue_scores,
-        "summary_by_agent": _summarize_by_agent(dialogues, per_dialogue_scores),
-    }
-
-
-def _print_brief_summary(results: Mapping[str, object]) -> None:
-    """Prints a concise summary in the terminal."""
-    metric_results = results.get("metrics", {})
-    if not isinstance(metric_results, dict):
-        return
-    for metric_name, metric_result in metric_results.items():
-        print(f"Metric: {metric_name}")
-        if metric_name == "quality":
-            aspects = metric_result.get("aspects", {})
-            for aspect_name, aspect_result in aspects.items():
-                print(f"  Aspect: {aspect_name}")
-                for agent_id, stats in aspect_result[
-                    "summary_by_agent"
-                ].items():
-                    print(
-                        f"    Agent: {agent_id} | mean={stats['mean']:.3f} "
-                        f"stdev={stats['stdev']:.3f}"
-                    )
-            continue
-
-        for agent_id, stats in metric_result["summary_by_agent"].items():
-            print(
-                f"  Agent: {agent_id} | mean={stats['mean']:.3f} "
-                f"stdev={stats['stdev']:.3f}"
-            )
-
-
-def main() -> None:
-    args = parse_args()
-    _validate_args(args)
-
-    dialogues = json_to_dialogues(args.dialogues)
-    metric_registry = _build_metric_registry(args)
-
-    results: Dict[str, object] = {
-        "dialogues_path": args.dialogues,
-        "metrics_requested": args.metrics,
-        "metrics": {},
-    }
-
-    for metric_name in args.metrics:
-        metric = metric_registry[metric_name]
-        results["metrics"][metric_name] = _evaluate_metric(
-            metric_name,
-            metric,
-            dialogues,
-            args,
-        )
-
-    output_dir = os.path.dirname(args.output)
-    if output_dir:
-        os.makedirs(output_dir, exist_ok=True)
-    with open(args.output, "w") as f:
-        json.dump(results, f, indent=2)
-
-    _print_brief_summary(results)
-
-
-if __name__ == "__main__":
-    main()

From 8f333e74d01184f83c3999da985d6a82248bc415 Mon Sep 17 00:00:00 2001
From: Ksenia Blokhina <kseniablokhina@MacBook-Pro-Ksenia.local>
Date: Tue, 10 Mar 2026 17:10:42 +0100
Subject: [PATCH 27/38] split load_nlus into load_nlu, document intent kwargs

---
 usersimcrs/evaluation/dialogue_annotation.py  | 57 +++++++++----------
 usersimcrs/evaluation/quality_metric.py       |  2 +-
 .../reward_per_dialogue_length_metric.py      | 14 ++++-
 usersimcrs/evaluation/success_rate_metric.py  | 19 ++++++-
 ...ssful_recommendation_round_ratio_metric.py | 15 ++++-
 usersimcrs/evaluation/utility_base_metric.py  |  5 +-
 6 files changed, 68 insertions(+), 44 deletions(-)

diff --git a/usersimcrs/evaluation/dialogue_annotation.py b/usersimcrs/evaluation/dialogue_annotation.py
index f0d75fea..31c06018 100644
--- a/usersimcrs/evaluation/dialogue_annotation.py
+++ b/usersimcrs/evaluation/dialogue_annotation.py
@@ -24,7 +24,7 @@ def annotate_dialogue(
     """Annotates utterances with dialogue acts.
 
     Each utterance that is not already an AnnotatedUtterance is converted to
-    one.  Utterances that already carry dialogue acts are left untouched.
+    one. Utterances that already carry dialogue acts are left untouched.
 
     Args:
         dialogue: Dialogue to be annotated.
@@ -59,35 +59,29 @@ def annotate_dialogue(
     return dialogue
 
 
-def load_nlus(
-    user_nlu_config_path: str,
-    agent_nlu_config_path: str,
-    cached_user_nlu: Optional[NLU] = None,
-    cached_agent_nlu: Optional[NLU] = None,
-) -> Tuple[NLU, NLU]:
-    """Loads user and agent NLU modules.
+def load_nlu(
+    nlu_config_path: str,
+    config_name: str = "NLU Configuration",
+    cached_nlu: Optional[NLU] = None,
+) -> NLU:
+    """Loads a single NLU module.
 
-    Returns cached instances when provided, otherwise creates new ones
-    from the given configuration files.
+    Returns the cached instance when provided, otherwise creates a new one
+    from the given configuration file.
 
     Args:
-        user_nlu_config_path: Path to user NLU configuration file.
-        agent_nlu_config_path: Path to agent NLU configuration file.
-        cached_user_nlu: Previously loaded user NLU module.
-        cached_agent_nlu: Previously loaded agent NLU module.
+        nlu_config_path: Path to the NLU configuration file.
+        config_name: Name for the Configuration instance.
+        cached_nlu: Previously loaded NLU module.
 
     Returns:
-        Tuple of (user_nlu, agent_nlu) modules.
+        NLU module.
     """
-    if cached_user_nlu is None:
-        user_nlu_config = Configuration("User NLU Configuration")
-        user_nlu_config.set_file(user_nlu_config_path)
-        cached_user_nlu = get_NLU(user_nlu_config)
-    if cached_agent_nlu is None:
-        agent_nlu_config = Configuration("Agent NLU Configuration")
-        agent_nlu_config.set_file(agent_nlu_config_path)
-        cached_agent_nlu = get_NLU(agent_nlu_config)
-    return cached_user_nlu, cached_agent_nlu
+    if cached_nlu is not None:
+        return cached_nlu
+    nlu_config = Configuration(config_name)
+    nlu_config.set_file(nlu_config_path)
+    return get_NLU(nlu_config)
 
 
 def get_intent_lists(
@@ -130,7 +124,8 @@ def annotate_dialogues(
         user_nlu_config_path: Path to user NLU configuration file.
         agent_nlu_config_path: Path to agent NLU configuration file.
     """
-    user_nlu, agent_nlu = load_nlus(user_nlu_config_path, agent_nlu_config_path)
+    user_nlu = load_nlu(user_nlu_config_path, "User NLU Configuration")
+    agent_nlu = load_nlu(agent_nlu_config_path, "Agent NLU Configuration")
     for dialogue in dialogues:
         annotate_dialogue(dialogue, user_nlu, agent_nlu)
 
@@ -175,7 +170,7 @@ def prepare_dialogue(
 ) -> Tuple[Dialogue, List[Intent], List[Intent], List[Intent], NLU, NLU]:
     """Loads NLU modules, annotates a dialogue, and builds intent lists.
 
-    Combines :func:`load_nlus`, :func:`annotate_dialogue`, and
+    Combines :func:`load_nlu`, :func:`annotate_dialogue`, and
     :func:`get_intent_lists` into a single convenience call.
 
     Args:
@@ -191,11 +186,11 @@ def prepare_dialogue(
         Tuple of (annotated dialogue, recommendation intents,
         acceptance intents, rejection intents, user NLU, agent NLU).
     """
-    user_nlu, agent_nlu = load_nlus(
-        user_nlu_config_path,
-        agent_nlu_config_path,
-        cached_user_nlu,
-        cached_agent_nlu,
+    user_nlu = load_nlu(
+        user_nlu_config_path, "User NLU Configuration", cached_user_nlu
+    )
+    agent_nlu = load_nlu(
+        agent_nlu_config_path, "Agent NLU Configuration", cached_agent_nlu
     )
     annotate_dialogue(dialogue, user_nlu, agent_nlu)
     rec, acc, rej = get_intent_lists(**kwargs)
diff --git a/usersimcrs/evaluation/quality_metric.py b/usersimcrs/evaluation/quality_metric.py
index 3198c77b..acbfc595 100644
--- a/usersimcrs/evaluation/quality_metric.py
+++ b/usersimcrs/evaluation/quality_metric.py
@@ -46,7 +46,7 @@ def __init__(
 
         Args:
             llm_interface: LLM interface used for scoring.
-            name: Metric name.
+            name: Metric name. Defaults to "quality".
         """
         super().__init__(name)
         self.llm_interface = llm_interface
diff --git a/usersimcrs/evaluation/reward_per_dialogue_length_metric.py b/usersimcrs/evaluation/reward_per_dialogue_length_metric.py
index a630b7cd..1727cc32 100644
--- a/usersimcrs/evaluation/reward_per_dialogue_length_metric.py
+++ b/usersimcrs/evaluation/reward_per_dialogue_length_metric.py
@@ -60,11 +60,19 @@ def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float:
 
         Args:
             dialogue: Dialogue to evaluate.
-            **kwargs: Optional intent label overrides.
+            **kwargs: Optional intent label overrides:
+                - recommendation_intent_labels: Labels for recommendation
+                  intents. Defaults to ["REC-S", "REC-E"].
+                - acceptance_intent_labels: Labels for acceptance intents.
+                  Defaults to ["ACC"].
+                - rejection_intent_labels: Labels for rejection intents.
+                  Defaults to ["REJ"].
 
         Returns:
             Ratio of accepted recommendations to total utterances.
         """
-        _, acc, _ = self._resolve_intents(dialogue, **kwargs)
-        nb_accepted, dialogue_length = self._assess_dialogue(dialogue, acc)
+        _, acc, _ = self._resolve_intents(dialogue=dialogue, **kwargs)
+        nb_accepted, dialogue_length = self._assess_dialogue(
+            dialogue=dialogue, acceptance_intents=acc
+        )
         return nb_accepted / dialogue_length
diff --git a/usersimcrs/evaluation/success_rate_metric.py b/usersimcrs/evaluation/success_rate_metric.py
index c4735111..e0bb9566 100644
--- a/usersimcrs/evaluation/success_rate_metric.py
+++ b/usersimcrs/evaluation/success_rate_metric.py
@@ -66,10 +66,23 @@ def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float:
 
         Args:
             dialogue: Dialogue to evaluate.
-            **kwargs: Optional intent label overrides.
+            **kwargs: Optional intent label overrides:
+                - recommendation_intent_labels: Labels for recommendation
+                  intents. Defaults to ["REC-S", "REC-E"].
+                - acceptance_intent_labels: Labels for acceptance intents.
+                  Defaults to ["ACC"].
+                - rejection_intent_labels: Labels for rejection intents.
+                  Defaults to ["REJ"].
 
         Returns:
             1.0 if at least one recommendation was accepted, 0.0 otherwise.
         """
-        rec, acc, rej = self._resolve_intents(dialogue, **kwargs)
-        return float(self._assess_dialogue(dialogue, rec, acc, rej))
+        rec, acc, rej = self._resolve_intents(dialogue=dialogue, **kwargs)
+        return float(
+            self._assess_dialogue(
+                dialogue=dialogue,
+                recommendation_intents=rec,
+                acceptance_intents=acc,
+                rejection_intents=rej,
+            )
+        )
diff --git a/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py b/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py
index dbd6c9f6..8f79a16b 100644
--- a/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py
+++ b/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py
@@ -69,14 +69,23 @@ def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float:
 
         Args:
             dialogue: Dialogue to evaluate.
-            **kwargs: Optional intent label overrides.
+            **kwargs: Optional intent label overrides:
+                - recommendation_intent_labels: Labels for recommendation
+                  intents. Defaults to ``["REC-S", "REC-E"]``.
+                - acceptance_intent_labels: Labels for acceptance intents.
+                  Defaults to ``["ACC"]``.
+                - rejection_intent_labels: Labels for rejection intents.
+                  Defaults to ``["REJ"]``.
 
         Returns:
             Ratio of accepted recommendation rounds to total rounds,
             or 0.0 if there are no recommendation rounds.
         """
-        rec, acc, rej = self._resolve_intents(dialogue, **kwargs)
+        rec, acc, rej = self._resolve_intents(dialogue=dialogue, **kwargs)
         successful_rounds, total_rounds = self._assess_dialogue(
-            dialogue, rec, acc, rej
+            dialogue=dialogue,
+            recommendation_intents=rec,
+            acceptance_intents=acc,
+            rejection_intents=rej,
         )
         return successful_rounds / total_rounds if total_rounds > 0 else 0.0
diff --git a/usersimcrs/evaluation/utility_base_metric.py b/usersimcrs/evaluation/utility_base_metric.py
index f4ca09e2..17618b65 100644
--- a/usersimcrs/evaluation/utility_base_metric.py
+++ b/usersimcrs/evaluation/utility_base_metric.py
@@ -1,4 +1,4 @@
-"""Base class for utility metrics that require NLU annotation."""
+"""Base class for dialogue annotation support."""
 
 from abc import ABC
 from typing import Any, List, Optional, Tuple
@@ -18,8 +18,7 @@ class UtilityBaseMetric(BaseMetric, ABC):
     """Shared base for metrics that optionally annotate dialogues via NLU.
 
     When NLU config paths are provided, dialogues are annotated automatically.
-    When omitted, dialogues must be pre-annotated (e.g., via
-    :func:`annotate_dialogues`).
+    When omitted, dialogues must be pre-annotated.
     """
 
     def __init__(

From 6e6e8b2b197b88b63be17efddf344383ea01a96d Mon Sep 17 00:00:00 2001
From: Ksenia Blokhina <kseniablokhina@MacBook-Pro-Ksenia.local>
Date: Tue, 10 Mar 2026 17:23:35 +0100
Subject: [PATCH 28/38] add tests for quality and satisfaction metrics

---
 tests/evaluation/__init__.py                 |   0
 tests/evaluation/test_quality_metric.py      | 115 +++++++++++++++++++
 tests/evaluation/test_satisfaction_metric.py |  80 +++++++++++++
 3 files changed, 195 insertions(+)
 create mode 100644 tests/evaluation/__init__.py
 create mode 100644 tests/evaluation/test_quality_metric.py
 create mode 100644 tests/evaluation/test_satisfaction_metric.py

diff --git a/tests/evaluation/__init__.py b/tests/evaluation/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/evaluation/test_quality_metric.py b/tests/evaluation/test_quality_metric.py
new file mode 100644
index 00000000..3fdf40ec
--- /dev/null
+++ b/tests/evaluation/test_quality_metric.py
@@ -0,0 +1,115 @@
+"""Tests for QualityMetric."""
+
+from unittest.mock import MagicMock
+
+import pytest
+
+from dialoguekit.utils.dialogue_reader import json_to_dialogues
+
+from usersimcrs.evaluation.quality_metric import QualityMetric
+from usersimcrs.evaluation.quality_rubrics import QualityRubrics
+from usersimcrs.llm_interfaces.llm_interface import LLMInterface
+
+
+@pytest.fixture
+def dialogues():
+    """Load test dialogues."""
+    return json_to_dialogues(
+        "tests/data/annotated_dialogues.json",
+        agent_ids=["Agent"],
+        user_ids=["User"],
+    )
+
+
+@pytest.fixture
+def mock_llm_interface():
+    """Mock LLM interface."""
+    return MagicMock(spec=LLMInterface)
+
+
+@pytest.fixture
+def quality_metric(mock_llm_interface):
+    """QualityMetric instance with mocked LLM."""
+    return QualityMetric(llm_interface=mock_llm_interface)
+
+
+def test_quality_metric_init(mock_llm_interface) -> None:
+    """Test QualityMetric initializes with correct name and LLM."""
+    metric = QualityMetric(llm_interface=mock_llm_interface)
+    assert metric.name == "quality"
+    assert metric.llm_interface is mock_llm_interface
+
+
+def test_quality_metric_custom_name(mock_llm_interface) -> None:
+    """Test QualityMetric accepts custom name."""
+    metric = QualityMetric(llm_interface=mock_llm_interface, name="custom")
+    assert metric.name == "custom"
+
+
+def test_get_prompt(quality_metric, dialogues) -> None:
+    """Test _get_prompt builds prompt with dialogue and rubric."""
+    dialogue = dialogues[0]
+    prompt = quality_metric._get_prompt(QualityRubrics.FLUENCY, dialogue)
+
+    assert "CONVERSATION HISTORY" in prompt
+    assert "USER: Utterance 1" in prompt
+    assert "ASSISTANT: Utterance 2" in prompt
+    assert "GRADING RUBRIC" in prompt
+    assert "Fluency" in prompt
+    assert '{"score"' in prompt
+
+
+def test_get_prompt_all_aspects(quality_metric, dialogues) -> None:
+    """Test _get_prompt works for every quality aspect."""
+    dialogue = dialogues[0]
+    for aspect in QualityRubrics:
+        prompt = quality_metric._get_prompt(aspect, dialogue)
+        assert aspect.value in prompt
+
+
+def test_evaluate_dialogue_valid_response(quality_metric, dialogues) -> None:
+    """Test evaluate_dialogue parses a valid LLM JSON response."""
+    quality_metric.llm_interface.get_llm_api_response.return_value = (
+        '{"score": 4, "score_explanation": "Good fluency."}'
+    )
+    score = quality_metric.evaluate_dialogue(dialogues[0], aspect="FLUENCY")
+    assert score == 4.0
+
+
+def test_evaluate_dialogue_all_aspects(quality_metric, dialogues) -> None:
+    """Test evaluate_dialogue succeeds for each aspect name."""
+    quality_metric.llm_interface.get_llm_api_response.return_value = (
+        '{"score": 3, "score_explanation": "Average."}'
+    )
+    for aspect in QualityRubrics:
+        score = quality_metric.evaluate_dialogue(
+            dialogues[0], aspect=aspect.name
+        )
+        assert score == 3.0
+
+
+def test_evaluate_dialogue_missing_score_key(quality_metric, dialogues) -> None:
+    """Test evaluate_dialogue returns 0.0 when 'score' key is missing."""
+    quality_metric.llm_interface.get_llm_api_response.return_value = (
+        '{"explanation": "No score field."}'
+    )
+    score = quality_metric.evaluate_dialogue(dialogues[0], aspect="FLUENCY")
+    assert score == 0.0
+
+
+def test_evaluate_dialogue_unknown_aspect(quality_metric, dialogues) -> None:
+    """Test evaluate_dialogue raises KeyError for unsupported aspect."""
+    with pytest.raises(KeyError, match="Unknown aspect"):
+        quality_metric.evaluate_dialogue(dialogues[0], aspect="NONEXISTENT")
+
+
+def test_evaluate_dialogues(quality_metric, dialogues) -> None:
+    """Test evaluate_dialogues returns scores keyed by conversation ID."""
+    quality_metric.llm_interface.get_llm_api_response.return_value = (
+        '{"score": 5, "score_explanation": "Excellent."}'
+    )
+    results = quality_metric.evaluate_dialogues(dialogues, aspect="OVERALL_SAT")
+    assert len(results) == len(dialogues)
+    for dialogue in dialogues:
+        assert dialogue.conversation_id in results
+        assert results[dialogue.conversation_id] == 5.0
diff --git a/tests/evaluation/test_satisfaction_metric.py b/tests/evaluation/test_satisfaction_metric.py
new file mode 100644
index 00000000..bd857060
--- /dev/null
+++ b/tests/evaluation/test_satisfaction_metric.py
@@ -0,0 +1,80 @@
+"""Tests for SatisfactionMetric."""
+
+from unittest.mock import MagicMock
+
+import pytest
+
+from dialoguekit.nlu.models.satisfaction_classifier import (
+    SatisfactionClassifier,
+)
+from dialoguekit.utils.dialogue_reader import json_to_dialogues
+
+from usersimcrs.evaluation.satisfaction_metric import SatisfactionMetric
+
+
+@pytest.fixture
+def dialogues():
+    """Load test dialogues."""
+    return json_to_dialogues(
+        "tests/data/annotated_dialogues.json",
+        agent_ids=["Agent"],
+        user_ids=["User"],
+    )
+
+
+@pytest.fixture
+def mock_classifier():
+    """Mock satisfaction classifier."""
+    return MagicMock(spec=SatisfactionClassifier)
+
+
+@pytest.fixture
+def satisfaction_metric(mock_classifier):
+    """SatisfactionMetric instance with mocked classifier."""
+    return SatisfactionMetric(classifier=mock_classifier)
+
+
+def test_satisfaction_metric_init(mock_classifier) -> None:
+    """Test SatisfactionMetric initializes with correct name."""
+    metric = SatisfactionMetric(classifier=mock_classifier)
+    assert metric.name == "satisfaction"
+    assert metric.classifier is mock_classifier
+
+
+def test_satisfaction_metric_custom_name(mock_classifier) -> None:
+    """Test SatisfactionMetric accepts custom name."""
+    metric = SatisfactionMetric(classifier=mock_classifier, name="sat_v2")
+    assert metric.name == "sat_v2"
+
+
+def test_evaluate_dialogue(satisfaction_metric, dialogues) -> None:
+    """Test evaluate_dialogue returns classifier score as float."""
+    satisfaction_metric.classifier.classify_last_n_dialogue.return_value = 4
+    score = satisfaction_metric.evaluate_dialogue(dialogues[0])
+    assert score == 4.0
+    classify = satisfaction_metric.classifier.classify_last_n_dialogue
+    classify.assert_called_once_with(dialogues[0], last_n=None)
+
+
+def test_evaluate_dialogue_low_score(satisfaction_metric, dialogues) -> None:
+    """Test evaluate_dialogue with a low satisfaction score."""
+    satisfaction_metric.classifier.classify_last_n_dialogue.return_value = 1
+    score = satisfaction_metric.evaluate_dialogue(dialogues[0])
+    assert score == 1.0
+
+
+def test_evaluate_dialogue_float_score(satisfaction_metric, dialogues) -> None:
+    """Test evaluate_dialogue handles fractional classifier output."""
+    satisfaction_metric.classifier.classify_last_n_dialogue.return_value = 3.7
+    score = satisfaction_metric.evaluate_dialogue(dialogues[0])
+    assert score == pytest.approx(3.7)
+
+
+def test_evaluate_dialogues(satisfaction_metric, dialogues) -> None:
+    """Test evaluate_dialogues returns scores keyed by conversation ID."""
+    satisfaction_metric.classifier.classify_last_n_dialogue.return_value = 3
+    results = satisfaction_metric.evaluate_dialogues(dialogues)
+    assert len(results) == len(dialogues)
+    for dialogue in dialogues:
+        assert dialogue.conversation_id in results
+        assert results[dialogue.conversation_id] == 3.0

From 1c802ffd1f39636fc696f8d4a415c8552f5da14c Mon Sep 17 00:00:00 2001
From: Ksenia Blokhina <kseniablokhina@MacBook-Pro-Ksenia.local>
Date: Tue, 10 Mar 2026 17:40:02 +0100
Subject: [PATCH 29/38] remove prepare_dialogue helper and rework utility metric tests

---
 tests/evaluation/test_utility_metric.py       | 152 ++++++++++--------
 usersimcrs/evaluation/dialogue_annotation.py  |  37 -----
 .../reward_per_dialogue_length_metric.py      |  10 +-
 ...ssful_recommendation_round_ratio_metric.py |  12 +-
 usersimcrs/evaluation/utility_base_metric.py  |  22 ++-
 5 files changed, 101 insertions(+), 132 deletions(-)

diff --git a/tests/evaluation/test_utility_metric.py b/tests/evaluation/test_utility_metric.py
index 72e221fc..aff046bc 100644
--- a/tests/evaluation/test_utility_metric.py
+++ b/tests/evaluation/test_utility_metric.py
@@ -14,10 +14,8 @@
     SuccessfulRecommendationRoundRatioMetric,
 )
 
-_MOCK_NLU = MagicMock()
-_PREPARE_DIALOGUE_PATH = (
-    "usersimcrs.evaluation.utility_base_metric.prepare_dialogue"
-)
+_LOAD_NLU_PATH = "usersimcrs.evaluation.utility_base_metric.load_nlu"
+_ANNOTATE_PATH = "usersimcrs.evaluation.utility_base_metric.annotate_dialogue"
 
 
 @pytest.fixture
@@ -38,88 +36,110 @@ def success_rate_metric():
     )
 
 
-@pytest.fixture
-def successful_round_ratio_metric():
-    return SuccessfulRecommendationRoundRatioMetric(
-        user_nlu_config_path="dummy_user_nlu.yaml",
-        agent_nlu_config_path="dummy_agent_nlu.yaml",
-    )
-
+def test_success_rate_init() -> None:
+    """Test SuccessRateMetric default and custom name."""
+    metric = SuccessRateMetric()
+    assert metric.name == "success_rate"
 
-@pytest.fixture
-def reward_per_dialogue_length_metric():
-    return RewardPerDialogueLengthMetric(
-        user_nlu_config_path="dummy_user_nlu.yaml",
-        agent_nlu_config_path="dummy_agent_nlu.yaml",
-    )
+    metric = SuccessRateMetric(name="custom_sr")
+    assert metric.name == "custom_sr"
 
 
 def test_success_rate_evaluate_dialogue(
     success_rate_metric: SuccessRateMetric, dialogues
 ) -> None:
-    """Test SuccessRateMetric.evaluate_dialogue."""
+    """Test SuccessRateMetric returns 1.0 for accepted dialogue."""
     dialogue = dialogues[0]
+    mock_nlu = MagicMock()
     with (
-        patch(
-            _PREPARE_DIALOGUE_PATH,
-            return_value=(dialogue, [], [], [], _MOCK_NLU, _MOCK_NLU),
+        patch(_LOAD_NLU_PATH, return_value=mock_nlu),
+        patch(_ANNOTATE_PATH),
+        patch.object(
+            SuccessRateMetric,
+            "_assess_dialogue",
+            return_value=True,
         ),
-        patch.object(SuccessRateMetric, "_assess_dialogue", return_value=True),
     ):
         assert success_rate_metric.evaluate_dialogue(dialogue) == 1.0
 
 
-def test_success_rate_evaluate_dialogue_unsuccessful(
-    success_rate_metric: SuccessRateMetric, dialogues
-) -> None:
-    """Test SuccessRateMetric.evaluate_dialogue for failed dialogue."""
+def test_success_rate_without_nlu_paths(dialogues) -> None:
+    """Test SuccessRateMetric works on pre-annotated dialogues."""
+    metric = SuccessRateMetric()
     dialogue = dialogues[0]
-    with (
-        patch(
-            _PREPARE_DIALOGUE_PATH,
-            return_value=(dialogue, [], [], [], _MOCK_NLU, _MOCK_NLU),
-        ),
-        patch.object(SuccessRateMetric, "_assess_dialogue", return_value=False),
+    with patch.object(SuccessRateMetric, "_assess_dialogue", return_value=True):
+        assert metric.evaluate_dialogue(dialogue) == 1.0
+
+
+@pytest.fixture
+def srrr_metric():
+    return SuccessfulRecommendationRoundRatioMetric()
+
+
+def test_srrr_init() -> None:
+    """Test SRRR metric default and custom name."""
+    metric = SuccessfulRecommendationRoundRatioMetric()
+    assert metric.name == "successful_recommendation_round_ratio"
+
+    metric = SuccessfulRecommendationRoundRatioMetric(name="srrr_v2")
+    assert metric.name == "srrr_v2"
+
+
+def test_srrr_evaluate_dialogue(srrr_metric, dialogues) -> None:
+    """Test SRRR returns correct ratio."""
+    dialogue = dialogues[0]
+    with patch.object(
+        SuccessfulRecommendationRoundRatioMetric,
+        "_assess_dialogue",
+        return_value=(1, 2),
     ):
-        assert success_rate_metric.evaluate_dialogue(dialogue) == 0.0
+        assert srrr_metric.evaluate_dialogue(dialogue) == 0.5
 
 
-def test_successful_recommendation_round_ratio_evaluate_dialogue(
-    successful_round_ratio_metric: SuccessfulRecommendationRoundRatioMetric,
-    dialogues,
-) -> None:
-    """Test SuccessfulRecommendationRoundRatioMetric.evaluate_dialogue."""
+def test_srrr_all_rounds_successful(srrr_metric, dialogues) -> None:
+    """Test SRRR returns 1.0 when all rounds accepted."""
     dialogue = dialogues[0]
-    with (
-        patch(
-            _PREPARE_DIALOGUE_PATH,
-            return_value=(dialogue, [], [], [], _MOCK_NLU, _MOCK_NLU),
-        ),
-        patch.object(
-            SuccessfulRecommendationRoundRatioMetric,
-            "_assess_dialogue",
-            return_value=(1, 2),
-        ),
+    with patch.object(
+        SuccessfulRecommendationRoundRatioMetric,
+        "_assess_dialogue",
+        return_value=(3, 3),
     ):
-        assert successful_round_ratio_metric.evaluate_dialogue(dialogue) == 0.5
+        assert srrr_metric.evaluate_dialogue(dialogue) == 1.0
 
 
-def test_reward_per_dialogue_length_evaluate_dialogue(
-    reward_per_dialogue_length_metric: RewardPerDialogueLengthMetric, dialogues
-) -> None:
-    """Test RewardPerDialogueLengthMetric.evaluate_dialogue."""
+def test_srrr_no_successful_rounds(srrr_metric, dialogues) -> None:
+    """Test SRRR returns 0.0 when no rounds are accepted."""
     dialogue = dialogues[0]
-    with (
-        patch(
-            _PREPARE_DIALOGUE_PATH,
-            return_value=(dialogue, [], [], [], _MOCK_NLU, _MOCK_NLU),
-        ),
-        patch.object(
-            RewardPerDialogueLengthMetric,
-            "_assess_dialogue",
-            return_value=(1, 10),
-        ),
+    with patch.object(
+        SuccessfulRecommendationRoundRatioMetric,
+        "_assess_dialogue",
+        return_value=(0, 4),
+    ):
+        assert srrr_metric.evaluate_dialogue(dialogue) == 0.0
+
+
+@pytest.fixture
+def rdl_metric():
+    return RewardPerDialogueLengthMetric()
+
+
+def test_rdl_no_accepted(rdl_metric, dialogues) -> None:
+    """Test RDL returns 0.0 when no recommendations accepted."""
+    dialogue = dialogues[0]
+    with patch.object(
+        RewardPerDialogueLengthMetric,
+        "_assess_dialogue",
+        return_value=(0, 7),
+    ):
+        assert rdl_metric.evaluate_dialogue(dialogue) == 0.0
+
+
+def test_rdl_multiple_accepted(rdl_metric, dialogues) -> None:
+    """Test RDL with several accepted recommendations."""
+    dialogue = dialogues[0]
+    with patch.object(
+        RewardPerDialogueLengthMetric,
+        "_assess_dialogue",
+        return_value=(3, 10),
     ):
-        assert (
-            reward_per_dialogue_length_metric.evaluate_dialogue(dialogue) == 0.1
-        )
+        assert rdl_metric.evaluate_dialogue(dialogue) == pytest.approx(0.3)
diff --git a/usersimcrs/evaluation/dialogue_annotation.py b/usersimcrs/evaluation/dialogue_annotation.py
index 31c06018..4d4ec9c6 100644
--- a/usersimcrs/evaluation/dialogue_annotation.py
+++ b/usersimcrs/evaluation/dialogue_annotation.py
@@ -160,43 +160,6 @@ def get_recommendation_rounds(
     return rounds
 
 
-def prepare_dialogue(
-    dialogue: Dialogue,
-    user_nlu_config_path: str,
-    agent_nlu_config_path: str,
-    cached_user_nlu: Optional[NLU] = None,
-    cached_agent_nlu: Optional[NLU] = None,
-    **kwargs: Any,
-) -> Tuple[Dialogue, List[Intent], List[Intent], List[Intent], NLU, NLU]:
-    """Loads NLU modules, annotates a dialogue, and builds intent lists.
-
-    Combines :func:`load_nlu`, :func:`annotate_dialogue`, and
-    :func:`get_intent_lists` into a single convenience call.
-
-    Args:
-        dialogue: Dialogue to prepare.
-        user_nlu_config_path: Path to user NLU configuration file.
-        agent_nlu_config_path: Path to agent NLU configuration file.
-        cached_user_nlu: Previously loaded user NLU module (avoids reload).
-        cached_agent_nlu: Previously loaded agent NLU module (avoids reload).
-        **kwargs: Optional intent label overrides forwarded to
-            :func:`get_intent_lists`.
-
-    Returns:
-        Tuple of (annotated dialogue, recommendation intents,
-        acceptance intents, rejection intents, user NLU, agent NLU).
-    """
-    user_nlu = load_nlu(
-        user_nlu_config_path, "User NLU Configuration", cached_user_nlu
-    )
-    agent_nlu = load_nlu(
-        agent_nlu_config_path, "Agent NLU Configuration", cached_agent_nlu
-    )
-    annotate_dialogue(dialogue, user_nlu, agent_nlu)
-    rec, acc, rej = get_intent_lists(**kwargs)
-    return dialogue, rec, acc, rej, user_nlu, agent_nlu
-
-
 def is_recommendation_accepted(
     round_utterances: List[AnnotatedUtterance],
     acceptance_intents: List[Intent],
diff --git a/usersimcrs/evaluation/reward_per_dialogue_length_metric.py b/usersimcrs/evaluation/reward_per_dialogue_length_metric.py
index 1727cc32..aca86dc6 100644
--- a/usersimcrs/evaluation/reward_per_dialogue_length_metric.py
+++ b/usersimcrs/evaluation/reward_per_dialogue_length_metric.py
@@ -3,7 +3,7 @@
 Evaluates the ratio of accepted recommendations to total dialogue length.
 """
 
-from typing import Any, List, Optional, Tuple
+from typing import Any, List, Tuple
 
 from dialoguekit.core.dialogue import Dialogue
 from dialoguekit.core.intent import Intent
@@ -15,8 +15,6 @@
 class RewardPerDialogueLengthMetric(UtilityBaseMetric):
     def __init__(
         self,
-        user_nlu_config_path: Optional[str] = None,
-        agent_nlu_config_path: Optional[str] = None,
         name: str = "reward_per_dialogue_length",
     ) -> None:
         """Initializes the reward-per-dialogue-length metric.
@@ -26,11 +24,7 @@ def __init__(
             agent_nlu_config_path: Path to agent NLU configuration.
             name: Metric name.
         """
-        super().__init__(
-            name,
-            user_nlu_config_path=user_nlu_config_path,
-            agent_nlu_config_path=agent_nlu_config_path,
-        )
+        super().__init__(name)
 
     def _assess_dialogue(
         self, dialogue: Dialogue, acceptance_intents: List[Intent]
diff --git a/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py b/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py
index 8f79a16b..09295572 100644
--- a/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py
+++ b/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py
@@ -4,7 +4,7 @@
 rounds in a dialogue.
 """
 
-from typing import Any, List, Optional, Tuple
+from typing import Any, List, Tuple
 
 from dialoguekit.core.dialogue import Dialogue
 from dialoguekit.core.intent import Intent
@@ -19,8 +19,6 @@
 class SuccessfulRecommendationRoundRatioMetric(UtilityBaseMetric):
     def __init__(
         self,
-        user_nlu_config_path: Optional[str] = None,
-        agent_nlu_config_path: Optional[str] = None,
         name: str = "successful_recommendation_round_ratio",
     ) -> None:
         """Initializes the successful recommendation round ratio metric.
@@ -32,8 +30,6 @@ def __init__(
         """
         super().__init__(
             name,
-            user_nlu_config_path=user_nlu_config_path,
-            agent_nlu_config_path=agent_nlu_config_path,
         )
 
     def _assess_dialogue(
@@ -71,11 +67,11 @@ def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float:
             dialogue: Dialogue to evaluate.
             **kwargs: Optional intent label overrides:
                 - recommendation_intent_labels: Labels for recommendation
-                  intents. Defaults to ``["REC-S", "REC-E"]``.
+                  intents. Defaults to ["REC-S", "REC-E"].
                 - acceptance_intent_labels: Labels for acceptance intents.
-                  Defaults to ``["ACC"]``.
+                  Defaults to ["ACC"].
                 - rejection_intent_labels: Labels for rejection intents.
-                  Defaults to ``["REJ"]``.
+                  Defaults to ["REJ"].
 
         Returns:
             Ratio of accepted recommendation rounds to total rounds,
diff --git a/usersimcrs/evaluation/utility_base_metric.py b/usersimcrs/evaluation/utility_base_metric.py
index 17618b65..c8a53b29 100644
--- a/usersimcrs/evaluation/utility_base_metric.py
+++ b/usersimcrs/evaluation/utility_base_metric.py
@@ -9,8 +9,9 @@
 
 from usersimcrs.evaluation.base_metric import BaseMetric
 from usersimcrs.evaluation.dialogue_annotation import (
+    annotate_dialogue,
     get_intent_lists,
-    prepare_dialogue,
+    load_nlu,
 )
 
 
@@ -55,20 +56,15 @@ def _resolve_intents(
             rejection_intents).
         """
         if self._user_nlu_config_path and self._agent_nlu_config_path:
-            (
-                _,
-                rec,
-                acc,
-                rej,
-                self._user_nlu,
-                self._agent_nlu,
-            ) = prepare_dialogue(
-                dialogue,
+            self._user_nlu = load_nlu(
                 self._user_nlu_config_path,
-                self._agent_nlu_config_path,
+                "User NLU Configuration",
                 self._user_nlu,
+            )
+            self._agent_nlu = load_nlu(
+                self._agent_nlu_config_path,
+                "Agent NLU Configuration",
                 self._agent_nlu,
-                **kwargs,
             )
-            return rec, acc, rej
+            annotate_dialogue(dialogue, self._user_nlu, self._agent_nlu)
         return get_intent_lists(**kwargs)

From 833500529752461d48f57246b0904c25ec82e40c Mon Sep 17 00:00:00 2001
From: Ksenia Blokhina <kseniablokhina@MacBook-Pro-Ksenia.local>
Date: Tue, 17 Mar 2026 13:23:25 +0100
Subject: [PATCH 30/38] simplifying

---
 tests/evaluation/__init__.py                  |   0
 tests/evaluation/test_quality_metric.py       | 115 --------------
 tests/evaluation/test_satisfaction_metric.py  |  80 ----------
 tests/evaluation/test_utility_metric.py       | 145 ------------------
 usersimcrs/evaluation/dialogue_annotation.py  |  67 ++++----
 .../reward_per_dialogue_length_metric.py      |  63 +++-----
 usersimcrs/evaluation/success_rate_metric.py  |  68 ++++----
 ...ssful_recommendation_round_ratio_metric.py |  80 ++++------
 usersimcrs/evaluation/utility_base.py         |  56 +++++++
 usersimcrs/evaluation/utility_base_metric.py  |  70 ---------
 10 files changed, 161 insertions(+), 583 deletions(-)
 delete mode 100644 tests/evaluation/__init__.py
 delete mode 100644 tests/evaluation/test_quality_metric.py
 delete mode 100644 tests/evaluation/test_satisfaction_metric.py
 delete mode 100644 tests/evaluation/test_utility_metric.py
 create mode 100644 usersimcrs/evaluation/utility_base.py
 delete mode 100644 usersimcrs/evaluation/utility_base_metric.py

diff --git a/tests/evaluation/__init__.py b/tests/evaluation/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/tests/evaluation/test_quality_metric.py b/tests/evaluation/test_quality_metric.py
deleted file mode 100644
index 3fdf40ec..00000000
--- a/tests/evaluation/test_quality_metric.py
+++ /dev/null
@@ -1,115 +0,0 @@
-"""Tests for QualityMetric."""
-
-from unittest.mock import MagicMock
-
-import pytest
-
-from dialoguekit.utils.dialogue_reader import json_to_dialogues
-
-from usersimcrs.evaluation.quality_metric import QualityMetric
-from usersimcrs.evaluation.quality_rubrics import QualityRubrics
-from usersimcrs.llm_interfaces.llm_interface import LLMInterface
-
-
-@pytest.fixture
-def dialogues():
-    """Load test dialogues."""
-    return json_to_dialogues(
-        "tests/data/annotated_dialogues.json",
-        agent_ids=["Agent"],
-        user_ids=["User"],
-    )
-
-
-@pytest.fixture
-def mock_llm_interface():
-    """Mock LLM interface."""
-    return MagicMock(spec=LLMInterface)
-
-
-@pytest.fixture
-def quality_metric(mock_llm_interface):
-    """QualityMetric instance with mocked LLM."""
-    return QualityMetric(llm_interface=mock_llm_interface)
-
-
-def test_quality_metric_init(mock_llm_interface) -> None:
-    """Test QualityMetric initializes with correct name and LLM."""
-    metric = QualityMetric(llm_interface=mock_llm_interface)
-    assert metric.name == "quality"
-    assert metric.llm_interface is mock_llm_interface
-
-
-def test_quality_metric_custom_name(mock_llm_interface) -> None:
-    """Test QualityMetric accepts custom name."""
-    metric = QualityMetric(llm_interface=mock_llm_interface, name="custom")
-    assert metric.name == "custom"
-
-
-def test_get_prompt(quality_metric, dialogues) -> None:
-    """Test _get_prompt builds prompt with dialogue and rubric."""
-    dialogue = dialogues[0]
-    prompt = quality_metric._get_prompt(QualityRubrics.FLUENCY, dialogue)
-
-    assert "CONVERSATION HISTORY" in prompt
-    assert "USER: Utterance 1" in prompt
-    assert "ASSISTANT: Utterance 2" in prompt
-    assert "GRADING RUBRIC" in prompt
-    assert "Fluency" in prompt
-    assert '{"score"' in prompt
-
-
-def test_get_prompt_all_aspects(quality_metric, dialogues) -> None:
-    """Test _get_prompt works for every quality aspect."""
-    dialogue = dialogues[0]
-    for aspect in QualityRubrics:
-        prompt = quality_metric._get_prompt(aspect, dialogue)
-        assert aspect.value in prompt
-
-
-def test_evaluate_dialogue_valid_response(quality_metric, dialogues) -> None:
-    """Test evaluate_dialogue parses a valid LLM JSON response."""
-    quality_metric.llm_interface.get_llm_api_response.return_value = (
-        '{"score": 4, "score_explanation": "Good fluency."}'
-    )
-    score = quality_metric.evaluate_dialogue(dialogues[0], aspect="FLUENCY")
-    assert score == 4.0
-
-
-def test_evaluate_dialogue_all_aspects(quality_metric, dialogues) -> None:
-    """Test evaluate_dialogue succeeds for each aspect name."""
-    quality_metric.llm_interface.get_llm_api_response.return_value = (
-        '{"score": 3, "score_explanation": "Average."}'
-    )
-    for aspect in QualityRubrics:
-        score = quality_metric.evaluate_dialogue(
-            dialogues[0], aspect=aspect.name
-        )
-        assert score == 3.0
-
-
-def test_evaluate_dialogue_missing_score_key(quality_metric, dialogues) -> None:
-    """Test evaluate_dialogue returns 0.0 when 'score' key is missing."""
-    quality_metric.llm_interface.get_llm_api_response.return_value = (
-        '{"explanation": "No score field."}'
-    )
-    score = quality_metric.evaluate_dialogue(dialogues[0], aspect="FLUENCY")
-    assert score == 0.0
-
-
-def test_evaluate_dialogue_unknown_aspect(quality_metric, dialogues) -> None:
-    """Test evaluate_dialogue raises KeyError for unsupported aspect."""
-    with pytest.raises(KeyError, match="Unknown aspect"):
-        quality_metric.evaluate_dialogue(dialogues[0], aspect="NONEXISTENT")
-
-
-def test_evaluate_dialogues(quality_metric, dialogues) -> None:
-    """Test evaluate_dialogues returns scores keyed by conversation ID."""
-    quality_metric.llm_interface.get_llm_api_response.return_value = (
-        '{"score": 5, "score_explanation": "Excellent."}'
-    )
-    results = quality_metric.evaluate_dialogues(dialogues, aspect="OVERALL_SAT")
-    assert len(results) == len(dialogues)
-    for dialogue in dialogues:
-        assert dialogue.conversation_id in results
-        assert results[dialogue.conversation_id] == 5.0
diff --git a/tests/evaluation/test_satisfaction_metric.py b/tests/evaluation/test_satisfaction_metric.py
deleted file mode 100644
index bd857060..00000000
--- a/tests/evaluation/test_satisfaction_metric.py
+++ /dev/null
@@ -1,80 +0,0 @@
-"""Tests for SatisfactionMetric."""
-
-from unittest.mock import MagicMock
-
-import pytest
-
-from dialoguekit.nlu.models.satisfaction_classifier import (
-    SatisfactionClassifier,
-)
-from dialoguekit.utils.dialogue_reader import json_to_dialogues
-
-from usersimcrs.evaluation.satisfaction_metric import SatisfactionMetric
-
-
-@pytest.fixture
-def dialogues():
-    """Load test dialogues."""
-    return json_to_dialogues(
-        "tests/data/annotated_dialogues.json",
-        agent_ids=["Agent"],
-        user_ids=["User"],
-    )
-
-
-@pytest.fixture
-def mock_classifier():
-    """Mock satisfaction classifier."""
-    return MagicMock(spec=SatisfactionClassifier)
-
-
-@pytest.fixture
-def satisfaction_metric(mock_classifier):
-    """SatisfactionMetric instance with mocked classifier."""
-    return SatisfactionMetric(classifier=mock_classifier)
-
-
-def test_satisfaction_metric_init(mock_classifier) -> None:
-    """Test SatisfactionMetric initializes with correct name."""
-    metric = SatisfactionMetric(classifier=mock_classifier)
-    assert metric.name == "satisfaction"
-    assert metric.classifier is mock_classifier
-
-
-def test_satisfaction_metric_custom_name(mock_classifier) -> None:
-    """Test SatisfactionMetric accepts custom name."""
-    metric = SatisfactionMetric(classifier=mock_classifier, name="sat_v2")
-    assert metric.name == "sat_v2"
-
-
-def test_evaluate_dialogue(satisfaction_metric, dialogues) -> None:
-    """Test evaluate_dialogue returns classifier score as float."""
-    satisfaction_metric.classifier.classify_last_n_dialogue.return_value = 4
-    score = satisfaction_metric.evaluate_dialogue(dialogues[0])
-    assert score == 4.0
-    classify = satisfaction_metric.classifier.classify_last_n_dialogue
-    classify.assert_called_once_with(dialogues[0], last_n=None)
-
-
-def test_evaluate_dialogue_low_score(satisfaction_metric, dialogues) -> None:
-    """Test evaluate_dialogue with a low satisfaction score."""
-    satisfaction_metric.classifier.classify_last_n_dialogue.return_value = 1
-    score = satisfaction_metric.evaluate_dialogue(dialogues[0])
-    assert score == 1.0
-
-
-def test_evaluate_dialogue_float_score(satisfaction_metric, dialogues) -> None:
-    """Test evaluate_dialogue handles fractional classifier output."""
-    satisfaction_metric.classifier.classify_last_n_dialogue.return_value = 3.7
-    score = satisfaction_metric.evaluate_dialogue(dialogues[0])
-    assert score == pytest.approx(3.7)
-
-
-def test_evaluate_dialogues(satisfaction_metric, dialogues) -> None:
-    """Test evaluate_dialogues returns scores keyed by conversation ID."""
-    satisfaction_metric.classifier.classify_last_n_dialogue.return_value = 3
-    results = satisfaction_metric.evaluate_dialogues(dialogues)
-    assert len(results) == len(dialogues)
-    for dialogue in dialogues:
-        assert dialogue.conversation_id in results
-        assert results[dialogue.conversation_id] == 3.0
diff --git a/tests/evaluation/test_utility_metric.py b/tests/evaluation/test_utility_metric.py
deleted file mode 100644
index aff046bc..00000000
--- a/tests/evaluation/test_utility_metric.py
+++ /dev/null
@@ -1,145 +0,0 @@
-"""Tests for utility metric classes."""
-
-from unittest.mock import MagicMock, patch
-
-import pytest
-
-from dialoguekit.utils.dialogue_reader import json_to_dialogues
-
-from usersimcrs.evaluation.reward_per_dialogue_length_metric import (
-    RewardPerDialogueLengthMetric,
-)
-from usersimcrs.evaluation.success_rate_metric import SuccessRateMetric
-from usersimcrs.evaluation.successful_recommendation_round_ratio_metric import (
-    SuccessfulRecommendationRoundRatioMetric,
-)
-
-_LOAD_NLU_PATH = "usersimcrs.evaluation.utility_base_metric.load_nlu"
-_ANNOTATE_PATH = "usersimcrs.evaluation.utility_base_metric.annotate_dialogue"
-
-
-@pytest.fixture
-def dialogues():
-    """Load test dialogues."""
-    return json_to_dialogues(
-        "tests/data/annotated_dialogues.json",
-        agent_ids=["Agent"],
-        user_ids=["User"],
-    )
-
-
-@pytest.fixture
-def success_rate_metric():
-    return SuccessRateMetric(
-        user_nlu_config_path="dummy_user_nlu.yaml",
-        agent_nlu_config_path="dummy_agent_nlu.yaml",
-    )
-
-
-def test_success_rate_init() -> None:
-    """Test SuccessRateMetric default and custom name."""
-    metric = SuccessRateMetric()
-    assert metric.name == "success_rate"
-
-    metric = SuccessRateMetric(name="custom_sr")
-    assert metric.name == "custom_sr"
-
-
-def test_success_rate_evaluate_dialogue(
-    success_rate_metric: SuccessRateMetric, dialogues
-) -> None:
-    """Test SuccessRateMetric returns 1.0 for accepted dialogue."""
-    dialogue = dialogues[0]
-    mock_nlu = MagicMock()
-    with (
-        patch(_LOAD_NLU_PATH, return_value=mock_nlu),
-        patch(_ANNOTATE_PATH),
-        patch.object(
-            SuccessRateMetric,
-            "_assess_dialogue",
-            return_value=True,
-        ),
-    ):
-        assert success_rate_metric.evaluate_dialogue(dialogue) == 1.0
-
-
-def test_success_rate_without_nlu_paths(dialogues) -> None:
-    """Test SuccessRateMetric works on pre-annotated dialogues."""
-    metric = SuccessRateMetric()
-    dialogue = dialogues[0]
-    with patch.object(SuccessRateMetric, "_assess_dialogue", return_value=True):
-        assert metric.evaluate_dialogue(dialogue) == 1.0
-
-
-@pytest.fixture
-def srrr_metric():
-    return SuccessfulRecommendationRoundRatioMetric()
-
-
-def test_srrr_init() -> None:
-    """Test SRRR metric default and custom name."""
-    metric = SuccessfulRecommendationRoundRatioMetric()
-    assert metric.name == "successful_recommendation_round_ratio"
-
-    metric = SuccessfulRecommendationRoundRatioMetric(name="srrr_v2")
-    assert metric.name == "srrr_v2"
-
-
-def test_srrr_evaluate_dialogue(srrr_metric, dialogues) -> None:
-    """Test SRRR returns correct ratio."""
-    dialogue = dialogues[0]
-    with patch.object(
-        SuccessfulRecommendationRoundRatioMetric,
-        "_assess_dialogue",
-        return_value=(1, 2),
-    ):
-        assert srrr_metric.evaluate_dialogue(dialogue) == 0.5
-
-
-def test_srrr_all_rounds_successful(srrr_metric, dialogues) -> None:
-    """Test SRRR returns 1.0 when all rounds accepted."""
-    dialogue = dialogues[0]
-    with patch.object(
-        SuccessfulRecommendationRoundRatioMetric,
-        "_assess_dialogue",
-        return_value=(3, 3),
-    ):
-        assert srrr_metric.evaluate_dialogue(dialogue) == 1.0
-
-
-def test_srrr_no_successful_rounds(srrr_metric, dialogues) -> None:
-    """Test SRRR returns 0.0 when no rounds are accepted."""
-    dialogue = dialogues[0]
-    with patch.object(
-        SuccessfulRecommendationRoundRatioMetric,
-        "_assess_dialogue",
-        return_value=(0, 4),
-    ):
-        assert srrr_metric.evaluate_dialogue(dialogue) == 0.0
-
-
-@pytest.fixture
-def rdl_metric():
-    return RewardPerDialogueLengthMetric()
-
-
-def test_rdl_no_accepted(rdl_metric, dialogues) -> None:
-    """Test RDL returns 0.0 when no recommendations accepted."""
-    dialogue = dialogues[0]
-    with patch.object(
-        RewardPerDialogueLengthMetric,
-        "_assess_dialogue",
-        return_value=(0, 7),
-    ):
-        assert rdl_metric.evaluate_dialogue(dialogue) == 0.0
-
-
-def test_rdl_multiple_accepted(rdl_metric, dialogues) -> None:
-    """Test RDL with several accepted recommendations."""
-    dialogue = dialogues[0]
-    with patch.object(
-        RewardPerDialogueLengthMetric,
-        "_assess_dialogue",
-        return_value=(3, 10),
-    ):
-        assert rdl_metric.evaluate_dialogue(dialogue) == pytest.approx(0.3)
diff --git a/usersimcrs/evaluation/dialogue_annotation.py b/usersimcrs/evaluation/dialogue_annotation.py
index 4d4ec9c6..fbefe441 100644
--- a/usersimcrs/evaluation/dialogue_annotation.py
+++ b/usersimcrs/evaluation/dialogue_annotation.py
@@ -1,11 +1,11 @@
 """Dialogue annotation and recommendation round utilities.
 
 Provides functions for annotating dialogues with dialogue acts using NLU
-modules, parsing intent labels, and extracting recommendation rounds from
-annotated dialogues.
+modules, extracting recommendation rounds from annotated dialogues, and
+assessing recommendation acceptance.
 """
 
-from typing import Any, List, Optional, Tuple
+from typing import Dict, List, Optional, Sequence, Tuple
 
 from confuse import Configuration
 
@@ -18,6 +18,27 @@
 from usersimcrs.utils.simulation_utils import get_NLU
 
 
+_intent_cache: Dict[Tuple[str, ...], List[Intent]] = {}
+
+
+def resolve_intents(
+    labels: Optional[Sequence[str]], defaults: List[str]
+) -> List[Intent]:
+    """Resolves optional label overrides to a cached list of Intents.
+
+    Args:
+        labels: Custom labels or None to use defaults.
+        defaults: Default label strings.
+
+    Returns:
+        Cached list of Intent objects.
+    """
+    key = tuple(labels if labels is not None else defaults)
+    if key not in _intent_cache:
+        _intent_cache[key] = [Intent(label) for label in key]
+    return _intent_cache[key]
+
+
 def annotate_dialogue(
     dialogue: Dialogue, user_nlu: NLU, agent_nlu: NLU
 ) -> Dialogue:
@@ -62,56 +83,22 @@ def annotate_dialogue(
 def load_nlu(
     nlu_config_path: str,
     config_name: str = "NLU Configuration",
-    cached_nlu: Optional[NLU] = None,
 ) -> NLU:
-    """Loads a single NLU module.
-
-    Returns the cached instance when provided, otherwise creates a new one
-    from the given configuration file.
+    """Loads a single NLU module from the given configuration file.
 
     Args:
         nlu_config_path: Path to the NLU configuration file.
-        config_name: Name for the Configuration instance.
-        cached_nlu: Previously loaded NLU module.
+        config_name: Name for the Configuration instance. Defaults to
+            ``"NLU Configuration"``.
 
     Returns:
         NLU module.
     """
-    if cached_nlu is not None:
-        return cached_nlu
     nlu_config = Configuration(config_name)
     nlu_config.set_file(nlu_config_path)
     return get_NLU(nlu_config)
 
 
-def get_intent_lists(
-    **kwargs: Any,
-) -> Tuple[List[Intent], List[Intent], List[Intent]]:
-    """Builds recommendation, acceptance, and rejection intent lists.
-
-    Args:
-        **kwargs: Optional intent label overrides:
-            - recommendation_intent_labels: Labels for recommendation intents.
-              Defaults to ``["REC-S", "REC-E"]``.
-            - acceptance_intent_labels: Labels for acceptance intents.
-              Defaults to ``["ACC"]``.
-            - rejection_intent_labels: Labels for rejection intents.
-              Defaults to ``["REJ"]``.
-
-    Returns:
-        Tuple of (recommendation_intents, acceptance_intents,
-        rejection_intents).
-    """
-    rec_labels = kwargs.get("recommendation_intent_labels", ["REC-S", "REC-E"])
-    acc_labels = kwargs.get("acceptance_intent_labels", ["ACC"])
-    rej_labels = kwargs.get("rejection_intent_labels", ["REJ"])
-    return (
-        [Intent(label) for label in rec_labels],
-        [Intent(label) for label in acc_labels],
-        [Intent(label) for label in rej_labels],
-    )
-
-
 def annotate_dialogues(
     dialogues: List[Dialogue],
     user_nlu_config_path: str,
diff --git a/usersimcrs/evaluation/reward_per_dialogue_length_metric.py b/usersimcrs/evaluation/reward_per_dialogue_length_metric.py
index aca86dc6..e28de982 100644
--- a/usersimcrs/evaluation/reward_per_dialogue_length_metric.py
+++ b/usersimcrs/evaluation/reward_per_dialogue_length_metric.py
@@ -3,16 +3,18 @@
 Evaluates the ratio of accepted recommendations to total dialogue length.
 """
 
-from typing import Any, List, Tuple
+from typing import Any, List, Optional
 
 from dialoguekit.core.dialogue import Dialogue
-from dialoguekit.core.intent import Intent
 from dialoguekit.participant.participant import DialogueParticipant
+from usersimcrs.evaluation.dialogue_annotation import (
+    resolve_intents,
+)
 
-from usersimcrs.evaluation.utility_base_metric import UtilityBaseMetric
+from usersimcrs.evaluation.utility_base import DEFAULT_ACC_LABELS, UtilityBase
 
 
-class RewardPerDialogueLengthMetric(UtilityBaseMetric):
+class RewardPerDialogueLengthMetric(UtilityBase):
     def __init__(
         self,
         name: str = "reward_per_dialogue_length",
@@ -20,53 +22,32 @@ def __init__(
         """Initializes the reward-per-dialogue-length metric.
 
         Args:
-            user_nlu_config_path: Path to user NLU configuration.
-            agent_nlu_config_path: Path to agent NLU configuration.
             name: Metric name.
         """
         super().__init__(name)
 
-    def _assess_dialogue(
-        self, dialogue: Dialogue, acceptance_intents: List[Intent]
-    ) -> Tuple[int, int]:
-        """Returns accepted recommendations and dialogue length.
-
-        Args:
-            dialogue: Annotated dialogue.
-            acceptance_intents: Intents that signal acceptance.
-
-        Returns:
-            Tuple of (accepted_recommendations, dialogue_length).
-        """
-        nb_accepted_recommendations = sum(
-            1
-            for utterance in dialogue.utterances
-            if utterance.participant == DialogueParticipant.USER
-            and any(
-                intent in acceptance_intents
-                for intent in utterance.get_intents()
-            )
-        )
-        return nb_accepted_recommendations, len(dialogue.utterances)
-
-    def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float:
+    def evaluate_dialogue(
+        self,
+        dialogue: Dialogue,
+        acceptance_intent_labels: Optional[List[str]] = None,
+        **kwargs: Any,
+    ) -> float:
         """Computes the reward-per-dialogue-length score.
 
         Args:
             dialogue: Dialogue to evaluate.
-            **kwargs: Optional intent label overrides:
-                - recommendation_intent_labels: Labels for recommendation
-                  intents. Defaults to ["REC-S", "REC-E"].
-                - acceptance_intent_labels: Labels for acceptance intents.
-                  Defaults to ["ACC"].
-                - rejection_intent_labels: Labels for rejection intents.
-                  Defaults to ["REJ"].
+            acceptance_intent_labels: Labels for acceptance intents.
+                Defaults to ``["ACC"]``.
 
         Returns:
             Ratio of accepted recommendations to total utterances.
         """
-        _, acc, _ = self._resolve_intents(dialogue=dialogue, **kwargs)
-        nb_accepted, dialogue_length = self._assess_dialogue(
-            dialogue=dialogue, acceptance_intents=acc
+        self._annotate_if_needed(dialogue)
+        acc = resolve_intents(acceptance_intent_labels, DEFAULT_ACC_LABELS)
+        nb_accepted = sum(
+            1
+            for utterance in dialogue.utterances
+            if utterance.participant == DialogueParticipant.USER
+            and any(intent in acc for intent in utterance.get_intents())
         )
-        return nb_accepted / dialogue_length
+        return nb_accepted / len(dialogue.utterances)
diff --git a/usersimcrs/evaluation/success_rate_metric.py b/usersimcrs/evaluation/success_rate_metric.py
index e0bb9566..f9926206 100644
--- a/usersimcrs/evaluation/success_rate_metric.py
+++ b/usersimcrs/evaluation/success_rate_metric.py
@@ -6,16 +6,21 @@
 from typing import Any, List, Optional
 
 from dialoguekit.core.dialogue import Dialogue
-from dialoguekit.core.intent import Intent
 
 from usersimcrs.evaluation.dialogue_annotation import (
     get_recommendation_rounds,
     is_recommendation_accepted,
+    resolve_intents,
+)
+from usersimcrs.evaluation.utility_base import (
+    DEFAULT_ACC_LABELS,
+    DEFAULT_REC_LABELS,
+    DEFAULT_REJ_LABELS,
+    UtilityBase,
 )
-from usersimcrs.evaluation.utility_base_metric import UtilityBaseMetric
 
 
-class SuccessRateMetric(UtilityBaseMetric):
+class SuccessRateMetric(UtilityBase):
     def __init__(
         self,
         user_nlu_config_path: Optional[str] = None,
@@ -35,54 +40,33 @@ def __init__(
             agent_nlu_config_path=agent_nlu_config_path,
         )
 
-    def _assess_dialogue(
+    def evaluate_dialogue(
         self,
         dialogue: Dialogue,
-        recommendation_intents: List[Intent],
-        acceptance_intents: List[Intent],
-        rejection_intents: List[Intent],
-    ) -> bool:
-        """Checks whether at least one recommendation round was accepted.
-
-        Args:
-            dialogue: Annotated dialogue.
-            recommendation_intents: Intents that signal a recommendation.
-            acceptance_intents: Intents that signal acceptance.
-            rejection_intents: Intents that signal rejection.
-
-        Returns:
-            True if at least one round was accepted, False otherwise.
-        """
-        rounds = get_recommendation_rounds(dialogue, recommendation_intents)
-        return any(
-            is_recommendation_accepted(
-                round_utterances, acceptance_intents, rejection_intents
-            )
-            for round_utterances in rounds
-        )
-
-    def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float:
+        recommendation_intent_labels: Optional[List[str]] = None,
+        acceptance_intent_labels: Optional[List[str]] = None,
+        rejection_intent_labels: Optional[List[str]] = None,
+        **kwargs: Any,
+    ) -> float:
         """Computes the success rate for a single dialogue.
 
         Args:
             dialogue: Dialogue to evaluate.
-            **kwargs: Optional intent label overrides:
-                - recommendation_intent_labels: Labels for recommendation
-                  intents. Defaults to ["REC-S", "REC-E"].
-                - acceptance_intent_labels: Labels for acceptance intents.
-                  Defaults to ["ACC"].
-                - rejection_intent_labels: Labels for rejection intents.
-                  Defaults to ["REJ"].
+            recommendation_intent_labels: Labels for recommendation intents.
+                Defaults to ``["REC-S", "REC-E"]``.
+            acceptance_intent_labels: Labels for acceptance intents.
+                Defaults to ``["ACC"]``.
+            rejection_intent_labels: Labels for rejection intents.
+                Defaults to ``["REJ"]``.
 
         Returns:
             1.0 if at least one recommendation was accepted, 0.0 otherwise.
         """
-        rec, acc, rej = self._resolve_intents(dialogue=dialogue, **kwargs)
+        self._annotate_if_needed(dialogue)
+        rec = resolve_intents(recommendation_intent_labels, DEFAULT_REC_LABELS)
+        acc = resolve_intents(acceptance_intent_labels, DEFAULT_ACC_LABELS)
+        rej = resolve_intents(rejection_intent_labels, DEFAULT_REJ_LABELS)
+        rounds = get_recommendation_rounds(dialogue, rec)
         return float(
-            self._assess_dialogue(
-                dialogue=dialogue,
-                recommendation_intents=rec,
-                acceptance_intents=acc,
-                rejection_intents=rej,
-            )
+            any(is_recommendation_accepted(r, acc, rej) for r in rounds)
         )
diff --git a/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py b/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py
index 09295572..471527fe 100644
--- a/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py
+++ b/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py
@@ -4,19 +4,24 @@
 rounds in a dialogue.
 """
 
-from typing import Any, List, Tuple
+from typing import Any, List, Optional
 
 from dialoguekit.core.dialogue import Dialogue
-from dialoguekit.core.intent import Intent
 
 from usersimcrs.evaluation.dialogue_annotation import (
     get_recommendation_rounds,
     is_recommendation_accepted,
+    resolve_intents,
+)
+from usersimcrs.evaluation.utility_base import (
+    DEFAULT_ACC_LABELS,
+    DEFAULT_REC_LABELS,
+    DEFAULT_REJ_LABELS,
+    UtilityBase,
 )
-from usersimcrs.evaluation.utility_base_metric import UtilityBaseMetric
 
 
-class SuccessfulRecommendationRoundRatioMetric(UtilityBaseMetric):
+class SuccessfulRecommendationRoundRatioMetric(UtilityBase):
     def __init__(
         self,
         name: str = "successful_recommendation_round_ratio",
@@ -24,64 +29,39 @@ def __init__(
         """Initializes the successful recommendation round ratio metric.
 
         Args:
-            user_nlu_config_path: Path to user NLU configuration.
-            agent_nlu_config_path: Path to agent NLU configuration.
             name: Metric name.
         """
-        super().__init__(
-            name,
-        )
+        super().__init__(name)
 
-    def _assess_dialogue(
+    def evaluate_dialogue(
         self,
         dialogue: Dialogue,
-        recommendation_intents: List[Intent],
-        acceptance_intents: List[Intent],
-        rejection_intents: List[Intent],
-    ) -> Tuple[int, int]:
-        """Returns successful and total recommendation rounds.
-
-        Args:
-            dialogue: Annotated dialogue.
-            recommendation_intents: Intents that signal a recommendation.
-            acceptance_intents: Intents that signal acceptance.
-            rejection_intents: Intents that signal rejection.
-
-        Returns:
-            Tuple of (successful_rounds, total_rounds).
-        """
-        rounds = get_recommendation_rounds(dialogue, recommendation_intents)
-        successful_rounds = sum(
-            1
-            for round_utterances in rounds
-            if is_recommendation_accepted(
-                round_utterances, acceptance_intents, rejection_intents
-            )
-        )
-        return successful_rounds, len(rounds)
-
-    def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float:
+        recommendation_intent_labels: Optional[List[str]] = None,
+        acceptance_intent_labels: Optional[List[str]] = None,
+        rejection_intent_labels: Optional[List[str]] = None,
+        **kwargs: Any,
+    ) -> float:
         """Computes the successful recommendation round ratio.
 
         Args:
             dialogue: Dialogue to evaluate.
-            **kwargs: Optional intent label overrides:
-                - recommendation_intent_labels: Labels for recommendation
-                  intents. Defaults to ["REC-S", "REC-E"].
-                - acceptance_intent_labels: Labels for acceptance intents.
-                  Defaults to ["ACC"].
-                - rejection_intent_labels: Labels for rejection intents.
-                  Defaults to ["REJ"].
+            recommendation_intent_labels: Labels for recommendation intents.
+                Defaults to ``["REC-S", "REC-E"]``.
+            acceptance_intent_labels: Labels for acceptance intents.
+                Defaults to ``["ACC"]``.
+            rejection_intent_labels: Labels for rejection intents.
+                Defaults to ``["REJ"]``.
 
         Returns:
             Ratio of accepted recommendation rounds to total rounds,
             or 0.0 if there are no recommendation rounds.
         """
-        rec, acc, rej = self._resolve_intents(dialogue=dialogue, **kwargs)
-        successful_rounds, total_rounds = self._assess_dialogue(
-            dialogue=dialogue,
-            recommendation_intents=rec,
-            acceptance_intents=acc,
-            rejection_intents=rej,
+        self._annotate_if_needed(dialogue)
+        rec = resolve_intents(recommendation_intent_labels, DEFAULT_REC_LABELS)
+        acc = resolve_intents(acceptance_intent_labels, DEFAULT_ACC_LABELS)
+        rej = resolve_intents(rejection_intent_labels, DEFAULT_REJ_LABELS)
+        rounds = get_recommendation_rounds(dialogue, rec)
+        successful = sum(
+            1 for r in rounds if is_recommendation_accepted(r, acc, rej)
         )
-        return successful_rounds / total_rounds if total_rounds > 0 else 0.0
+        return successful / len(rounds) if rounds else 0.0
diff --git a/usersimcrs/evaluation/utility_base.py b/usersimcrs/evaluation/utility_base.py
new file mode 100644
index 00000000..7f87a754
--- /dev/null
+++ b/usersimcrs/evaluation/utility_base.py
@@ -0,0 +1,56 @@
+"""Base class for utility-centric dialogue evaluation metrics.
+
+Provides shared NLU loading, and dialogue annotation.
+"""
+
+from abc import ABC
+from typing import Optional
+
+from dialoguekit.core.dialogue import Dialogue
+from dialoguekit.nlu.nlu import NLU
+
+from usersimcrs.evaluation.base_metric import BaseMetric
+from usersimcrs.evaluation.dialogue_annotation import (
+    annotate_dialogue,
+    load_nlu,
+)
+
+DEFAULT_REC_LABELS = ["REC-S", "REC-E"]
+DEFAULT_ACC_LABELS = ["ACC"]
+DEFAULT_REJ_LABELS = ["REJ"]
+
+
+class UtilityBase(BaseMetric, ABC):
+    def __init__(
+        self,
+        name: str,
+        user_nlu_config_path: Optional[str] = None,
+        agent_nlu_config_path: Optional[str] = None,
+    ) -> None:
+        """Initializes the utility metric.
+
+        Args:
+            name: Metric name.
+            user_nlu_config_path: Path to user NLU configuration.
+            agent_nlu_config_path: Path to agent NLU configuration.
+        """
+        super().__init__(name)
+        self._user_nlu_config_path = user_nlu_config_path
+        self._agent_nlu_config_path = agent_nlu_config_path
+        self._user_nlu: Optional[NLU] = None
+        self._agent_nlu: Optional[NLU] = None
+
+    def _annotate_if_needed(self, dialogue: Dialogue) -> None:
+        """Annotates the dialogue with NLU if config paths are set."""
+        if self._user_nlu_config_path and self._agent_nlu_config_path:
+            if self._user_nlu is None:
+                self._user_nlu = load_nlu(
+                    self._user_nlu_config_path,
+                    "User NLU Configuration",
+                )
+            if self._agent_nlu is None:
+                self._agent_nlu = load_nlu(
+                    self._agent_nlu_config_path,
+                    "Agent NLU Configuration",
+                )
+            annotate_dialogue(dialogue, self._user_nlu, self._agent_nlu)
diff --git a/usersimcrs/evaluation/utility_base_metric.py b/usersimcrs/evaluation/utility_base_metric.py
deleted file mode 100644
index c8a53b29..00000000
--- a/usersimcrs/evaluation/utility_base_metric.py
+++ /dev/null
@@ -1,70 +0,0 @@
-"""Base class for dialogue annotation support."""
-
-from abc import ABC
-from typing import Any, List, Optional, Tuple
-
-from dialoguekit.core.dialogue import Dialogue
-from dialoguekit.core.intent import Intent
-from dialoguekit.nlu.nlu import NLU
-
-from usersimcrs.evaluation.base_metric import BaseMetric
-from usersimcrs.evaluation.dialogue_annotation import (
-    annotate_dialogue,
-    get_intent_lists,
-    load_nlu,
-)
-
-
-class UtilityBaseMetric(BaseMetric, ABC):
-    """Shared base for metrics that optionally annotate dialogues via NLU.
-
-    When NLU config paths are provided, dialogues are annotated automatically.
-    When omitted, dialogues must be pre-annotated.
-    """
-
-    def __init__(
-        self,
-        name: str,
-        user_nlu_config_path: Optional[str] = None,
-        agent_nlu_config_path: Optional[str] = None,
-    ) -> None:
-        """Initializes the utility metric.
-
-        Args:
-            name: Metric name.
-            user_nlu_config_path: Path to user NLU configuration.
-            agent_nlu_config_path: Path to agent NLU configuration.
-        """
-        super().__init__(name)
-        self._user_nlu_config_path = user_nlu_config_path
-        self._agent_nlu_config_path = agent_nlu_config_path
-        self._user_nlu: Optional[NLU] = None
-        self._agent_nlu: Optional[NLU] = None
-
-    def _resolve_intents(
-        self, dialogue: Dialogue, **kwargs: Any
-    ) -> Tuple[List[Intent], List[Intent], List[Intent]]:
-        """Annotates the dialogue (if NLU paths are set) and returns intents.
-
-        Args:
-            dialogue: Dialogue to prepare.
-            **kwargs: Optional intent label overrides forwarded to
-                :func:`get_intent_lists`.
-
-        Returns:
-            Tuple of (recommendation_intents, acceptance_intents,
-            rejection_intents).
-        """
-        if self._user_nlu_config_path and self._agent_nlu_config_path:
-            self._user_nlu = load_nlu(
-                self._user_nlu_config_path,
-                "User NLU Configuration",
-                self._user_nlu,
-            )
-            self._agent_nlu = load_nlu(
-                self._agent_nlu_config_path,
-                "Agent NLU Configuration",
-                self._agent_nlu,
-            )
-            annotate_dialogue(dialogue, self._user_nlu, self._agent_nlu)
-        return get_intent_lists(**kwargs)

From f006930c957aa8e33f4e21702fb3b48bdd5b7726 Mon Sep 17 00:00:00 2001
From: Ksenia Blokhina <kseniablokhina@MacBook-Pro-Ksenia.local>
Date: Tue, 17 Mar 2026 13:32:37 +0100
Subject: [PATCH 31/38] remove utility class

---
 usersimcrs/evaluation/dialogue_annotation.py  | 36 ++++++++++++
 .../reward_per_dialogue_length_metric.py      |  8 +--
 usersimcrs/evaluation/success_rate_metric.py  | 27 ++++-----
 ...ssful_recommendation_round_ratio_metric.py | 13 ++---
 usersimcrs/evaluation/utility_base.py         | 56 -------------------
 5 files changed, 59 insertions(+), 81 deletions(-)
 delete mode 100644 usersimcrs/evaluation/utility_base.py

diff --git a/usersimcrs/evaluation/dialogue_annotation.py b/usersimcrs/evaluation/dialogue_annotation.py
index fbefe441..12e4a573 100644
--- a/usersimcrs/evaluation/dialogue_annotation.py
+++ b/usersimcrs/evaluation/dialogue_annotation.py
@@ -18,7 +18,12 @@
 from usersimcrs.utils.simulation_utils import get_NLU
 
 
+DEFAULT_REC_LABELS = ["REC-S", "REC-E"]
+DEFAULT_ACC_LABELS = ["ACC"]
+DEFAULT_REJ_LABELS = ["REJ"]
+
 _intent_cache: Dict[Tuple[str, ...], List[Intent]] = {}
+_nlu_cache: Dict[str, NLU] = {}
 
 
 def resolve_intents(
@@ -39,6 +44,37 @@ def resolve_intents(
     return _intent_cache[key]
 
 
+def annotate_if_needed(
+    dialogue: Dialogue,
+    user_nlu_config_path: Optional[str] = None,
+    agent_nlu_config_path: Optional[str] = None,
+) -> None:
+    """Annotates the dialogue with NLU if config paths are provided.
+
+    NLU modules are loaded lazily and cached by config path.
+
+    Args:
+        dialogue: Dialogue to annotate.
+        user_nlu_config_path: Path to user NLU configuration.
+        agent_nlu_config_path: Path to agent NLU configuration.
+    """
+    if not user_nlu_config_path or not agent_nlu_config_path:
+        return
+    if user_nlu_config_path not in _nlu_cache:
+        _nlu_cache[user_nlu_config_path] = load_nlu(
+            user_nlu_config_path, "User NLU Configuration"
+        )
+    if agent_nlu_config_path not in _nlu_cache:
+        _nlu_cache[agent_nlu_config_path] = load_nlu(
+            agent_nlu_config_path, "Agent NLU Configuration"
+        )
+    annotate_dialogue(
+        dialogue,
+        _nlu_cache[user_nlu_config_path],
+        _nlu_cache[agent_nlu_config_path],
+    )
+
+
 def annotate_dialogue(
     dialogue: Dialogue, user_nlu: NLU, agent_nlu: NLU
 ) -> Dialogue:
diff --git a/usersimcrs/evaluation/reward_per_dialogue_length_metric.py b/usersimcrs/evaluation/reward_per_dialogue_length_metric.py
index e28de982..1fcb30b1 100644
--- a/usersimcrs/evaluation/reward_per_dialogue_length_metric.py
+++ b/usersimcrs/evaluation/reward_per_dialogue_length_metric.py
@@ -7,14 +7,15 @@
 
 from dialoguekit.core.dialogue import Dialogue
 from dialoguekit.participant.participant import DialogueParticipant
+
+from usersimcrs.evaluation.base_metric import BaseMetric
 from usersimcrs.evaluation.dialogue_annotation import (
+    DEFAULT_ACC_LABELS,
     resolve_intents,
 )
 
-from usersimcrs.evaluation.utility_base import DEFAULT_ACC_LABELS, UtilityBase
-
 
-class RewardPerDialogueLengthMetric(UtilityBase):
+class RewardPerDialogueLengthMetric(BaseMetric):
     def __init__(
         self,
         name: str = "reward_per_dialogue_length",
@@ -42,7 +43,6 @@ def evaluate_dialogue(
         Returns:
             Ratio of accepted recommendations to total utterances.
         """
-        self._annotate_if_needed(dialogue)
         acc = resolve_intents(acceptance_intent_labels, DEFAULT_ACC_LABELS)
         nb_accepted = sum(
             1
diff --git a/usersimcrs/evaluation/success_rate_metric.py b/usersimcrs/evaluation/success_rate_metric.py
index f9926206..4d6e9d86 100644
--- a/usersimcrs/evaluation/success_rate_metric.py
+++ b/usersimcrs/evaluation/success_rate_metric.py
@@ -7,20 +7,19 @@
 
 from dialoguekit.core.dialogue import Dialogue
 
+from usersimcrs.evaluation.base_metric import BaseMetric
 from usersimcrs.evaluation.dialogue_annotation import (
-    get_recommendation_rounds,
-    is_recommendation_accepted,
-    resolve_intents,
-)
-from usersimcrs.evaluation.utility_base import (
     DEFAULT_ACC_LABELS,
     DEFAULT_REC_LABELS,
     DEFAULT_REJ_LABELS,
-    UtilityBase,
+    annotate_if_needed,
+    get_recommendation_rounds,
+    is_recommendation_accepted,
+    resolve_intents,
 )
 
 
-class SuccessRateMetric(UtilityBase):
+class SuccessRateMetric(BaseMetric):
     def __init__(
         self,
         user_nlu_config_path: Optional[str] = None,
@@ -34,11 +33,9 @@ def __init__(
             agent_nlu_config_path: Path to agent NLU configuration.
             name: Metric name.
         """
-        super().__init__(
-            name,
-            user_nlu_config_path=user_nlu_config_path,
-            agent_nlu_config_path=agent_nlu_config_path,
-        )
+        super().__init__(name)
+        self._user_nlu_config_path = user_nlu_config_path
+        self._agent_nlu_config_path = agent_nlu_config_path
 
     def evaluate_dialogue(
         self,
@@ -62,7 +59,11 @@ def evaluate_dialogue(
         Returns:
             1.0 if at least one recommendation was accepted, 0.0 otherwise.
         """
-        self._annotate_if_needed(dialogue)
+        annotate_if_needed(
+            dialogue,
+            self._user_nlu_config_path,
+            self._agent_nlu_config_path,
+        )
         rec = resolve_intents(recommendation_intent_labels, DEFAULT_REC_LABELS)
         acc = resolve_intents(acceptance_intent_labels, DEFAULT_ACC_LABELS)
         rej = resolve_intents(rejection_intent_labels, DEFAULT_REJ_LABELS)
diff --git a/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py b/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py
index 471527fe..c5fe6fe6 100644
--- a/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py
+++ b/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py
@@ -8,20 +8,18 @@
 
 from dialoguekit.core.dialogue import Dialogue
 
+from usersimcrs.evaluation.base_metric import BaseMetric
 from usersimcrs.evaluation.dialogue_annotation import (
-    get_recommendation_rounds,
-    is_recommendation_accepted,
-    resolve_intents,
-)
-from usersimcrs.evaluation.utility_base import (
     DEFAULT_ACC_LABELS,
     DEFAULT_REC_LABELS,
     DEFAULT_REJ_LABELS,
-    UtilityBase,
+    get_recommendation_rounds,
+    is_recommendation_accepted,
+    resolve_intents,
 )
 
 
-class SuccessfulRecommendationRoundRatioMetric(UtilityBase):
+class SuccessfulRecommendationRoundRatioMetric(BaseMetric):
     def __init__(
         self,
         name: str = "successful_recommendation_round_ratio",
@@ -56,7 +54,6 @@ def evaluate_dialogue(
             Ratio of accepted recommendation rounds to total rounds,
             or 0.0 if there are no recommendation rounds.
         """
-        self._annotate_if_needed(dialogue)
         rec = resolve_intents(recommendation_intent_labels, DEFAULT_REC_LABELS)
         acc = resolve_intents(acceptance_intent_labels, DEFAULT_ACC_LABELS)
         rej = resolve_intents(rejection_intent_labels, DEFAULT_REJ_LABELS)
diff --git a/usersimcrs/evaluation/utility_base.py b/usersimcrs/evaluation/utility_base.py
deleted file mode 100644
index 7f87a754..00000000
--- a/usersimcrs/evaluation/utility_base.py
+++ /dev/null
@@ -1,56 +0,0 @@
-"""Base class for utility-centric dialogue evaluation metrics.
-
-Provides shared NLU loading, and dialogue annotation.
-"""
-
-from abc import ABC
-from typing import Optional
-
-from dialoguekit.core.dialogue import Dialogue
-from dialoguekit.nlu.nlu import NLU
-
-from usersimcrs.evaluation.base_metric import BaseMetric
-from usersimcrs.evaluation.dialogue_annotation import (
-    annotate_dialogue,
-    load_nlu,
-)
-
-DEFAULT_REC_LABELS = ["REC-S", "REC-E"]
-DEFAULT_ACC_LABELS = ["ACC"]
-DEFAULT_REJ_LABELS = ["REJ"]
-
-
-class UtilityBase(BaseMetric, ABC):
-    def __init__(
-        self,
-        name: str,
-        user_nlu_config_path: Optional[str] = None,
-        agent_nlu_config_path: Optional[str] = None,
-    ) -> None:
-        """Initializes the utility metric.
-
-        Args:
-            name: Metric name.
-            user_nlu_config_path: Path to user NLU configuration.
-            agent_nlu_config_path: Path to agent NLU configuration.
-        """
-        super().__init__(name)
-        self._user_nlu_config_path = user_nlu_config_path
-        self._agent_nlu_config_path = agent_nlu_config_path
-        self._user_nlu: Optional[NLU] = None
-        self._agent_nlu: Optional[NLU] = None
-
-    def _annotate_if_needed(self, dialogue: Dialogue) -> None:
-        """Annotates the dialogue with NLU if config paths are set."""
-        if self._user_nlu_config_path and self._agent_nlu_config_path:
-            if self._user_nlu is None:
-                self._user_nlu = load_nlu(
-                    self._user_nlu_config_path,
-                    "User NLU Configuration",
-                )
-            if self._agent_nlu is None:
-                self._agent_nlu = load_nlu(
-                    self._agent_nlu_config_path,
-                    "Agent NLU Configuration",
-                )
-            annotate_dialogue(dialogue, self._user_nlu, self._agent_nlu)

From e0f3c56e8fa9bc93c7b6ea9c99c5e3996aae844d Mon Sep 17 00:00:00 2001
From: Ksenia Blokhina <kseniablokhina@MacBook-Pro-Ksenia.local>
Date: Tue, 17 Mar 2026 14:48:06 +0100
Subject: [PATCH 32/38] fix get_recommendation_rounds

---
 usersimcrs/evaluation/dialogue_annotation.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/usersimcrs/evaluation/dialogue_annotation.py b/usersimcrs/evaluation/dialogue_annotation.py
index 12e4a573..c6b4ec3c 100644
--- a/usersimcrs/evaluation/dialogue_annotation.py
+++ b/usersimcrs/evaluation/dialogue_annotation.py
@@ -170,16 +170,21 @@ def get_recommendation_rounds(
     """
     rounds: List[List[AnnotatedUtterance]] = []
     current_round: List[AnnotatedUtterance] = []
+    in_round = False
     for utterance in dialogue.utterances:
         if any(
             intent in utterance.get_intents()
             for intent in recommendation_intents
         ):
-            if current_round:
+            if in_round and current_round:
                 rounds.append(current_round)
             current_round = [utterance]
+            in_round = True
         else:
-            current_round.append(utterance)
+            if in_round:
+                current_round.append(utterance)
+    if in_round and current_round:
+        rounds.append(current_round)
     return rounds
 
 

From 4157fdc56a2b7d0d5d585ec4e5c08cca44f84308 Mon Sep 17 00:00:00 2001
From: Ksenia Blokhina <kseniablokhina@MacBook-Pro-Ksenia.local>
Date: Tue, 17 Mar 2026 17:14:08 +0100
Subject: [PATCH 33/38] fixes

---
 usersimcrs/evaluation/dialogue_annotation.py  | 24 +++++++---
 .../reward_per_dialogue_length_metric.py      | 20 ++++----
 usersimcrs/evaluation/success_rate_metric.py  | 48 +++++++------------
 ...ssful_recommendation_round_ratio_metric.py | 38 +++++++--------
 4 files changed, 64 insertions(+), 66 deletions(-)

diff --git a/usersimcrs/evaluation/dialogue_annotation.py b/usersimcrs/evaluation/dialogue_annotation.py
index c6b4ec3c..0039594f 100644
--- a/usersimcrs/evaluation/dialogue_annotation.py
+++ b/usersimcrs/evaluation/dialogue_annotation.py
@@ -44,6 +44,21 @@ def resolve_intents(
     return _intent_cache[key]
 
 
+DEFAULT_REC_INTENTS: List[Intent] = resolve_intents(None, DEFAULT_REC_LABELS)
+DEFAULT_ACC_INTENTS: List[Intent] = resolve_intents(None, DEFAULT_ACC_LABELS)
+DEFAULT_REJ_INTENTS: List[Intent] = resolve_intents(None, DEFAULT_REJ_LABELS)
+
+
+def ensure_dialogue_is_annotated(dialogue: Dialogue) -> None:
+    """Raises if a dialogue is not annotated with annotated utterances."""
+    for utterance in dialogue.utterances:
+        if not isinstance(utterance, AnnotatedUtterance):
+            raise ValueError(
+                "Dialogue must be annotated (utterances must be "
+                "`AnnotatedUtterance`)."
+            )
+
+
 def annotate_if_needed(
     dialogue: Dialogue,
     user_nlu_config_path: Optional[str] = None,
@@ -170,20 +185,17 @@ def get_recommendation_rounds(
     """
     rounds: List[List[AnnotatedUtterance]] = []
     current_round: List[AnnotatedUtterance] = []
-    in_round = False
     for utterance in dialogue.utterances:
         if any(
             intent in utterance.get_intents()
             for intent in recommendation_intents
         ):
-            if in_round and current_round:
+            if current_round:
                 rounds.append(current_round)
             current_round = [utterance]
-            in_round = True
         else:
-            if in_round:
-                current_round.append(utterance)
-    if in_round and current_round:
+            current_round.append(utterance)
+    if current_round:
         rounds.append(current_round)
     return rounds
 
diff --git a/usersimcrs/evaluation/reward_per_dialogue_length_metric.py b/usersimcrs/evaluation/reward_per_dialogue_length_metric.py
index 1fcb30b1..c08881b1 100644
--- a/usersimcrs/evaluation/reward_per_dialogue_length_metric.py
+++ b/usersimcrs/evaluation/reward_per_dialogue_length_metric.py
@@ -3,15 +3,15 @@
 Evaluates the ratio of accepted recommendations to total dialogue length.
 """
 
-from typing import Any, List, Optional
+from typing import Any, List
 
 from dialoguekit.core.dialogue import Dialogue
+from dialoguekit.core.intent import Intent
 from dialoguekit.participant.participant import DialogueParticipant
 
 from usersimcrs.evaluation.base_metric import BaseMetric
 from usersimcrs.evaluation.dialogue_annotation import (
-    DEFAULT_ACC_LABELS,
-    resolve_intents,
+    ensure_dialogue_is_annotated,
 )
 
 
@@ -23,31 +23,33 @@ def __init__(
         """Initializes the reward-per-dialogue-length metric.
 
         Args:
-            name: Metric name.
+            name: Metric name. Defaults to "reward_per_dialogue_length".
         """
         super().__init__(name)
 
     def evaluate_dialogue(
         self,
         dialogue: Dialogue,
-        acceptance_intent_labels: Optional[List[str]] = None,
+        acceptance_intents: List[Intent],
         **kwargs: Any,
     ) -> float:
         """Computes the reward-per-dialogue-length score.
 
         Args:
             dialogue: Dialogue to evaluate.
-            acceptance_intent_labels: Labels for acceptance intents.
-                Defaults to ``["ACC"]``.
+            acceptance_intents: Acceptance intents (e.g., ``[Intent("ACC")]``).
 
         Returns:
             Ratio of accepted recommendations to total utterances.
         """
-        acc = resolve_intents(acceptance_intent_labels, DEFAULT_ACC_LABELS)
+        ensure_dialogue_is_annotated(dialogue)
         nb_accepted = sum(
             1
             for utterance in dialogue.utterances
             if utterance.participant == DialogueParticipant.USER
-            and any(intent in acc for intent in utterance.get_intents())
+            and any(
+                intent in acceptance_intents
+                for intent in utterance.get_intents()
+            )
         )
         return nb_accepted / len(dialogue.utterances)
diff --git a/usersimcrs/evaluation/success_rate_metric.py b/usersimcrs/evaluation/success_rate_metric.py
index 4d6e9d86..9d3dd3bd 100644
--- a/usersimcrs/evaluation/success_rate_metric.py
+++ b/usersimcrs/evaluation/success_rate_metric.py
@@ -3,71 +3,57 @@
 Evaluates whether at least one recommendation was accepted during a dialogue.
 """
 
-from typing import Any, List, Optional
+from typing import Any, List
 
 from dialoguekit.core.dialogue import Dialogue
+from dialoguekit.core.intent import Intent
 
 from usersimcrs.evaluation.base_metric import BaseMetric
 from usersimcrs.evaluation.dialogue_annotation import (
-    DEFAULT_ACC_LABELS,
-    DEFAULT_REC_LABELS,
-    DEFAULT_REJ_LABELS,
-    annotate_if_needed,
+    ensure_dialogue_is_annotated,
     get_recommendation_rounds,
     is_recommendation_accepted,
-    resolve_intents,
 )
 
 
 class SuccessRateMetric(BaseMetric):
     def __init__(
         self,
-        user_nlu_config_path: Optional[str] = None,
-        agent_nlu_config_path: Optional[str] = None,
         name: str = "success_rate",
     ) -> None:
         """Initializes the success rate metric.
 
         Args:
-            user_nlu_config_path: Path to user NLU configuration.
-            agent_nlu_config_path: Path to agent NLU configuration.
             name: Metric name.
         """
         super().__init__(name)
-        self._user_nlu_config_path = user_nlu_config_path
-        self._agent_nlu_config_path = agent_nlu_config_path
 
     def evaluate_dialogue(
         self,
         dialogue: Dialogue,
-        recommendation_intent_labels: Optional[List[str]] = None,
-        acceptance_intent_labels: Optional[List[str]] = None,
-        rejection_intent_labels: Optional[List[str]] = None,
+        recommendation_intents: List[Intent],
+        acceptance_intents: List[Intent],
+        rejection_intents: List[Intent],
         **kwargs: Any,
     ) -> float:
         """Computes the success rate for a single dialogue.
 
         Args:
             dialogue: Dialogue to evaluate.
-            recommendation_intent_labels: Labels for recommendation intents.
-                Defaults to ``["REC-S", "REC-E"]``.
-            acceptance_intent_labels: Labels for acceptance intents.
-                Defaults to ``["ACC"]``.
-            rejection_intent_labels: Labels for rejection intents.
-                Defaults to ``["REJ"]``.
+            recommendation_intents: Intents that indicate recommendation.
+            acceptance_intents: Intents that indicate acceptance.
+            rejection_intents: Intents that indicate rejection.
 
         Returns:
             1.0 if at least one recommendation was accepted, 0.0 otherwise.
         """
-        annotate_if_needed(
-            dialogue,
-            self._user_nlu_config_path,
-            self._agent_nlu_config_path,
-        )
-        rec = resolve_intents(recommendation_intent_labels, DEFAULT_REC_LABELS)
-        acc = resolve_intents(acceptance_intent_labels, DEFAULT_ACC_LABELS)
-        rej = resolve_intents(rejection_intent_labels, DEFAULT_REJ_LABELS)
-        rounds = get_recommendation_rounds(dialogue, rec)
+        ensure_dialogue_is_annotated(dialogue)
+        rounds = get_recommendation_rounds(dialogue, recommendation_intents)
         return float(
-            any(is_recommendation_accepted(r, acc, rej) for r in rounds)
+            any(
+                is_recommendation_accepted(
+                    r, acceptance_intents, rejection_intents
+                )
+                for r in rounds
+            )
         )
diff --git a/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py b/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py
index c5fe6fe6..ce2e9b1f 100644
--- a/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py
+++ b/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py
@@ -4,18 +4,16 @@
 rounds in a dialogue.
 """
 
-from typing import Any, List, Optional
+from typing import Any, List
 
 from dialoguekit.core.dialogue import Dialogue
+from dialoguekit.core.intent import Intent
 
 from usersimcrs.evaluation.base_metric import BaseMetric
 from usersimcrs.evaluation.dialogue_annotation import (
-    DEFAULT_ACC_LABELS,
-    DEFAULT_REC_LABELS,
-    DEFAULT_REJ_LABELS,
+    ensure_dialogue_is_annotated,
     get_recommendation_rounds,
     is_recommendation_accepted,
-    resolve_intents,
 )
 
 
@@ -27,38 +25,38 @@ def __init__(
         """Initializes the successful recommendation round ratio metric.
 
         Args:
-            name: Metric name.
+            name: Metric name. Defaults to
+                    "successful_recommendation_round_ratio".
         """
         super().__init__(name)
 
     def evaluate_dialogue(
         self,
         dialogue: Dialogue,
-        recommendation_intent_labels: Optional[List[str]] = None,
-        acceptance_intent_labels: Optional[List[str]] = None,
-        rejection_intent_labels: Optional[List[str]] = None,
+        recommendation_intents: List[Intent],
+        acceptance_intents: List[Intent],
+        rejection_intents: List[Intent],
         **kwargs: Any,
     ) -> float:
         """Computes the successful recommendation round ratio.
 
         Args:
             dialogue: Dialogue to evaluate.
-            recommendation_intent_labels: Labels for recommendation intents.
-                Defaults to ``["REC-S", "REC-E"]``.
-            acceptance_intent_labels: Labels for acceptance intents.
-                Defaults to ``["ACC"]``.
-            rejection_intent_labels: Labels for rejection intents.
-                Defaults to ``["REJ"]``.
+            recommendation_intents: Intents that indicate recommendation.
+            acceptance_intents: Intents that indicate acceptance.
+            rejection_intents: Intents that indicate rejection.
 
         Returns:
             Ratio of accepted recommendation rounds to total rounds,
             or 0.0 if there are no recommendation rounds.
         """
-        rec = resolve_intents(recommendation_intent_labels, DEFAULT_REC_LABELS)
-        acc = resolve_intents(acceptance_intent_labels, DEFAULT_ACC_LABELS)
-        rej = resolve_intents(rejection_intent_labels, DEFAULT_REJ_LABELS)
-        rounds = get_recommendation_rounds(dialogue, rec)
+        ensure_dialogue_is_annotated(dialogue)
+        rounds = get_recommendation_rounds(dialogue, recommendation_intents)
         successful = sum(
-            1 for r in rounds if is_recommendation_accepted(r, acc, rej)
+            1
+            for r in rounds
+            if is_recommendation_accepted(
+                r, acceptance_intents, rejection_intents
+            )
         )
         return successful / len(rounds) if rounds else 0.0

From 3b068b4a779302bb6f2668311fb76b8b19ab9a68 Mon Sep 17 00:00:00 2001
From: Ksenia Blokhina <kseniablokhina@MacBook-Pro-Ksenia.local>
Date: Tue, 24 Mar 2026 11:55:03 +0300
Subject: [PATCH 34/38] resolve issues

---
 usersimcrs/evaluation/base_metric.py          |  48 ++++++++
 usersimcrs/evaluation/dialogue_annotation.py  | 107 +++---------------
 .../reward_per_dialogue_length_metric.py      |   2 +-
 ...ssful_recommendation_round_ratio_metric.py |   4 +-
 4 files changed, 65 insertions(+), 96 deletions(-)
 create mode 100644 usersimcrs/evaluation/base_metric.py

diff --git a/usersimcrs/evaluation/base_metric.py b/usersimcrs/evaluation/base_metric.py
new file mode 100644
index 00000000..c99399a2
--- /dev/null
+++ b/usersimcrs/evaluation/base_metric.py
@@ -0,0 +1,48 @@
+"""Abstract base class for dialogue evaluation metrics."""
+
+from abc import ABC, abstractmethod
+from typing import Any, Dict, List
+from dialoguekit.core.dialogue import Dialogue
+
+
+class BaseMetric(ABC):
+    def __init__(self, name: str) -> None:
+        """Initializes the metric.
+
+        Args:
+            name: Metric name.
+        """
+        self.name = name
+
+    @abstractmethod
+    def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float:
+        """Computes the metric for a single dialogue.
+
+        Args:
+            dialogue: Single dialogue to score.
+            **kwargs: Additional arguments specific to the metric.
+
+        Raises:
+            NotImplementedError: When not implemented by a subclass.
+
+        Returns:
+            Score for the dialogue.
+        """
+        raise NotImplementedError()
+
+    def evaluate_dialogues(
+        self, dialogues: List[Dialogue], **kwargs: Any
+    ) -> Dict[str, float]:
+        """Computes the metric for every dialogue in a given list.
+
+        Args:
+            dialogues: Dialogues.
+            **kwargs: Additional arguments specific to the metric.
+
+        Returns:
+            Dictionary with result per dialogue. Keys are conversation IDs.
+        """
+        return {
+            dialogue.conversation_id: self.evaluate_dialogue(dialogue, **kwargs)
+            for dialogue in dialogues
+        }
diff --git a/usersimcrs/evaluation/dialogue_annotation.py b/usersimcrs/evaluation/dialogue_annotation.py
index 0039594f..2cd879d5 100644
--- a/usersimcrs/evaluation/dialogue_annotation.py
+++ b/usersimcrs/evaluation/dialogue_annotation.py
@@ -5,7 +5,7 @@
 assessing recommendation acceptance.
 """
 
-from typing import Dict, List, Optional, Sequence, Tuple
+from typing import List
 
 from confuse import Configuration
 
@@ -14,80 +14,14 @@
 from dialoguekit.core.intent import Intent
 from dialoguekit.nlu.nlu import NLU
 from dialoguekit.participant.participant import DialogueParticipant
-
 from usersimcrs.utils.simulation_utils import get_NLU
 
 
-DEFAULT_REC_LABELS = ["REC-S", "REC-E"]
-DEFAULT_ACC_LABELS = ["ACC"]
-DEFAULT_REJ_LABELS = ["REJ"]
-
-_intent_cache: Dict[Tuple[str, ...], List[Intent]] = {}
-_nlu_cache: Dict[str, NLU] = {}
-
-
-def resolve_intents(
-    labels: Optional[Sequence[str]], defaults: List[str]
-) -> List[Intent]:
-    """Resolves optional label overrides to a cached list of Intents.
-
-    Args:
-        labels: Custom labels or None to use defaults.
-        defaults: Default label strings.
-
-    Returns:
-        Cached list of Intent objects.
-    """
-    key = tuple(labels if labels is not None else defaults)
-    if key not in _intent_cache:
-        _intent_cache[key] = [Intent(label) for label in key]
-    return _intent_cache[key]
-
-
-DEFAULT_REC_INTENTS: List[Intent] = resolve_intents(None, DEFAULT_REC_LABELS)
-DEFAULT_ACC_INTENTS: List[Intent] = resolve_intents(None, DEFAULT_ACC_LABELS)
-DEFAULT_REJ_INTENTS: List[Intent] = resolve_intents(None, DEFAULT_REJ_LABELS)
-
-
 def ensure_dialogue_is_annotated(dialogue: Dialogue) -> None:
-    """Raises if a dialogue is not annotated with annotated utterances."""
+    """Raises error if dialogue utterances are not annotated."""
     for utterance in dialogue.utterances:
         if not isinstance(utterance, AnnotatedUtterance):
-            raise ValueError(
-                "Dialogue must be annotated (utterances must be "
-                "`AnnotatedUtterance`)."
-            )
-
-
-def annotate_if_needed(
-    dialogue: Dialogue,
-    user_nlu_config_path: Optional[str] = None,
-    agent_nlu_config_path: Optional[str] = None,
-) -> None:
-    """Annotates the dialogue with NLU if config paths are provided.
-
-    NLU modules are loaded lazily and cached by config path.
-
-    Args:
-        dialogue: Dialogue to annotate.
-        user_nlu_config_path: Path to user NLU configuration.
-        agent_nlu_config_path: Path to agent NLU configuration.
-    """
-    if not user_nlu_config_path or not agent_nlu_config_path:
-        return
-    if user_nlu_config_path not in _nlu_cache:
-        _nlu_cache[user_nlu_config_path] = load_nlu(
-            user_nlu_config_path, "User NLU Configuration"
-        )
-    if agent_nlu_config_path not in _nlu_cache:
-        _nlu_cache[agent_nlu_config_path] = load_nlu(
-            agent_nlu_config_path, "Agent NLU Configuration"
-        )
-    annotate_dialogue(
-        dialogue,
-        _nlu_cache[user_nlu_config_path],
-        _nlu_cache[agent_nlu_config_path],
-    )
+            raise RuntimeError("Dialogue must be annotated.")
 
 
 def annotate_dialogue(
@@ -96,7 +30,7 @@ def annotate_dialogue(
     """Annotates utterances with dialogue acts.
 
     Each utterance that is not already an AnnotatedUtterance is converted to
-    one. Utterances that already carry dialogue acts are left untouched.
+      one. Utterances that already carry dialogue acts are left untouched.
 
     Args:
         dialogue: Dialogue to be annotated.
@@ -131,39 +65,26 @@ def annotate_dialogue(
     return dialogue
 
 
-def load_nlu(
-    nlu_config_path: str,
-    config_name: str = "NLU Configuration",
-) -> NLU:
-    """Loads a single NLU module from the given configuration file.
-
-    Args:
-        nlu_config_path: Path to the NLU configuration file.
-        config_name: Name for the Configuration instance. Defaults to
-            ``"NLU Configuration"``.
-
-    Returns:
-        NLU module.
-    """
-    nlu_config = Configuration(config_name)
-    nlu_config.set_file(nlu_config_path)
-    return get_NLU(nlu_config)
-
-
 def annotate_dialogues(
     dialogues: List[Dialogue],
     user_nlu_config_path: str,
     agent_nlu_config_path: str,
 ) -> None:
-    """Annotates a batch of dialogues in place, loading NLU modules once.
+    """Annotates dialogues in place using NLU modules loaded once.
 
     Args:
         dialogues: Dialogues to annotate (modified in place).
         user_nlu_config_path: Path to user NLU configuration file.
         agent_nlu_config_path: Path to agent NLU configuration file.
     """
-    user_nlu = load_nlu(user_nlu_config_path, "User NLU Configuration")
-    agent_nlu = load_nlu(agent_nlu_config_path, "Agent NLU Configuration")
+    user_nlu_config = Configuration("User NLU Configuration")
+    user_nlu_config.set_file(user_nlu_config_path)
+    user_nlu = get_NLU(user_nlu_config)
+
+    agent_nlu_config = Configuration("Agent NLU Configuration")
+    agent_nlu_config.set_file(agent_nlu_config_path)
+    agent_nlu = get_NLU(agent_nlu_config)
+
     for dialogue in dialogues:
         annotate_dialogue(dialogue, user_nlu, agent_nlu)
 
@@ -174,7 +95,7 @@ def get_recommendation_rounds(
     """Splits a dialogue into recommendation rounds.
 
     A new round begins each time an utterance contains a recommendation
-    intent.
+      intent.
 
     Args:
         dialogue: Annotated dialogue.
diff --git a/usersimcrs/evaluation/reward_per_dialogue_length_metric.py b/usersimcrs/evaluation/reward_per_dialogue_length_metric.py
index c08881b1..ecabb410 100644
--- a/usersimcrs/evaluation/reward_per_dialogue_length_metric.py
+++ b/usersimcrs/evaluation/reward_per_dialogue_length_metric.py
@@ -37,7 +37,7 @@ def evaluate_dialogue(
 
         Args:
             dialogue: Dialogue to evaluate.
-            acceptance_intents: Acceptance intents (e.g., ``[Intent("ACC")]``).
+            acceptance_intents: Acceptance intents.
 
         Returns:
             Ratio of accepted recommendations to total utterances.
diff --git a/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py b/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py
index ce2e9b1f..a544696d 100644
--- a/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py
+++ b/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py
@@ -26,7 +26,7 @@ def __init__(
 
         Args:
             name: Metric name. Defaults to
-                    "successful_recommendation_round_ratio".
+              "successful_recommendation_round_ratio".
         """
         super().__init__(name)
 
@@ -48,7 +48,7 @@ def evaluate_dialogue(
 
         Returns:
             Ratio of accepted recommendation rounds to total rounds,
-            or 0.0 if there are no recommendation rounds.
+              or 0.0 if there are no recommendation rounds.
         """
         ensure_dialogue_is_annotated(dialogue)
         rounds = get_recommendation_rounds(dialogue, recommendation_intents)

From 89446219dfa74d31a214e2e221a9edc143dbd39b Mon Sep 17 00:00:00 2001
From: Ksenia Blokhina <kseniablokhina@MacBook-Pro-Ksenia.local>
Date: Tue, 24 Mar 2026 12:37:39 +0300
Subject: [PATCH 35/38] fixes

---
 usersimcrs/evaluation/dialogue_annotation.py | 23 +++++---------------
 1 file changed, 6 insertions(+), 17 deletions(-)

diff --git a/usersimcrs/evaluation/dialogue_annotation.py b/usersimcrs/evaluation/dialogue_annotation.py
index 2cd879d5..58ffb2c9 100644
--- a/usersimcrs/evaluation/dialogue_annotation.py
+++ b/usersimcrs/evaluation/dialogue_annotation.py
@@ -7,14 +7,11 @@
 
 from typing import List
 
-from confuse import Configuration
-
 from dialoguekit.core.annotated_utterance import AnnotatedUtterance
 from dialoguekit.core.dialogue import Dialogue
 from dialoguekit.core.intent import Intent
 from dialoguekit.nlu.nlu import NLU
 from dialoguekit.participant.participant import DialogueParticipant
-from usersimcrs.utils.simulation_utils import get_NLU
 
 
 def ensure_dialogue_is_annotated(dialogue: Dialogue) -> None:
@@ -30,7 +27,7 @@ def annotate_dialogue(
     """Annotates utterances with dialogue acts.
 
     Each utterance that is not already an AnnotatedUtterance is converted to
-      one. Utterances that already carry dialogue acts are left untouched.
+    one. Utterances that already carry dialogue acts are left untouched.
 
     Args:
         dialogue: Dialogue to be annotated.
@@ -67,24 +64,16 @@ def annotate_dialogue(
 
 def annotate_dialogues(
     dialogues: List[Dialogue],
-    user_nlu_config_path: str,
-    agent_nlu_config_path: str,
+    user_nlu: NLU,
+    agent_nlu: NLU,
 ) -> None:
-    """Annotates dialogues in place using NLU modules loaded once.
+    """Annotates dialogues in place using provided NLU modules.
 
     Args:
         dialogues: Dialogues to annotate (modified in place).
-        user_nlu_config_path: Path to user NLU configuration file.
-        agent_nlu_config_path: Path to agent NLU configuration file.
+        user_nlu: NLU module for user utterances.
+        agent_nlu: NLU module for agent utterances.
     """
-    user_nlu_config = Configuration("User NLU Configuration")
-    user_nlu_config.set_file(user_nlu_config_path)
-    user_nlu = get_NLU(user_nlu_config)
-
-    agent_nlu_config = Configuration("Agent NLU Configuration")
-    agent_nlu_config.set_file(agent_nlu_config_path)
-    agent_nlu = get_NLU(agent_nlu_config)
-
     for dialogue in dialogues:
         annotate_dialogue(dialogue, user_nlu, agent_nlu)
 

From 8be81e42bf65d8f03e836f5bf9b8cf6ce571339a Mon Sep 17 00:00:00 2001
From: Ksenia Blokhina <kseniablokhina@MacBook-Pro-Ksenia.local>
Date: Tue, 24 Mar 2026 17:59:40 +0300
Subject: [PATCH 36/38] 234-create-main-evaluation-script add eval script

---
 config/default/config_evaluation.yaml |  32 +++
 usersimcrs/evaluation/main.py         | 299 ----------------------
 usersimcrs/run_evaluation.py          | 351 ++++++++++++++++++++++++++
 3 files changed, 383 insertions(+), 299 deletions(-)
 create mode 100644 config/default/config_evaluation.yaml
 delete mode 100644 usersimcrs/evaluation/main.py
 create mode 100644 usersimcrs/run_evaluation.py

diff --git a/config/default/config_evaluation.yaml b/config/default/config_evaluation.yaml
new file mode 100644
index 00000000..455fc3c0
--- /dev/null
+++ b/config/default/config_evaluation.yaml
@@ -0,0 +1,32 @@
+dialogues: data/datasets/moviebot/annotated_dialogues.json
+metrics:
+  - satisfaction
+  - success_rate
+  - successful_recommendation_round_ratio
+  - reward_per_dialogue_length
+output: data/evaluation/moviebot_non_quality_results.json
+
+quality_llm_interface:
+  llm_interface_class_path: "usersimcrs.llm_interfaces.ollama_interface.OllamaLLMInterface"
+  llm_interface_args:
+    configuration_path: config/llm_interface/config_ollama_default.yaml
+    default_response: ""
+quality_aspects:
+  - REC_RELEVANCE
+  - COM_STYLE
+  - FLUENCY
+  - CONV_FLOW
+  - OVERALL_SAT
+
+user_nlu_config: config/default/config_default.yaml
+agent_nlu_config: config/default/config_default.yaml
+
+recommendation_intent_labels:
+  - REVEAL
+  - REVEAL.SIMILAR
+  - REVEAL.NONE
+  - REVEAL.REVISE
+accept_intent_labels:
+  - NOTE.ACCEPT
+reject_intent_labels:
+  - NOTE.DISLIKE
\ No newline at end of file
diff --git a/usersimcrs/evaluation/main.py b/usersimcrs/evaluation/main.py
deleted file mode 100644
index fa1cf048..00000000
--- a/usersimcrs/evaluation/main.py
+++ /dev/null
@@ -1,299 +0,0 @@
-"""Unified script for evaluating dialogues with selected metrics."""
-
-import argparse
-import json
-import os
-from collections import defaultdict
-from statistics import mean, stdev
-from typing import Any, Dict, List, Mapping, Sequence
-
-from dialoguekit.core.dialogue import Dialogue
-from dialoguekit.nlu.models.satisfaction_classifier import (
-    SatisfactionClassifierSVM,
-)
-from dialoguekit.utils.dialogue_reader import json_to_dialogues
-
-from usersimcrs.evaluation.base_metric import BaseMetric
-from usersimcrs.evaluation.dialogue_annotation import annotate_dialogues
-from usersimcrs.evaluation.quality_metric import QualityMetric
-from usersimcrs.evaluation.quality_rubrics import QualityRubrics
-from usersimcrs.evaluation.satisfaction_metric import SatisfactionMetric
-from usersimcrs.evaluation.reward_per_dialogue_length_metric import (
-    RewardPerDialogueLengthMetric,
-)
-from usersimcrs.evaluation.success_rate_metric import SuccessRateMetric
-from usersimcrs.evaluation.successful_recommendation_round_ratio_metric import (
-    SuccessfulRecommendationRoundRatioMetric,
-)
-from usersimcrs.llm_interfaces.ollama_interface import OllamaLLMInterface
-
-UTILITY_METRICS = {
-    "success_rate",
-    "successful_recommendation_round_ratio",
-    "reward_per_dialogue_length",
-}
-
-SUPPORTED_METRICS = [
-    "quality",
-    "satisfaction",
-    "success_rate",
-    "successful_recommendation_round_ratio",
-    "reward_per_dialogue_length",
-]
-
-
-def parse_args() -> argparse.Namespace:
-    """Parses command-line arguments."""
-    parser = argparse.ArgumentParser(prog="usersimcrs.evaluation.main")
-    parser.add_argument(
-        "--dialogues",
-        type=str,
-        required=True,
-        help="Path to the dialogues JSON file.",
-    )
-    parser.add_argument(
-        "--metrics",
-        nargs="+",
-        required=True,
-        choices=SUPPORTED_METRICS,
-        help="List of metrics to compute.",
-    )
-    parser.add_argument(
-        "--output",
-        type=str,
-        required=True,
-        help="Path to save evaluation results as JSON.",
-    )
-    parser.add_argument(
-        "--ollama_config",
-        type=str,
-        help="Path to Ollama config file (required when quality is selected).",
-    )
-    parser.add_argument(
-        "--quality_aspects",
-        nargs="+",
-        default=[aspect.name for aspect in QualityRubrics],
-        help=(
-            "Quality aspects to evaluate. "
-            "Defaults to all aspects in QualityRubrics."
-        ),
-    )
-    parser.add_argument(
-        "--user_nlu_config",
-        type=str,
-        help=(
-            "Path to user NLU config (required for utility metrics: "
-            "success_rate, successful_recommendation_round_ratio, "
-            "reward_per_dialogue_length)."
-        ),
-    )
-    parser.add_argument(
-        "--agent_nlu_config",
-        type=str,
-        help=(
-            "Path to agent NLU config (required for utility metrics: "
-            "success_rate, successful_recommendation_round_ratio, "
-            "reward_per_dialogue_length)."
-        ),
-    )
-    parser.add_argument(
-        "--reject_intent_labels",
-        nargs="+",
-        default=["REJ"],
-        help="Intent labels corresponding to rejection.",
-    )
-    parser.add_argument(
-        "--accept_intent_labels",
-        nargs="+",
-        default=["ACC"],
-        help="Intent labels corresponding to acceptance.",
-    )
-    parser.add_argument(
-        "--recommendation_intent_labels",
-        nargs="+",
-        default=["REC-S", "REC-E"],
-        help="Intent labels corresponding to recommendation.",
-    )
-    return parser.parse_args()
-
-
-def _validate_args(args: argparse.Namespace) -> None:
-    """Validates metric-specific CLI requirements."""
-    if "quality" in args.metrics and not args.ollama_config:
-        raise ValueError(
-            "The --ollama_config argument is required when using quality."
-        )
-
-    if UTILITY_METRICS.intersection(set(args.metrics)):
-        if not args.user_nlu_config or not args.agent_nlu_config:
-            raise ValueError(
-                "Both --user_nlu_config and --agent_nlu_config are required "
-                "for utility metrics."
-            )
-
-    supported_aspect_names = [aspect.name for aspect in QualityRubrics]
-    invalid_aspects = [
-        aspect
-        for aspect in args.quality_aspects
-        if aspect not in supported_aspect_names
-    ]
-    if invalid_aspects:
-        raise ValueError(
-            f"Unknown quality aspect(s): {invalid_aspects}. "
-            f"Supported aspects: {supported_aspect_names}"
-        )
-
-
-def _build_metric_registry(args: argparse.Namespace) -> Dict[str, BaseMetric]:
-    """Builds metric instances keyed by metric name."""
-    registry: Dict[str, BaseMetric] = {}
-    if "quality" in args.metrics:
-        llm_interface = OllamaLLMInterface(
-            configuration_path=args.ollama_config,
-            default_response="",
-        )
-        registry["quality"] = QualityMetric(llm_interface=llm_interface)
-    if "satisfaction" in args.metrics:
-        registry["satisfaction"] = SatisfactionMetric(
-            classifier=SatisfactionClassifierSVM()
-        )
-    if "success_rate" in args.metrics:
-        registry["success_rate"] = SuccessRateMetric()
-    if "successful_recommendation_round_ratio" in args.metrics:
-        registry[
-            "successful_recommendation_round_ratio"
-        ] = SuccessfulRecommendationRoundRatioMetric()
-    if "reward_per_dialogue_length" in args.metrics:
-        registry["reward_per_dialogue_length"] = RewardPerDialogueLengthMetric()
-    return registry
-
-
-def _summarize_by_agent(
-    dialogues: Sequence[Dialogue], scores: Mapping[str, float]
-) -> Dict[str, Dict[str, float]]:
-    """Returns aggregate statistics by agent."""
-    conversation_to_agent = {
-        dialogue.conversation_id: dialogue.agent_id for dialogue in dialogues
-    }
-    grouped_scores: Dict[str, List[float]] = defaultdict(list)
-    for conversation_id, score in scores.items():
-        agent_id = conversation_to_agent.get(conversation_id, "unknown")
-        grouped_scores[agent_id].append(score)
-
-    summary: Dict[str, Dict[str, float]] = {}
-    for agent_id, agent_scores in grouped_scores.items():
-        summary[agent_id] = {
-            "count": float(len(agent_scores)),
-            "min": min(agent_scores),
-            "max": max(agent_scores),
-            "mean": mean(agent_scores),
-            "stdev": stdev(agent_scores) if len(agent_scores) > 1 else 0.0,
-        }
-    return summary
-
-
-def _evaluate_metric(
-    metric_name: str,
-    metric: BaseMetric,
-    dialogues: Sequence[Dialogue],
-    args: argparse.Namespace,
-) -> Dict[str, object]:
-    """Runs one metric and returns per-dialogue scores and summary."""
-    if metric_name == "quality":
-        per_aspect: Dict[str, Dict[str, Any]] = {}
-        for aspect in args.quality_aspects:
-            per_dialogue = metric.evaluate_dialogues(
-                list(dialogues),
-                aspect=aspect,
-            )
-            per_aspect[aspect] = {
-                "per_dialogue": per_dialogue,
-                "summary_by_agent": _summarize_by_agent(
-                    dialogues, per_dialogue
-                ),
-            }
-        return {"aspects": per_aspect}
-
-    eval_kwargs = {}
-    if metric_name in UTILITY_METRICS:
-        eval_kwargs = {
-            "recommendation_intent_labels": args.recommendation_intent_labels,
-            "acceptance_intent_labels": args.accept_intent_labels,
-            "rejection_intent_labels": args.reject_intent_labels,
-        }
-
-    per_dialogue_scores = metric.evaluate_dialogues(
-        list(dialogues), **eval_kwargs
-    )
-    return {
-        "per_dialogue": per_dialogue_scores,
-        "summary_by_agent": _summarize_by_agent(dialogues, per_dialogue_scores),
-    }
-
-
-def _print_brief_summary(results: Mapping[str, object]) -> None:
-    """Prints a concise summary in the terminal."""
-    metric_results = results.get("metrics", {})
-    if not isinstance(metric_results, dict):
-        return
-    for metric_name, metric_result in metric_results.items():
-        print(f"Metric: {metric_name}")
-        if metric_name == "quality":
-            aspects = metric_result.get("aspects", {})
-            for aspect_name, aspect_result in aspects.items():
-                print(f"  Aspect: {aspect_name}")
-                for agent_id, stats in aspect_result[
-                    "summary_by_agent"
-                ].items():
-                    print(
-                        f"    Agent: {agent_id} | mean={stats['mean']:.3f} "
-                        f"stdev={stats['stdev']:.3f}"
-                    )
-            continue
-
-        for agent_id, stats in metric_result["summary_by_agent"].items():
-            print(
-                f"  Agent: {agent_id} | mean={stats['mean']:.3f} "
-                f"stdev={stats['stdev']:.3f}"
-            )
-
-
-def main() -> None:
-    args = parse_args()
-    _validate_args(args)
-
-    dialogues = json_to_dialogues(args.dialogues)
-
-    if UTILITY_METRICS.intersection(set(args.metrics)):
-        annotate_dialogues(
-            dialogues, args.user_nlu_config, args.agent_nlu_config
-        )
-
-    metric_registry = _build_metric_registry(args)
-
-    results: Dict[str, Any] = {
-        "dialogues_path": args.dialogues,
-        "metrics_requested": args.metrics,
-        "metrics": {},
-    }
-
-    for metric_name in args.metrics:
-        metric = metric_registry[metric_name]
-        results["metrics"][metric_name] = _evaluate_metric(
-            metric_name,
-            metric,
-            dialogues,
-            args,
-        )
-
-    output_dir = os.path.dirname(args.output)
-    if output_dir:
-        os.makedirs(output_dir, exist_ok=True)
-    with open(args.output, "w") as f:
-        json.dump(results, f, indent=2)
-
-    _print_brief_summary(results)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/usersimcrs/run_evaluation.py b/usersimcrs/run_evaluation.py
new file mode 100644
index 00000000..706de52a
--- /dev/null
+++ b/usersimcrs/run_evaluation.py
@@ -0,0 +1,351 @@
+"""Console application for running evaluation."""
+
+import argparse
+import json
+import os
+from collections import defaultdict
+from statistics import mean, stdev
+from typing import Any, Dict, List, Mapping, Sequence
+
+import confuse
+from dialoguekit.core.intent import Intent
+from dialoguekit.nlu.models.satisfaction_classifier import (
+    SatisfactionClassifierSVM,
+)
+from dialoguekit.utils.dialogue_reader import json_to_dialogues
+
+from usersimcrs.evaluation.dialogue_annotation import annotate_dialogues
+from usersimcrs.evaluation.quality_metric import QualityMetric
+from usersimcrs.evaluation.quality_rubrics import QualityRubrics
+from usersimcrs.evaluation.reward_per_dialogue_length_metric import (
+    RewardPerDialogueLengthMetric,
+)
+from usersimcrs.evaluation.satisfaction_metric import SatisfactionMetric
+from usersimcrs.evaluation.success_rate_metric import SuccessRateMetric
+from usersimcrs.evaluation.successful_recommendation_round_ratio_metric import (
+    SuccessfulRecommendationRoundRatioMetric,
+)
+from usersimcrs.utils.simulation_utils import get_NLU, get_llm_interface
+
+DEFAULT_CONFIG_PATH = "config/default/config_evaluation.yaml"
+UTILITY_METRICS = {
+    "success_rate",
+    "successful_recommendation_round_ratio",
+    "reward_per_dialogue_length",
+}
+SUPPORTED_METRICS = [
+    "quality",
+    "satisfaction",
+    "success_rate",
+    "successful_recommendation_round_ratio",
+    "reward_per_dialogue_length",
+]
+
+
+def parse_args() -> argparse.Namespace:
+    """Defines accepted arguments and returns the parsed values."""
+    parser = argparse.ArgumentParser(prog="run_evaluation.py")
+    parser.add_argument(
+        "-c",
+        "--config-file",
+        help=(
+            "Path to configuration file to overwrite default values. "
+            "Defaults to None."
+        ),
+    )
+    parser.add_argument("--dialogues", type=str, help="Dialogues JSON file.")
+    parser.add_argument(
+        "--metrics",
+        nargs="+",
+        choices=SUPPORTED_METRICS,
+        help="Metrics to compute.",
+    )
+    parser.add_argument(
+        "--output",
+        type=str,
+        help="Path to save evaluation results as JSON.",
+    )
+    parser.add_argument(
+        "--quality_aspects",
+        nargs="+",
+        help="Quality aspects to evaluate.",
+    )
+    parser.add_argument(
+        "--user_nlu_config",
+        type=str,
+        help="User NLU configuration file.",
+    )
+    parser.add_argument(
+        "--agent_nlu_config",
+        type=str,
+        help="Agent NLU configuration file.",
+    )
+    parser.add_argument(
+        "--reject_intent_labels",
+        nargs="+",
+        help="Intent labels corresponding to rejection.",
+    )
+    parser.add_argument(
+        "--accept_intent_labels",
+        nargs="+",
+        help="Intent labels corresponding to acceptance.",
+    )
+    parser.add_argument(
+        "--recommendation_intent_labels",
+        nargs="+",
+        help="Intent labels corresponding to recommendation.",
+    )
+    parser.add_argument(
+        "-d",
+        "--debug",
+        action="store_const",
+        const=True,
+        help="Debug mode.",
+    )
+    return parser.parse_args()
+
+
+def load_config(args: argparse.Namespace) -> confuse.Configuration:
+    """Loads config from default file, custom file, and CLI overrides."""
+    config = confuse.Configuration("usersimcrs")
+    config.set_file(DEFAULT_CONFIG_PATH)
+    if args.config_file:
+        config.set_file(args.config_file)
+    config.set_args(args, dots=True)
+    return config
+
+
+def validate_config(config: confuse.Configuration) -> List[str]:
+    """Validates evaluation config and returns quality aspects."""
+    metrics = config["metrics"].get()
+    if "quality" in metrics and "quality_llm_interface" not in config:
+        raise ValueError("Quality evaluation requires `quality_llm_interface`.")
+
+    quality_aspects = config["quality_aspects"].get()
+    supported_aspects = [aspect.name for aspect in QualityRubrics]
+    invalid_aspects = [
+        aspect for aspect in quality_aspects if aspect not in supported_aspects
+    ]
+    if invalid_aspects:
+        raise ValueError(
+            f"Unknown quality aspect(s): {invalid_aspects}. "
+            f"Supported aspects: {supported_aspects}"
+        )
+
+    if UTILITY_METRICS.intersection(set(metrics)):
+        if not config["user_nlu_config"].get(None):
+            raise ValueError(
+                "`user_nlu_config` is required for utility metrics."
+            )
+        if not config["agent_nlu_config"].get(None):
+            raise ValueError(
+                "`agent_nlu_config` is required for utility metrics."
+            )
+
+    return quality_aspects
+
+
+def load_nlu(config_path: str, name: str) -> Any:
+    """Loads one NLU component from a config path."""
+    nlu_config = confuse.Configuration(name)
+    nlu_config.set_file(config_path)
+    return get_NLU(nlu_config)
+
+
+def annotate_for_utility(
+    dialogues: List[Any], config: confuse.Configuration, metrics: Sequence[str]
+) -> None:
+    """Annotates dialogues when utility metrics are requested."""
+    if not UTILITY_METRICS.intersection(set(metrics)):
+        return
+
+    user_nlu = load_nlu(
+        config["user_nlu_config"].get(), "User NLU Configuration"
+    )
+    agent_nlu = load_nlu(
+        config["agent_nlu_config"].get(), "Agent NLU Configuration"
+    )
+    annotate_dialogues(dialogues, user_nlu, agent_nlu)
+
+
+def get_summary_by_agent(
+    dialogues: Sequence[Any], scores: Mapping[str, float]
+) -> Dict[str, Dict[str, float]]:
+    """Aggregates metric scores by agent."""
+    grouped_scores: Dict[str, List[float]] = defaultdict(list)
+    for dialogue in dialogues:
+        grouped_scores[dialogue.agent_id].append(
+            scores[dialogue.conversation_id]
+        )
+
+    return {
+        agent_id: {
+            "count": len(agent_scores),
+            "min": min(agent_scores),
+            "max": max(agent_scores),
+            "mean": mean(agent_scores),
+            "stdev": stdev(agent_scores) if len(agent_scores) > 1 else 0.0,
+        }
+        for agent_id, agent_scores in grouped_scores.items()
+    }
+
+
+def get_utility_intents(
+    config: confuse.Configuration,
+) -> Dict[str, List[Intent]]:
+    """Builds intent lists used by utility metrics."""
+    return {
+        "recommendation_intents": [
+            Intent(label)
+            for label in config["recommendation_intent_labels"].get()
+        ],
+        "acceptance_intents": [
+            Intent(label) for label in config["accept_intent_labels"].get()
+        ],
+        "rejection_intents": [
+            Intent(label) for label in config["reject_intent_labels"].get()
+        ],
+    }
+
+
+def build_metric_registry(
+    config: confuse.Configuration, metrics: Sequence[str]
+) -> Dict[str, Any]:
+    """Builds metric instances."""
+    registry: Dict[str, Any] = {}
+    if "quality" in metrics:
+        registry["quality"] = QualityMetric(
+            llm_interface=get_llm_interface(
+                config["quality_llm_interface"].get()
+            )
+        )
+    if "satisfaction" in metrics:
+        registry["satisfaction"] = SatisfactionMetric(
+            classifier=SatisfactionClassifierSVM()
+        )
+    if "success_rate" in metrics:
+        registry["success_rate"] = SuccessRateMetric()
+    if "successful_recommendation_round_ratio" in metrics:
+        registry[
+            "successful_recommendation_round_ratio"
+        ] = SuccessfulRecommendationRoundRatioMetric()
+    if "reward_per_dialogue_length" in metrics:
+        registry["reward_per_dialogue_length"] = RewardPerDialogueLengthMetric()
+    return registry
+
+
+def evaluate_metric(
+    metric_name: str,
+    metric: Any,
+    dialogues: List[Any],
+    quality_aspects: Sequence[str],
+    utility_intents: Dict[str, List[Intent]],
+) -> Dict[str, Any]:
+    """Evaluates one metric and returns serialized results."""
+    if metric_name == "quality":
+        return {
+            "aspects": {
+                aspect: {
+                    "per_dialogue": scores,
+                    "summary_by_agent": get_summary_by_agent(dialogues, scores),
+                }
+                for aspect in quality_aspects
+                for scores in [
+                    metric.evaluate_dialogues(dialogues, aspect=aspect)
+                ]
+            }
+        }
+
+    if metric_name in {
+        "success_rate",
+        "successful_recommendation_round_ratio",
+    }:
+        scores = metric.evaluate_dialogues(dialogues, **utility_intents)
+    elif metric_name == "reward_per_dialogue_length":
+        scores = metric.evaluate_dialogues(
+            dialogues,
+            acceptance_intents=utility_intents["acceptance_intents"],
+        )
+    else:
+        scores = metric.evaluate_dialogues(dialogues)
+
+    return {
+        "per_dialogue": scores,
+        "summary_by_agent": get_summary_by_agent(dialogues, scores),
+    }
+
+
+def save_results(
+    config: confuse.Configuration, results: Dict[str, Any]
+) -> None:
+    """Writes config dump and evaluation results to disk."""
+    output_path = config["output"].get()
+    output_dir = os.path.dirname(output_path)
+    if output_dir:
+        os.makedirs(output_dir, exist_ok=True)
+
+    output_stem, _ = os.path.splitext(output_path)
+    with open(f"{output_stem}.meta.yaml", "w") as f:
+        f.write(config.dump())
+
+    with open(output_path, "w") as f:
+        json.dump(results, f, indent=2)
+
+
+def print_summary(results: Mapping[str, Any]) -> None:
+    """Prints a concise terminal summary."""
+    for metric_name, metric_result in results["metrics"].items():
+        print(f"Metric: {metric_name}")
+        if metric_name == "quality":
+            for aspect_name, aspect_result in metric_result["aspects"].items():
+                print(f"  Aspect: {aspect_name}")
+                for agent_id, stats in aspect_result[
+                    "summary_by_agent"
+                ].items():
+                    print(
+                        f"    Agent: {agent_id} | mean={stats['mean']:.3f} "
+                        f"stdev={stats['stdev']:.3f}"
+                    )
+            continue
+
+        for agent_id, stats in metric_result["summary_by_agent"].items():
+            print(
+                f"  Agent: {agent_id} | mean={stats['mean']:.3f} "
+                f"stdev={stats['stdev']:.3f}"
+            )
+
+
+def main() -> None:
+    """Runs evaluation based on the resolved configuration."""
+    args = parse_args()
+    config = load_config(args)
+
+    metrics = config["metrics"].get()
+    quality_aspects = validate_config(config)
+    dialogues = json_to_dialogues(config["dialogues"].get())
+    annotate_for_utility(dialogues, config, metrics)
+
+    utility_intents = get_utility_intents(config)
+    metric_registry = build_metric_registry(config, metrics)
+
+    results: Dict[str, Any] = {
+        "dialogues_path": config["dialogues"].get(),
+        "metrics_requested": metrics,
+        "metrics": {},
+    }
+
+    for metric_name in metrics:
+        results["metrics"][metric_name] = evaluate_metric(
+            metric_name,
+            metric_registry[metric_name],
+            dialogues,
+            quality_aspects,
+            utility_intents,
+        )
+
+    save_results(config, results)
+    print_summary(results)
+
+
+if __name__ == "__main__":
+    main()

From b3cd18fa34457b6574c7c774951eb7b82c42f232 Mon Sep 17 00:00:00 2001
From: Ksenia Blokhina <kseniablokhina@MacBook-Pro-Ksenia.local>
Date: Tue, 14 Apr 2026 17:04:14 +0200
Subject: [PATCH 37/38] fixes

---
 config/default/config_evaluation.yaml |  11 +-
 usersimcrs/run_evaluation.py          | 225 ++++++++++++++++----------
 2 files changed, 151 insertions(+), 85 deletions(-)

diff --git a/config/default/config_evaluation.yaml b/config/default/config_evaluation.yaml
index 455fc3c0..74a60df1 100644
--- a/config/default/config_evaluation.yaml
+++ b/config/default/config_evaluation.yaml
@@ -4,7 +4,7 @@ metrics:
   - success_rate
   - successful_recommendation_round_ratio
   - reward_per_dialogue_length
-output: data/evaluation/moviebot_non_quality_results.json
+output: data/evaluation/moviebot_non_quality_results
 
 quality_llm_interface:
   llm_interface_class_path: "usersimcrs.llm_interfaces.ollama_interface.OllamaLLMInterface"
@@ -18,8 +18,11 @@ quality_aspects:
   - CONV_FLOW
   - OVERALL_SAT
 
-user_nlu_config: config/default/config_default.yaml
-agent_nlu_config: config/default/config_default.yaml
+annotate_dialogues: False
+user_nlu:
+  type: "cosine"
+agent_nlu:
+  type: "cosine"
 
 recommendation_intent_labels:
   - REVEAL
@@ -29,4 +32,4 @@ recommendation_intent_labels:
 accept_intent_labels:
   - NOTE.ACCEPT
 reject_intent_labels:
-  - NOTE.DISLIKE
\ No newline at end of file
+  - NOTE.DISLIKE
diff --git a/usersimcrs/run_evaluation.py b/usersimcrs/run_evaluation.py
index 706de52a..1e1316b3 100644
--- a/usersimcrs/run_evaluation.py
+++ b/usersimcrs/run_evaluation.py
@@ -5,15 +5,17 @@
 import os
 from collections import defaultdict
 from statistics import mean, stdev
-from typing import Any, Dict, List, Mapping, Sequence
+from typing import Any, Dict, List
 
 import confuse
+from dialoguekit.core.dialogue import Dialogue
 from dialoguekit.core.intent import Intent
 from dialoguekit.nlu.models.satisfaction_classifier import (
     SatisfactionClassifierSVM,
 )
 from dialoguekit.utils.dialogue_reader import json_to_dialogues
 
+from usersimcrs.evaluation.base_metric import BaseMetric
 from usersimcrs.evaluation.dialogue_annotation import annotate_dialogues
 from usersimcrs.evaluation.quality_metric import QualityMetric
 from usersimcrs.evaluation.quality_rubrics import QualityRubrics
@@ -28,11 +30,6 @@
 from usersimcrs.utils.simulation_utils import get_NLU, get_llm_interface
 
 DEFAULT_CONFIG_PATH = "config/default/config_evaluation.yaml"
-UTILITY_METRICS = {
-    "success_rate",
-    "successful_recommendation_round_ratio",
-    "reward_per_dialogue_length",
-}
 SUPPORTED_METRICS = [
     "quality",
     "satisfaction",
@@ -43,7 +40,11 @@
 
 
 def parse_args() -> argparse.Namespace:
-    """Defines accepted arguments and returns the parsed values."""
+    """Defines accepted arguments and returns the parsed values.
+
+    Returns:
+        Parsed command-line arguments.
+    """
     parser = argparse.ArgumentParser(prog="run_evaluation.py")
     parser.add_argument(
         "-c",
@@ -63,7 +64,7 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument(
         "--output",
         type=str,
-        help="Path to save evaluation results as JSON.",
+        help="Directory to save evaluation results and metadata.",
     )
     parser.add_argument(
         "--quality_aspects",
@@ -71,14 +72,10 @@ def parse_args() -> argparse.Namespace:
         help="Quality aspects to evaluate.",
     )
     parser.add_argument(
-        "--user_nlu_config",
-        type=str,
-        help="User NLU configuration file.",
-    )
-    parser.add_argument(
-        "--agent_nlu_config",
-        type=str,
-        help="Agent NLU configuration file.",
+        "--annotate_dialogues",
+        action="store_const",
+        const=True,
+        help="Annotate dialogues before computing metrics.",
     )
     parser.add_argument(
         "--reject_intent_labels",
@@ -106,17 +103,43 @@ def parse_args() -> argparse.Namespace:
 
 
 def load_config(args: argparse.Namespace) -> confuse.Configuration:
-    """Loads config from default file, custom file, and CLI overrides."""
+    """Loads config from default file, custom file, and CLI overrides.
+
+    Args:
+        args: Arguments parsed with argparse.
+
+    Returns:
+        Resolved evaluation configuration.
+    """
     config = confuse.Configuration("usersimcrs")
     config.set_file(DEFAULT_CONFIG_PATH)
     if args.config_file:
         config.set_file(args.config_file)
     config.set_args(args, dots=True)
+
+    output_dir = config["output"].get()
+    output_stem, output_extension = os.path.splitext(output_dir)
+    if output_extension:
+        output_dir = output_stem
+    os.makedirs(output_dir, exist_ok=True)
+    with open(os.path.join(output_dir, "config.meta.yaml"), "w") as f:
+        f.write(config.dump())
+
     return config
 
 
-def validate_config(config: confuse.Configuration) -> List[str]:
-    """Validates evaluation config and returns quality aspects."""
+def validate_config(config: confuse.Configuration) -> None:
+    """Validates evaluation config.
+
+    Args:
+        config: Configuration generated from YAML configuration file.
+
+    Raises:
+        ValueError: If quality evaluation is requested without an LLM
+            interface, if an unknown quality aspect is configured, or if
+            dialogue annotation is requested without user and agent NLU
+            sections.
+    """
     metrics = config["metrics"].get()
     if "quality" in metrics and "quality_llm_interface" not in config:
         raise ValueError("Quality evaluation requires `quality_llm_interface`.")
@@ -132,46 +155,69 @@ def validate_config(config: confuse.Configuration) -> List[str]:
             f"Supported aspects: {supported_aspects}"
         )
 
-    if UTILITY_METRICS.intersection(set(metrics)):
-        if not config["user_nlu_config"].get(None):
+    if config["annotate_dialogues"].get():
+        if not config["user_nlu"].get(None):
             raise ValueError(
-                "`user_nlu_config` is required for utility metrics."
+                "`user_nlu` is required when `annotate_dialogues` is True."
             )
-        if not config["agent_nlu_config"].get(None):
+        if not config["agent_nlu"].get(None):
             raise ValueError(
-                "`agent_nlu_config` is required for utility metrics."
+                "`agent_nlu` is required when `annotate_dialogues` is True."
             )
 
-    return quality_aspects
 
+def load_nlu(
+    config: confuse.Configuration, nlu_config_key: str, name: str
+) -> Any:
+    """Loads one NLU component from an evaluation config section.
 
-def load_nlu(config_path: str, name: str) -> Any:
-    """Loads one NLU component from a config path."""
+    Args:
+        config: Evaluation configuration.
+        nlu_config_key: Name of the NLU section to load.
+        name: Name for the temporary NLU configuration.
+
+    Returns:
+        NLU component.
+    """
     nlu_config = confuse.Configuration(name)
-    nlu_config.set_file(config_path)
+    nlu_config.set(
+        {
+            "dialogues": config["dialogues"].get(),
+            "nlu": config[nlu_config_key].get(),
+        }
+    )
     return get_NLU(nlu_config)
 
 
-def annotate_for_utility(
-    dialogues: List[Any], config: confuse.Configuration, metrics: Sequence[str]
+def annotate_for_metrics(
+    dialogues: List[Dialogue], config: confuse.Configuration
 ) -> None:
-    """Annotates dialogues when utility metrics are requested."""
-    if not UTILITY_METRICS.intersection(set(metrics)):
+    """Annotates dialogues when requested by configuration.
+
+    Args:
+        dialogues: Dialogues to annotate in place.
+        config: Evaluation configuration.
+    """
+    if not config["annotate_dialogues"].get():
         return
 
-    user_nlu = load_nlu(
-        config["user_nlu_config"].get(), "User NLU Configuration"
-    )
-    agent_nlu = load_nlu(
-        config["agent_nlu_config"].get(), "Agent NLU Configuration"
-    )
+    user_nlu = load_nlu(config, "user_nlu", "User NLU Configuration")
+    agent_nlu = load_nlu(config, "agent_nlu", "Agent NLU Configuration")
     annotate_dialogues(dialogues, user_nlu, agent_nlu)
 
 
 def get_summary_by_agent(
-    dialogues: Sequence[Any], scores: Mapping[str, float]
+    dialogues: List[Dialogue], scores: Dict[str, float]
 ) -> Dict[str, Dict[str, float]]:
-    """Aggregates metric scores by agent."""
+    """Aggregates metric scores by agent.
+
+    Args:
+        dialogues: Evaluated dialogues.
+        scores: Per-dialogue scores keyed by conversation ID.
+
+    Returns:
+        Descriptive score statistics keyed by agent ID.
+    """
     grouped_scores: Dict[str, List[float]] = defaultdict(list)
     for dialogue in dialogues:
         grouped_scores[dialogue.agent_id].append(
@@ -193,7 +239,14 @@ def get_summary_by_agent(
 def get_utility_intents(
     config: confuse.Configuration,
 ) -> Dict[str, List[Intent]]:
-    """Builds intent lists used by utility metrics."""
+    """Builds intent lists used by utility metrics.
+
+    Args:
+        config: Evaluation configuration.
+
+    Returns:
+        Utility intent lists keyed by metric argument name.
+    """
     return {
         "recommendation_intents": [
             Intent(label)
@@ -209,10 +262,18 @@ def get_utility_intents(
 
 
 def build_metric_registry(
-    config: confuse.Configuration, metrics: Sequence[str]
-) -> Dict[str, Any]:
-    """Builds metric instances."""
-    registry: Dict[str, Any] = {}
+    config: confuse.Configuration, metrics: List[str]
+) -> Dict[str, BaseMetric]:
+    """Builds metric instances.
+
+    Args:
+        config: Evaluation configuration.
+        metrics: Names of metrics to evaluate.
+
+    Returns:
+        Metric instances keyed by metric name.
+    """
+    registry: Dict[str, BaseMetric] = {}
     if "quality" in metrics:
         registry["quality"] = QualityMetric(
             llm_interface=get_llm_interface(
@@ -236,25 +297,32 @@ def build_metric_registry(
 
 def evaluate_metric(
     metric_name: str,
-    metric: Any,
-    dialogues: List[Any],
-    quality_aspects: Sequence[str],
+    metric: BaseMetric,
+    dialogues: List[Dialogue],
+    quality_aspects: List[str],
     utility_intents: Dict[str, List[Intent]],
 ) -> Dict[str, Any]:
-    """Evaluates one metric and returns serialized results."""
+    """Evaluates one metric and returns serialized results.
+
+    Args:
+        metric_name: Name of the metric to evaluate.
+        metric: Metric instance.
+        dialogues: Dialogues to evaluate.
+        quality_aspects: Quality aspects to evaluate for quality metrics.
+        utility_intents: Utility intent arguments.
+
+    Returns:
+        Serialized metric result.
+    """
     if metric_name == "quality":
-        return {
-            "aspects": {
-                aspect: {
-                    "per_dialogue": scores,
-                    "summary_by_agent": get_summary_by_agent(dialogues, scores),
-                }
-                for aspect in quality_aspects
-                for scores in [
-                    metric.evaluate_dialogues(dialogues, aspect=aspect)
-                ]
+        aspect_results = {}
+        for aspect in quality_aspects:
+            scores = metric.evaluate_dialogues(dialogues, aspect=aspect)
+            aspect_results[aspect] = {
+                "per_dialogue": scores,
+                "summary_by_agent": get_summary_by_agent(dialogues, scores),
             }
-        }
+        return {"aspects": aspect_results}
 
     if metric_name in {
         "success_rate",
@@ -275,25 +343,12 @@ def evaluate_metric(
     }
 
 
-def save_results(
-    config: confuse.Configuration, results: Dict[str, Any]
-) -> None:
-    """Writes config dump and evaluation results to disk."""
-    output_path = config["output"].get()
-    output_dir = os.path.dirname(output_path)
-    if output_dir:
-        os.makedirs(output_dir, exist_ok=True)
-
-    output_stem, _ = os.path.splitext(output_path)
-    with open(f"{output_stem}.meta.yaml", "w") as f:
-        f.write(config.dump())
-
-    with open(output_path, "w") as f:
-        json.dump(results, f, indent=2)
-
+def print_summary(results: Dict[str, Any]) -> None:
+    """Prints a concise terminal summary.
 
-def print_summary(results: Mapping[str, Any]) -> None:
-    """Prints a concise terminal summary."""
+    Args:
+        results: Serialized evaluation results.
+    """
     for metric_name, metric_result in results["metrics"].items():
         print(f"Metric: {metric_name}")
         if metric_name == "quality":
@@ -321,9 +376,10 @@ def main() -> None:
     config = load_config(args)
 
     metrics = config["metrics"].get()
-    quality_aspects = validate_config(config)
+    validate_config(config)
+    quality_aspects = config["quality_aspects"].get()
     dialogues = json_to_dialogues(config["dialogues"].get())
-    annotate_for_utility(dialogues, config, metrics)
+    annotate_for_metrics(dialogues, config)
 
     utility_intents = get_utility_intents(config)
     metric_registry = build_metric_registry(config, metrics)
@@ -343,7 +399,14 @@ def main() -> None:
             utility_intents,
         )
 
-    save_results(config, results)
+    output_dir = config["output"].get()
+    output_stem, output_extension = os.path.splitext(output_dir)
+    if output_extension:
+        output_dir = output_stem
+
+    with open(os.path.join(output_dir, "results.json"), "w") as f:
+        json.dump(results, f, indent=2)
+
     print_summary(results)
 
 

From 938ccac8208ca61598b9eb7571821a813f806aae Mon Sep 17 00:00:00 2001
From: Ksenia Blokhina <kseniablokhina@MacBook-Pro-Ksenia.local>
Date: Tue, 21 Apr 2026 14:24:21 +0200
Subject: [PATCH 38/38] fix evaluation: rename output to output_dir, inline helpers

---
 config/default/config_evaluation.yaml |   3 +-
 usersimcrs/run_evaluation.py          | 119 +++++++++-----------------
 usersimcrs/utils/simulation_utils.py  |   5 +-
 3 files changed, 46 insertions(+), 81 deletions(-)

diff --git a/config/default/config_evaluation.yaml b/config/default/config_evaluation.yaml
index 74a60df1..6b63cd3e 100644
--- a/config/default/config_evaluation.yaml
+++ b/config/default/config_evaluation.yaml
@@ -1,10 +1,11 @@
 dialogues: data/datasets/moviebot/annotated_dialogues.json
+debug: False
 metrics:
   - satisfaction
   - success_rate
   - successful_recommendation_round_ratio
   - reward_per_dialogue_length
-output: data/evaluation/moviebot_non_quality_results
+output_dir: data/evaluation/moviebot_non_quality_results
 
 quality_llm_interface:
   llm_interface_class_path: "usersimcrs.llm_interfaces.ollama_interface.OllamaLLMInterface"
diff --git a/usersimcrs/run_evaluation.py b/usersimcrs/run_evaluation.py
index 1e1316b3..8f6eaf22 100644
--- a/usersimcrs/run_evaluation.py
+++ b/usersimcrs/run_evaluation.py
@@ -5,7 +5,7 @@
 import os
 from collections import defaultdict
 from statistics import mean, stdev
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional
 
 import confuse
 from dialoguekit.core.dialogue import Dialogue
@@ -62,7 +62,8 @@ def parse_args() -> argparse.Namespace:
         help="Metrics to compute.",
     )
     parser.add_argument(
-        "--output",
+        "--output-dir",
+        dest="output_dir",
         type=str,
         help="Directory to save evaluation results and metadata.",
     )
@@ -117,12 +118,16 @@ def load_config(args: argparse.Namespace) -> confuse.Configuration:
         config.set_file(args.config_file)
     config.set_args(args, dots=True)
 
-    output_dir = config["output"].get()
+    validate_config(config)
+
+    output_dir = config["output_dir"].get()
     output_stem, output_extension = os.path.splitext(output_dir)
     if output_extension:
         output_dir = output_stem
     os.makedirs(output_dir, exist_ok=True)
-    with open(os.path.join(output_dir, "config.meta.yaml"), "w") as f:
+    with open(
+        os.path.join(output_dir, "config_evaluation.meta.yaml"), "w"
+    ) as f:
         f.write(config.dump())
 
     return config
@@ -135,10 +140,10 @@ def validate_config(config: confuse.Configuration) -> None:
         config: Configuration generated from YAML configuration file.
 
     Raises:
-        ValueError: If quality evaluation is requested without an LLM
-            interface, if an unknown quality aspect is configured, or if
-            dialogue annotation is requested without user and agent NLU
-            sections.
+        ValueError: If quality evaluation is requested without an LLM
+            interface, if an unknown quality aspect is configured, or if
+            dialogue annotation is requested without user and agent NLU
+            sections.
     """
     metrics = config["metrics"].get()
     if "quality" in metrics and "quality_llm_interface" not in config:
@@ -166,43 +171,17 @@ def validate_config(config: confuse.Configuration) -> None:
             )
 
 
-def load_nlu(
-    config: confuse.Configuration, nlu_config_key: str, name: str
-) -> Any:
-    """Loads one NLU component from an evaluation config section.
-
-    Args:
-        config: Evaluation configuration.
-        nlu_config_key: Name of the NLU section to load.
-        name: Name for the temporary NLU configuration.
-
-    Returns:
-        NLU component.
-    """
-    nlu_config = confuse.Configuration(name)
-    nlu_config.set(
-        {
-            "dialogues": config["dialogues"].get(),
-            "nlu": config[nlu_config_key].get(),
-        }
-    )
-    return get_NLU(nlu_config)
-
-
 def annotate_for_metrics(
     dialogues: List[Dialogue], config: confuse.Configuration
 ) -> None:
-    """Annotates dialogues when requested by configuration.
+    """Annotates dialogues for metrics that require dialogue acts.
 
     Args:
         dialogues: Dialogues to annotate in place.
         config: Evaluation configuration.
     """
-    if not config["annotate_dialogues"].get():
-        return
-
-    user_nlu = load_nlu(config, "user_nlu", "User NLU Configuration")
-    agent_nlu = load_nlu(config, "agent_nlu", "Agent NLU Configuration")
+    user_nlu = get_NLU(config, nlu_config_key="user_nlu")
+    agent_nlu = get_NLU(config, nlu_config_key="agent_nlu")
     annotate_dialogues(dialogues, user_nlu, agent_nlu)
 
 
@@ -236,31 +215,6 @@ def get_summary_by_agent(
     }
 
 
-def get_utility_intents(
-    config: confuse.Configuration,
-) -> Dict[str, List[Intent]]:
-    """Builds intent lists used by utility metrics.
-
-    Args:
-        config: Evaluation configuration.
-
-    Returns:
-        Utility intent lists keyed by metric argument name.
-    """
-    return {
-        "recommendation_intents": [
-            Intent(label)
-            for label in config["recommendation_intent_labels"].get()
-        ],
-        "acceptance_intents": [
-            Intent(label) for label in config["accept_intent_labels"].get()
-        ],
-        "rejection_intents": [
-            Intent(label) for label in config["reject_intent_labels"].get()
-        ],
-    }
-
-
 def build_metric_registry(
     config: confuse.Configuration, metrics: List[str]
 ) -> Dict[str, BaseMetric]:
@@ -296,27 +250,25 @@ def build_metric_registry(
 
 
 def evaluate_metric(
-    metric_name: str,
     metric: BaseMetric,
     dialogues: List[Dialogue],
-    quality_aspects: List[str],
-    utility_intents: Dict[str, List[Intent]],
+    quality_aspects: Optional[List[str]] = None,
+    utility_intents: Optional[Dict[str, List[Intent]]] = None,
 ) -> Dict[str, Any]:
     """Evaluates one metric and returns serialized results.
 
     Args:
-        metric_name: Name of the metric to evaluate.
         metric: Metric instance.
         dialogues: Dialogues to evaluate.
         quality_aspects: Quality aspects to evaluate for quality metrics.
-        utility_intents: Utility intent arguments.
+        utility_intents: Utility intent arguments for utility metrics.
 
     Returns:
         Serialized metric result.
     """
-    if metric_name == "quality":
+    if metric.name == "quality":
         aspect_results = {}
-        for aspect in quality_aspects:
+        for aspect in quality_aspects or []:
             scores = metric.evaluate_dialogues(dialogues, aspect=aspect)
             aspect_results[aspect] = {
                 "per_dialogue": scores,
@@ -324,12 +276,13 @@ def evaluate_metric(
             }
         return {"aspects": aspect_results}
 
-    if metric_name in {
+    utility_intents = utility_intents or {}
+    if metric.name in {
         "success_rate",
         "successful_recommendation_round_ratio",
     }:
         scores = metric.evaluate_dialogues(dialogues, **utility_intents)
-    elif metric_name == "reward_per_dialogue_length":
+    elif metric.name == "reward_per_dialogue_length":
         scores = metric.evaluate_dialogues(
             dialogues,
             acceptance_intents=utility_intents["acceptance_intents"],
@@ -376,12 +329,23 @@ def main() -> None:
     config = load_config(args)
 
     metrics = config["metrics"].get()
-    validate_config(config)
     quality_aspects = config["quality_aspects"].get()
     dialogues = json_to_dialogues(config["dialogues"].get())
-    annotate_for_metrics(dialogues, config)
+    if config["annotate_dialogues"].get():
+        annotate_for_metrics(dialogues, config)
 
-    utility_intents = get_utility_intents(config)
+    utility_intents = {
+        "recommendation_intents": [
+            Intent(label)
+            for label in config["recommendation_intent_labels"].get()
+        ],
+        "acceptance_intents": [
+            Intent(label) for label in config["accept_intent_labels"].get()
+        ],
+        "rejection_intents": [
+            Intent(label) for label in config["reject_intent_labels"].get()
+        ],
+    }
     metric_registry = build_metric_registry(config, metrics)
 
     results: Dict[str, Any] = {
@@ -392,14 +356,13 @@ def main() -> None:
 
     for metric_name in metrics:
         results["metrics"][metric_name] = evaluate_metric(
-            metric_name,
             metric_registry[metric_name],
             dialogues,
-            quality_aspects,
-            utility_intents,
+            quality_aspects=quality_aspects,
+            utility_intents=utility_intents,
         )
 
-    output_dir = config["output"].get()
+    output_dir = config["output_dir"].get()
     output_stem, output_extension = os.path.splitext(output_dir)
     if output_extension:
         output_dir = output_stem
diff --git a/usersimcrs/utils/simulation_utils.py b/usersimcrs/utils/simulation_utils.py
index 003ffcae..ce388c61 100644
--- a/usersimcrs/utils/simulation_utils.py
+++ b/usersimcrs/utils/simulation_utils.py
@@ -188,11 +188,12 @@ def _get_agenda_based_simulator_config(
     }
 
 
-def get_NLU(config: confuse.Configuration) -> NLU:
+def get_NLU(config: confuse.Configuration, nlu_config_key: str = "nlu") -> NLU:
     """Returns an NLU component.
 
     Args:
         config: Configuration for the simulation.
+        nlu_config_key: Configuration key containing the NLU settings.
 
     Raises:
         ValueError: Unsupported intent classifier.
@@ -200,7 +201,7 @@ def get_NLU(config: confuse.Configuration) -> NLU:
     Returns:
         An NLU component.
     """
-    nlu_config = config["nlu"].get()
+    nlu_config = config[nlu_config_key].get()
     intent_classifier = nlu_config.get("type")
     if intent_classifier == "cosine":
         # NLU without slot annotators