From ceee927261261a3c65fe0cc471d6d5df9723a79c Mon Sep 17 00:00:00 2001 From: Ksenia Blokhina Date: Tue, 17 Feb 2026 15:13:48 +0100 Subject: [PATCH 01/38] create base metric class --- .pre-commit-config.yaml | 6 +++--- scripts/evaluation/base_metric.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 3 deletions(-) create mode 100644 scripts/evaluation/base_metric.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 93a59e68..f99f0c60 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,15 +1,15 @@ repos: - repo: https://github.com/ambv/black - rev: 22.6.0 + rev: 26.1.0 hooks: - id: black language_version: python3.11 - repo: https://github.com/pycqa/flake8 - rev: 5.0.4 + rev: 7.3.0 hooks: - id: flake8 - repo: https://github.com/PyCQA/docformatter - rev: v1.5.0 + rev: v1.7.7 hooks: - id: docformatter name: docformatter diff --git a/scripts/evaluation/base_metric.py b/scripts/evaluation/base_metric.py new file mode 100644 index 00000000..5f4d2cc0 --- /dev/null +++ b/scripts/evaluation/base_metric.py @@ -0,0 +1,31 @@ +from abc import ABC, abstractmethod +from typing import Any + +from dialoguekit.core.dialogue import Dialogue + + +class BaseMetric(ABC): + """Abstract base class for dialogue evaluation metrics.""" + + def __init__(self) -> None: + """Initialize the metric.""" + pass + + @property + @abstractmethod + def name(self) -> str: + """Metric name (e.g., 'quality', 'satisfaction', 'utility').""" + pass + + @abstractmethod + def compute(self, dialogues: list[Dialogue], **kwargs: Any) -> Any: + """Compute the metric over the given dialogues. + + Args: + dialogues: List of dialogues to compute the metric on. + **kwargs: Additional arguments specific to the metric. + + Returns: + Metric scores. + """ + pass From 46b74566e26b4410a8f2a50311591e530a8daaea Mon Sep 17 00:00:00 2001 From: Ksenia Blokhina Date: Tue, 17 Feb 2026 15:46:27 +0100 Subject: [PATCH 02/38] return pre commit config versions --- .pre-commit-config.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f99f0c60..93a59e68 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,15 +1,15 @@ repos: - repo: https://github.com/ambv/black - rev: 26.1.0 + rev: 22.6.0 hooks: - id: black language_version: python3.11 - repo: https://github.com/pycqa/flake8 - rev: 7.3.0 + rev: 5.0.4 hooks: - id: flake8 - repo: https://github.com/PyCQA/docformatter - rev: v1.7.7 + rev: v1.5.0 hooks: - id: docformatter name: docformatter From 8c96457638f0c8fb1a20d31845c84508f4f2f86d Mon Sep 17 00:00:00 2001 From: Ksenia Blokhina Date: Mon, 23 Feb 2026 20:51:02 +0100 Subject: [PATCH 03/38] #233 add new classes --- scripts/__init__.py | 1 + scripts/evaluation/__init__.py | 3 + scripts/evaluation/quality_evaluation.py | 102 +----- scripts/evaluation/quality_metric.py | 208 ++++++++++++ scripts/evaluation/satisfaction_evaluation.py | 19 +- scripts/evaluation/satisfaction_metric.py | 73 +++++ scripts/evaluation/utility_evaluation.py | 245 +------------- scripts/evaluation/utility_metric.py | 310 ++++++++++++++++++ usersimcrs/nlu/llm/__init__.py | 12 +- usersimcrs/utils/simulation_utils.py | 9 +- 10 files changed, 623 insertions(+), 359 deletions(-) create mode 100644 scripts/__init__.py create mode 100644 scripts/evaluation/__init__.py create mode 100644 scripts/evaluation/quality_metric.py create mode 100644 scripts/evaluation/satisfaction_metric.py create mode 100644 scripts/evaluation/utility_metric.py diff --git a/scripts/__init__.py b/scripts/__init__.py new file mode 100644 index 00000000..5100bd2d --- /dev/null +++ b/scripts/__init__.py @@ -0,0 +1 @@ +"""Scripts package marker to avoid namespace package ambiguity for mypy.""" diff --git a/scripts/evaluation/__init__.py b/scripts/evaluation/__init__.py new file mode 100644 index 00000000..ad40101c --- /dev/null +++ b/scripts/evaluation/__init__.py @@ -0,0 +1,3 @@ +"""Evaluation helpers package to make imports explicit for type checking.""" + +__all__: list[str] = [] diff --git a/scripts/evaluation/quality_evaluation.py b/scripts/evaluation/quality_evaluation.py index 3901ef41..162e1d56 100644 --- a/scripts/evaluation/quality_evaluation.py +++ b/scripts/evaluation/quality_evaluation.py @@ -14,54 +14,12 @@ import argparse import json import os -from collections import defaultdict -from dataclasses import dataclass from statistics import mean, stdev -from typing import Dict, List, Union +from typing import Dict, List -from tqdm import tqdm - -from dialoguekit.core.dialogue import Dialogue -from dialoguekit.participant.participant import DialogueParticipant from dialoguekit.utils.dialogue_reader import json_to_dialogues -from scripts.evaluation.rubrics.quality_rubrics import QualityRubrics -from usersimcrs.llm_interfaces.ollama_interface import ( - OllamaLLMInterface, -) - -_PROMPT_EVAL_INTRO = ( - "You are an evaluator and you need to judge how does the " - "ASSISTANT perform based on the following CONVERSATION HISTORY. Please " - "rate the ASSISTANT's performance based on the following GRADING RUBRIC.\n" - "\nCONVERSATION HISTORY:" -) -_PROMPT_EVAL_OUTPUT_FORMAT = ( - 'Your output need be a be in a JSON format as follows:\n{"score": ' - ', "score_explanation": }\nDo not include ' - "additional information.\n" -) - - -@dataclass -class QualityScore: - conversation_id: str - score: int - explanation: str = "" - - def to_dict(self) -> Dict[str, Union[int, str]]: - """Converts the score to a dictionary.""" - return { - "conversation_id": self.conversation_id, - "score": self.score, - "score_explanation": self.explanation, - } - -class QualityScoreEncoder(json.JSONEncoder): - def default(self, o): - if isinstance(o, QualityScore): - return o.to_dict() - return super().default(o) +from scripts.evaluation.quality_metric import QualityMetric, QualityScoreEncoder def parse_args() -> argparse.Namespace: @@ -91,66 +49,14 @@ def parse_args() -> argparse.Namespace: return parser.parse_args() -def get_prompt(grading_rubric: QualityRubrics, dialogue: Dialogue) -> str: - """Prepares prompt given grading rubric and dialogue. - - Args: - grading_rubric: Grading rubric for the aspect. - dialogue: Dialogue. - - Returns: - Prompt comprising task definition, grading rubric, and dialogue. - """ - prompt = _PROMPT_EVAL_INTRO - - # Add dialogue history - for utterance in dialogue.utterances: - role = ( - "USER" - if utterance.participant == DialogueParticipant.USER - else "ASSISTANT" - ) - prompt += f"\n{role}: {utterance.text}" - - prompt += f"\n\nGRADING RUBRIC:\n{grading_rubric.value}\n" - prompt += _PROMPT_EVAL_OUTPUT_FORMAT - return prompt - - if __name__ == "__main__": args = parse_args() # Load dialogues dialogues = json_to_dialogues(args.dialogues) - # Ollama interface - ollama_interface = OllamaLLMInterface( - args.ollama_config, default_response="" - ) - - # Evaluate dialogues - scores: Dict[str, Dict[str, List[QualityScore]]] = defaultdict( - lambda: defaultdict(list) - ) - - for dialogue in tqdm(dialogues): - for aspect in QualityRubrics: - prompt = get_prompt(aspect, dialogue) - response = ollama_interface.get_llm_api_response(prompt) - try: - response = response.replace("\\", "\\\\") - response_dict = json.loads(response) - score = QualityScore( - conversation_id=dialogue.conversation_id, - score=int(response_dict["score"]), - explanation=response_dict["score_explanation"], - ) - scores[dialogue.agent_id][aspect.name].append(score) - except Exception as e: - print( - f"Failed to get score for {aspect} dialogue " - f"{dialogue.conversation_id}: {e}\nResponse: {response}" - ) + metric = QualityMetric(args.ollama_config) + scores: Dict[str, Dict[str, List]] = metric.compute(dialogues) # Save scores if args.output: diff --git a/scripts/evaluation/quality_metric.py b/scripts/evaluation/quality_metric.py new file mode 100644 index 00000000..298f1eb1 --- /dev/null +++ b/scripts/evaluation/quality_metric.py @@ -0,0 +1,208 @@ +"""Quality metric class implementation. + +Extracted from the original CLI script in `quality_evaluation.py`. +""" + +from collections import defaultdict +import json +from dataclasses import dataclass +from typing import Any, Dict, List, Optional + +from tqdm import tqdm + +from dialoguekit.core.dialogue import Dialogue +from dialoguekit.participant.participant import DialogueParticipant + +from scripts.evaluation.base_metric import BaseMetric +from scripts.evaluation.rubrics.quality_rubrics import QualityRubrics +from usersimcrs.llm_interfaces.ollama_interface import OllamaLLMInterface + + +_PROMPT_EVAL_INTRO = ( + "You are an evaluator and you need to judge how does the " + "ASSISTANT perform based on the following CONVERSATION HISTORY. Please " + "rate the ASSISTANT's performance based on the following GRADING RUBRIC.\n" + "\nCONVERSATION HISTORY:" +) +_PROMPT_EVAL_OUTPUT_FORMAT = ( + 'Your output need be a be in a JSON format as follows:\n{"score": ' + ', "score_explanation": }\nDo not include ' + "additional information.\n" +) + + +@dataclass +class QualityScore: + conversation_id: str + score: int + explanation: str = "" + + def to_dict(self) -> Dict[str, Any]: + return { + "conversation_id": self.conversation_id, + "score": self.score, + "score_explanation": self.explanation, + } + + +class QualityScoreEncoder(json.JSONEncoder): + def default(self, o): + if isinstance(o, QualityScore): + return o.to_dict() + return super().default(o) + + +class QualityMetric(BaseMetric): + """Quality evaluation metric using an LLM backend. + + The class wraps the prompt construction and LLM calls and returns the + same structure previously produced by the CLI script: + + { agent_id: { aspect_name: [QualityScore, ...], ... }, ... } + """ + + def __init__( + self, + ollama_config_path: str, + default_response: str = "", + rubrics: Optional[List[QualityRubrics]] = None, + ) -> None: + super().__init__() + self.ollama_config_path = ollama_config_path + self.default_response = default_response + self.rubrics = rubrics or list(QualityRubrics) + + @property + def name(self) -> str: + return "quality" + + def _get_prompt( + self, grading_rubric: QualityRubrics, dialogue: Dialogue + ) -> str: + """Prepares prompt given grading rubric and dialogue. + + Args: + grading_rubric: Grading rubric for the aspect. + dialogue: Dialogue. + + Returns: + Prompt comprising task definition, grading rubric, and dialogue. + """ + prompt = _PROMPT_EVAL_INTRO + for utterance in dialogue.utterances: + role = ( + "USER" + if utterance.participant == DialogueParticipant.USER + else "ASSISTANT" + ) + prompt += f"\n{role}: {utterance.text}" + + prompt += f"\n\nGRADING RUBRIC:\n{grading_rubric.value}\n" + prompt += _PROMPT_EVAL_OUTPUT_FORMAT + return prompt + + def compute( + self, dialogues: List[Dialogue], aspects: Optional[List[str]] = None + ) -> Dict[str, Dict[str, List[QualityScore]]]: + """Compute quality scores for provided dialogues. + + Args: + dialogues: list of Dialogue objects + aspects: optional list of aspect names (strings) to evaluate + + Returns: + Nested dict: agent_id -> aspect_name -> list[QualityScore] + """ + ollama_interface = OllamaLLMInterface( + self.ollama_config_path, default_response=self.default_response + ) + + if aspects: + aspect_enums = [QualityRubrics[asp] for asp in aspects] + else: + aspect_enums = self.rubrics + + scores: Dict[str, Dict[str, List[QualityScore]]] = defaultdict( + lambda: defaultdict(list) + ) + + for dialogue in tqdm(dialogues): + for aspect in aspect_enums: + prompt = self._get_prompt(aspect, dialogue) + response = ollama_interface.get_llm_api_response(prompt) + try: + response = response.replace("\\", "\\\\") + response_dict = json.loads(response) + score = QualityScore( + conversation_id=dialogue.conversation_id, + score=int(response_dict["score"]), + explanation=response_dict.get("score_explanation", ""), + ) + scores[dialogue.agent_id][aspect.name].append(score) + except Exception: + print( + f"Failed to get score for {aspect} dialogue " + f"{dialogue.conversation_id}: {response}" + ) + + return scores + + +class RecommendationRelevanceMetric(QualityMetric): + """Quality metric that evaluates only recommendation relevance.""" + + def __init__(self, ollama_config_path: str, default_response: str = ""): + super().__init__(ollama_config_path, default_response=default_response) + self.rubrics = [QualityRubrics.REC_RELEVANCE] + + @property + def name(self) -> str: + return "quality.recommendation_relevance" + + +class CommunicationStyleMetric(QualityMetric): + """Quality metric that evaluates communication style.""" + + def __init__(self, ollama_config_path: str, default_response: str = ""): + super().__init__(ollama_config_path, default_response=default_response) + self.rubrics = [QualityRubrics.COM_STYLE] + + @property + def name(self) -> str: + return "quality.communication_style" + + +class FluencyMetric(QualityMetric): + """Quality metric that evaluates fluency.""" + + def __init__(self, ollama_config_path: str, default_response: str = ""): + super().__init__(ollama_config_path, default_response=default_response) + self.rubrics = [QualityRubrics.FLUENCY] + + @property + def name(self) -> str: + return "quality.fluency" + + +class ConversationalFlowMetric(QualityMetric): + """Quality metric that evaluates conversational flow.""" + + def __init__(self, ollama_config_path: str, default_response: str = ""): + super().__init__(ollama_config_path, default_response=default_response) + self.rubrics = [QualityRubrics.CONV_FLOW] + + @property + def name(self) -> str: + return "quality.conversational_flow" + + +class OverallSatisfactionQualityMetric(QualityMetric): + """Quality metric that evaluates overall satisfaction aspect.""" + + def __init__(self, ollama_config_path: str, default_response: str = ""): + super().__init__(ollama_config_path, default_response=default_response) + self.rubrics = [QualityRubrics.OVERALL_SAT] + + @property + def name(self) -> str: + return "quality.overall_satisfaction" diff --git a/scripts/evaluation/satisfaction_evaluation.py b/scripts/evaluation/satisfaction_evaluation.py index ea7dfb11..21fb8e00 100644 --- a/scripts/evaluation/satisfaction_evaluation.py +++ b/scripts/evaluation/satisfaction_evaluation.py @@ -5,14 +5,11 @@ """ import argparse -from collections import defaultdict from statistics import mean, stdev from typing import Dict -from dialoguekit.nlu.models.satisfaction_classifier import ( - SatisfactionClassifierSVM, -) from dialoguekit.utils.dialogue_reader import json_to_dialogues +from scripts.evaluation.satisfaction_metric import SatisfactionMetric def parse_args() -> argparse.Namespace: @@ -38,18 +35,8 @@ def parse_args() -> argparse.Namespace: dialogues = json_to_dialogues(args.dialogues) print(f"Loaded {len(dialogues)} dialogues.") - # Satisfaction classifier - satisfaction_classifier = SatisfactionClassifierSVM() - - # Evaluate dialogues - scores: Dict[str, Dict[int, float]] = defaultdict(dict) - - for i, dialogue in enumerate(dialogues): - scores[dialogue.agent_id][ - i - ] = satisfaction_classifier.classify_last_n_dialogue( - dialogue, last_n=None - ) + metric = SatisfactionMetric() + scores: Dict[str, Dict[int, float]] = metric.compute(dialogues) # Summary for agent, agent_scores in scores.items(): diff --git a/scripts/evaluation/satisfaction_metric.py b/scripts/evaluation/satisfaction_metric.py new file mode 100644 index 00000000..8de9b8b1 --- /dev/null +++ b/scripts/evaluation/satisfaction_metric.py @@ -0,0 +1,73 @@ +"""Satisfaction metric class implementation. + +Wraps DialogueKit's satisfaction classifier into a `BaseMetric` class. +""" + +from collections import defaultdict +from typing import Any, Dict, List, Optional, TYPE_CHECKING + +if TYPE_CHECKING: + from dialoguekit.core.dialogue import Dialogue # type: ignore + from dialoguekit.nlu.models.satisfaction_classifier import ( + SatisfactionClassifierSVM, + ) # type: ignore +else: + try: + from dialoguekit.core.dialogue import Dialogue + from dialoguekit.nlu.models.satisfaction_classifier import ( + SatisfactionClassifierSVM, + ) + except Exception: + Dialogue = Any + SatisfactionClassifierSVM = Any + +from scripts.evaluation.base_metric import BaseMetric + + +class SatisfactionMetric(BaseMetric): + """Wraps the `SatisfactionClassifierSVM` to compute satisfaction scores. + + Output format matches previous CLI script: { agent_id: { dialogue_index: + score, ... }, ... } + """ + + def __init__(self, classifier: Optional[SatisfactionClassifierSVM] = None): + super().__init__() + self.classifier = classifier or SatisfactionClassifierSVM() + + @property + def name(self) -> str: + return "satisfaction" + + def compute(self, dialogues: List[Dialogue]) -> Dict[str, Dict[int, float]]: + """Compute satisfaction scores for dialogues. + + Matches the previous CLI output format: agent_id -> dialogue_index -> + score + """ + scores: Dict[str, Dict[int, float]] = defaultdict(dict) + for i, dialogue in enumerate(dialogues): + scores[dialogue.agent_id][ + i + ] = self.classifier.classify_last_n_dialogue(dialogue, last_n=None) + return scores + + +class SatisfactionAverageMetric(SatisfactionMetric): + """Aggregates satisfaction scores and returns average per agent.""" + + @property + def name(self) -> str: + return "satisfaction.average" + + def compute(self, dialogues: List[Dialogue]) -> Dict[str, float]: + raw = super().compute(dialogues) + averages: Dict[str, float] = {} + for agent_id, agent_scores in raw.items(): + if len(agent_scores) == 0: + averages[agent_id] = 0.0 + else: + averages[agent_id] = sum(agent_scores.values()) / len( + agent_scores + ) + return averages diff --git a/scripts/evaluation/utility_evaluation.py b/scripts/evaluation/utility_evaluation.py index b97b2b50..ec898678 100644 --- a/scripts/evaluation/utility_evaluation.py +++ b/scripts/evaluation/utility_evaluation.py @@ -15,207 +15,10 @@ """ import argparse -from collections import defaultdict import json -from typing import Dict, List, Tuple -from confuse import Configuration -from tqdm import tqdm - -from dialoguekit.core.annotated_utterance import AnnotatedUtterance -from dialoguekit.core.dialogue import Dialogue -from dialoguekit.core.intent import Intent -from dialoguekit.nlu.nlu import NLU -from dialoguekit.participant.participant import DialogueParticipant from dialoguekit.utils.dialogue_reader import json_to_dialogues -from usersimcrs.utils.simulation_utils import get_NLU - - -def annotate_dialogue( - dialogue: Dialogue, user_nlu: NLU, agent_nlu: NLU -) -> Dialogue: - """Annotates utterances with dialogue acts. - - Args: - dialogue: Dialogue to be annotated. - user_nlu: User NLU module. - agent_nlu: Agent NLU module. - - Returns: - Annotated dialogue. - """ - for i, utterance in enumerate(dialogue.utterances): - if not isinstance(utterance, AnnotatedUtterance): - dialogue.utterances[i] = AnnotatedUtterance.from_utterance( - utterance - ) - - if len(utterance.dialogue_acts) > 0: - continue - - if utterance.participant == DialogueParticipant.USER: - dialogue.utterances[ - i - ].dialogue_acts = user_nlu.extract_dialogue_acts(utterance) - elif utterance.participant == DialogueParticipant.AGENT: - dialogue.utterances[ - i - ].dialogue_acts = agent_nlu.extract_dialogue_acts(utterance) - else: - raise ValueError(f"Unknown participant: {utterance.participant}") - return dialogue - - -def annotate_dialogues( - dialogues: List[Dialogue], user_nlu: NLU, agent_nlu: NLU -) -> List[Dialogue]: - """Annotates dialogues with dialogue acts. - - Args: - dialogues: Dialogues. - user_nlu: User NLU module. - agent_nlu: Agent NLU module. - - Returns: - Annotated dialogues. - """ - # TODO: Move this to DialogueKit - # See: https://github.com/iai-group/UserSimCRS/issues/219 - return [ - annotate_dialogue(dialogue, user_nlu, agent_nlu) - for dialogue in tqdm(dialogues) - ] - - -def _get_recommendation_rounds( - dialogue: Dialogue, recommendation_intents: List[Intent] -) -> List[List[AnnotatedUtterance]]: - """Gets utterances per recommendation round. - - Args: - dialogue: Dialogue. - recommendation_intents: Intents corresponding to recommendation. - - Returns: - Utterances per recommendation round. - """ - rounds = [] - current_round: List[AnnotatedUtterance] = [] - for utterance in dialogue.utterances: - if any( - intent in utterance.get_intents() - for intent in recommendation_intents - ): - if current_round: - rounds.append(current_round) - current_round = [utterance] - else: - current_round.append(utterance) - return rounds - - -def _is_recommendation_accepted( - round: List[AnnotatedUtterance], - acceptance_intents: List[Intent], - rejection_intents: List[Intent], -) -> bool: - """Assesses whether the recommendation was accepted. - - Args: - round: Utterances in recommendation round. - acceptance_intents: Intents corresponding to acceptance. - rejection_intents: Intents corresponding to rejection. - - Returns: - True if the recommendation was accepted, False otherwise. - """ - b_accepted = False - for utterance in round: - if utterance.participant == DialogueParticipant.USER: - intents = utterance.get_intents() - if any(intent in acceptance_intents for intent in intents): - b_accepted = True - elif any(intent in rejection_intents for intent in intents): - return False - return b_accepted - - -def assess_dialogue( - dialogue: Dialogue, - recommendation_intents: List[Intent], - acceptance_intents: List[Intent], - rejection_intents: List[Intent], -) -> Tuple[int, int, int]: - """Assesses the utility of the dialogue. - - Args: - dialogue: Dialogue. - recommendation_intents: Intents corresponding to recommendation. - acceptance_intents: Intents corresponding to acceptance. - rejection_intents: Intents corresponding to rejection. - - Returns: - Tuple of number of accepted recommendations, successful recommendation - rounds and total recommendation rounds. - """ - # TODO: Optimize overall assessment to avoid multiple iterations over - # utterances. - rounds = _get_recommendation_rounds(dialogue, recommendation_intents) - successful_rounds = 0 - for round in rounds: - if _is_recommendation_accepted( - round, acceptance_intents, rejection_intents - ): - successful_rounds += 1 - - nb_accepted_recommendations = sum( - 1 - for utterance in dialogue.utterances - if utterance.participant == DialogueParticipant.USER - and any( - intent in acceptance_intents for intent in utterance.get_intents() - ) - ) - return nb_accepted_recommendations, successful_rounds, len(rounds) - - -def get_summary(dialogues: List[Dialogue]) -> None: - """Displays a summary of the utility evaluation. - - Args: - dialogues: Dialogues. - """ - summary: Dict[str, Dict[str, float]] = defaultdict( - lambda: { - "total_dialogues": 0, - "success_rate": 0, - "srrr": 0, - "rdl": 0, - } - ) - for dialogue in dialogues: - summary[dialogue.agent_id]["total_dialogues"] += 1 - summary[dialogue.agent_id]["success_rate"] += dialogue.metadata[ - "utility" - ]["success"] - summary[dialogue.agent_id]["srrr"] += dialogue.metadata["utility"][ - "successful_recommendation_round_ratio" - ] - summary[dialogue.agent_id]["rdl"] += dialogue.metadata["utility"][ - "reward_per_dialogue_length" - ] - - for agent_id, stats in summary.items(): - total = stats["total_dialogues"] - print(f"Agent: {agent_id}") - print(f"\tTotal Dialogues: {total}") - print(f"\tSuccess Rate: {stats['success_rate'] / total:.4f}") - print( - "\tSuccessful Recommendation Round Ratio: " - f"{stats['srrr'] / total:.4f}" - ) - print(f"\tReward-per-Dialogue-Length: {stats['rdl'] / total:.4f}") - print() +from scripts.evaluation.utility_metric import UtilityMetric def parse_args() -> argparse.Namespace: @@ -271,43 +74,13 @@ def parse_args() -> argparse.Namespace: dialogues = json_to_dialogues(args.annotated_dialogues) - rejection_intents = [Intent(label) for label in args.reject_intent_labels] - acceptance_intents = [Intent(label) for label in args.accept_intent_labels] - recommendation_intents = [ - Intent(label) for label in args.recommendation_intent_labels - ] - - # NLU module for user utterances - user_nlu_config = Configuration("User NLU Configuration") - user_nlu_config.set_file(args.user_nlu_config) - user_nlu = get_NLU(user_nlu_config) - - # NLU module for agent utterances - agent_nlu_config = Configuration("Agent NLU Configuration") - agent_nlu_config.set_file(args.agent_nlu_config) - agent_nlu = get_NLU(agent_nlu_config) - - dialogues = annotate_dialogues(dialogues, user_nlu, agent_nlu) - for dialogue in dialogues: - ( - nb_accepted_recommendations, - successful_rounds, - total_rounds, - ) = assess_dialogue( - dialogue, - recommendation_intents, - acceptance_intents, - rejection_intents, - ) - dialogue.metadata["utility"] = { - "success": int(successful_rounds > 0), - "successful_recommendation_round_ratio": ( - successful_rounds / total_rounds if total_rounds > 0 else 0.0 - ), - "reward_per_dialogue_length": ( - nb_accepted_recommendations / len(dialogue.utterances) - ), - } + metric = UtilityMetric(args.user_nlu_config, args.agent_nlu_config) + dialogues = metric.compute( + dialogues, + recommendation_intent_labels=args.recommendation_intent_labels, + acceptance_intent_labels=args.accept_intent_labels, + rejection_intent_labels=args.reject_intent_labels, + ) if args.output: with open(args.output, "w") as f: @@ -315,4 +88,4 @@ def parse_args() -> argparse.Namespace: [dialogue.to_dict() for dialogue in dialogues], f, indent=2 ) - get_summary(dialogues) + metric.get_summary(dialogues) diff --git a/scripts/evaluation/utility_metric.py b/scripts/evaluation/utility_metric.py new file mode 100644 index 00000000..f59d92e6 --- /dev/null +++ b/scripts/evaluation/utility_metric.py @@ -0,0 +1,310 @@ +"""Utility metric class implementation. + +Encapsulates the logic from `utility_evaluation.py` into a `BaseMetric`. +""" + +from collections import defaultdict +from typing import Dict, List, Tuple + +from confuse import Configuration + +from dialoguekit.core.annotated_utterance import AnnotatedUtterance +from dialoguekit.core.dialogue import Dialogue +from dialoguekit.core.intent import Intent +from dialoguekit.nlu.nlu import NLU +from dialoguekit.participant.participant import DialogueParticipant +from usersimcrs.utils.simulation_utils import get_NLU +from scripts.evaluation.base_metric import BaseMetric + + +def annotate_dialogue( + dialogue: Dialogue, user_nlu: NLU, agent_nlu: NLU +) -> Dialogue: + """Annotates utterances with dialogue acts. + + Args: + dialogue: Dialogue to be annotated. + user_nlu: User NLU module. + agent_nlu: Agent NLU module. + + Returns: + Annotated dialogue. + """ + for i, utterance in enumerate(dialogue.utterances): + if not isinstance(utterance, AnnotatedUtterance): + dialogue.utterances[i] = AnnotatedUtterance.from_utterance( + utterance + ) + + if len(utterance.dialogue_acts) > 0: + continue + + if utterance.participant == DialogueParticipant.USER: + dialogue.utterances[ + i + ].dialogue_acts = user_nlu.extract_dialogue_acts(utterance) + elif utterance.participant == DialogueParticipant.AGENT: + dialogue.utterances[ + i + ].dialogue_acts = agent_nlu.extract_dialogue_acts(utterance) + else: + raise ValueError(f"Unknown participant: {utterance.participant}") + return dialogue + + +def annotate_dialogues( + dialogues: List[Dialogue], user_nlu: NLU, agent_nlu: NLU +) -> List[Dialogue]: + """Annotates dialogues with dialogue acts. + + Args: + dialogues: Dialogues. + user_nlu: User NLU module. + agent_nlu: Agent NLU module. + + Returns: + Annotated dialogues. + """ + # TODO: Move this to DialogueKit + # See: https://github.com/iai-group/UserSimCRS/issues/219 + return [ + annotate_dialogue(dialogue, user_nlu, agent_nlu) + for dialogue in dialogues + ] + + +def _get_recommendation_rounds( + dialogue: Dialogue, recommendation_intents: List[Intent] +) -> List[List[AnnotatedUtterance]]: + rounds: List[List[AnnotatedUtterance]] = [] + current_round: List[AnnotatedUtterance] = [] + for utterance in dialogue.utterances: + if any( + intent in utterance.get_intents() + for intent in recommendation_intents + ): + if current_round: + rounds.append(current_round) + current_round = [utterance] + else: + current_round.append(utterance) + return rounds + + +def _is_recommendation_accepted( + round: List[AnnotatedUtterance], + acceptance_intents: List[Intent], + rejection_intents: List[Intent], +) -> bool: + b_accepted = False + for utterance in round: + if utterance.participant == DialogueParticipant.USER: + intents = utterance.get_intents() + if any(intent in acceptance_intents for intent in intents): + b_accepted = True + elif any(intent in rejection_intents for intent in intents): + return False + return b_accepted + + +def assess_dialogue( + dialogue: Dialogue, + recommendation_intents: List[Intent], + acceptance_intents: List[Intent], + rejection_intents: List[Intent], +) -> Tuple[int, int, int]: + """Assesses the utility of the dialogue. + + Args: + dialogue: Dialogue. + recommendation_intents: Intents corresponding to recommendation. + acceptance_intents: Intents corresponding to acceptance. + rejection_intents: Intents corresponding to rejection. + + Returns: + Tuple of number of accepted recommendations, successful recommendation + rounds and total recommendation rounds. + """ + # TODO: Optimize overall assessment to avoid multiple iterations over + # utterances. + rounds = _get_recommendation_rounds(dialogue, recommendation_intents) + successful_rounds = 0 + for round in rounds: + if _is_recommendation_accepted( + round, acceptance_intents, rejection_intents + ): + successful_rounds += 1 + + nb_accepted_recommendations = sum( + 1 + for utterance in dialogue.utterances + if utterance.participant == DialogueParticipant.USER + and any( + intent in acceptance_intents for intent in utterance.get_intents() + ) + ) + return nb_accepted_recommendations, successful_rounds, len(rounds) + + +class UtilityMetric(BaseMetric): + """Computes utility metrics for dialogues. + + Constructor takes paths to user and agent NLU configuration files. + """ + + def __init__(self, user_nlu_config_path: str, agent_nlu_config_path: str): + super().__init__() + self.user_nlu_config_path = user_nlu_config_path + self.agent_nlu_config_path = agent_nlu_config_path + + @property + def name(self) -> str: + return "utility" + + def _load_nlus(self) -> Tuple[NLU, NLU]: + user_nlu_config = Configuration("User NLU Configuration") + user_nlu_config.set_file(self.user_nlu_config_path) + user_nlu = get_NLU(user_nlu_config) + + agent_nlu_config = Configuration("Agent NLU Configuration") + agent_nlu_config.set_file(self.agent_nlu_config_path) + agent_nlu = get_NLU(agent_nlu_config) + + return user_nlu, agent_nlu + + def compute( + self, + dialogues: List[Dialogue], + recommendation_intent_labels: List[str] = ["REC-S", "REC-E"], + acceptance_intent_labels: List[str] = ["ACC"], + rejection_intent_labels: List[str] = ["REJ"], + ) -> List[Dialogue]: + user_nlu, agent_nlu = self._load_nlus() + + dialogues = annotate_dialogues(dialogues, user_nlu, agent_nlu) + + recommendation_intents = [ + Intent(label) for label in recommendation_intent_labels + ] + acceptance_intents = [ + Intent(label) for label in acceptance_intent_labels + ] + rejection_intents = [Intent(label) for label in rejection_intent_labels] + + for dialogue in dialogues: + ( + nb_accepted_recommendations, + successful_rounds, + total_rounds, + ) = assess_dialogue( + dialogue, + recommendation_intents, + acceptance_intents, + rejection_intents, + ) + dialogue.metadata["utility"] = { + "success": int(successful_rounds > 0), + "successful_recommendation_round_ratio": ( + successful_rounds / total_rounds + if total_rounds > 0 + else 0.0 + ), + "reward_per_dialogue_length": ( + nb_accepted_recommendations / len(dialogue.utterances) + if len(dialogue.utterances) > 0 + else 0.0 + ), + } + + return dialogues + + def get_summary(self, dialogues: List[Dialogue]) -> None: + summary: Dict[str, Dict[str, float]] = defaultdict( + lambda: { + "total_dialogues": 0, + "success_rate": 0, + "srrr": 0, + "rdl": 0, + } + ) + for dialogue in dialogues: + summary[dialogue.agent_id]["total_dialogues"] += 1 + summary[dialogue.agent_id]["success_rate"] += dialogue.metadata[ + "utility" + ]["success"] + summary[dialogue.agent_id]["srrr"] += dialogue.metadata["utility"][ + "successful_recommendation_round_ratio" + ] + summary[dialogue.agent_id]["rdl"] += dialogue.metadata["utility"][ + "reward_per_dialogue_length" + ] + + for agent_id, stats in summary.items(): + total = stats["total_dialogues"] + print(f"Agent: {agent_id}") + print(f"\tTotal Dialogues: {total}") + print(f"\tSuccess Rate: {stats['success_rate'] / total:.4f}") + print( + "\tSuccessful Recommendation Round Ratio: " + f"{stats['srrr'] / total:.4f}" + ) + print(f"\tReward-per-Dialogue-Length: {stats['rdl'] / total:.4f}") + print() + + +class UtilitySuccessMetric(UtilityMetric): + """Extracts per-dialogue success flag from utility analysis.""" + + @property + def name(self) -> str: + return "utility.success" + + def compute(self, dialogues: List[Dialogue], *args, **kwargs): + dialogues = super().compute(dialogues, *args, **kwargs) + + results: Dict[str, Dict[int, int]] = defaultdict(dict) + for i, dialogue in enumerate(dialogues): + results[dialogue.agent_id][i] = int( + dialogue.metadata.get("utility", {}).get("success", 0) + ) + return results + + +class UtilitySRRRMetric(UtilityMetric): + """Extracts successful recommendation round ratio per dialogue.""" + + @property + def name(self) -> str: + return "utility.successful_recommendation_round_ratio" + + def compute(self, dialogues: List[Dialogue], *args, **kwargs): + dialogues = super().compute(dialogues, *args, **kwargs) + + results: Dict[str, Dict[int, float]] = defaultdict(dict) + for i, dialogue in enumerate(dialogues): + results[dialogue.agent_id][i] = float( + dialogue.metadata.get("utility", {}).get( + "successful_recommendation_round_ratio", 0.0 + ) + ) + return results + + +class UtilityRDLMetric(UtilityMetric): + """Extracts reward-per-dialogue-length per dialogue.""" + + @property + def name(self) -> str: + return "utility.reward_per_dialogue_length" + + def compute(self, dialogues: List[Dialogue], *args, **kwargs): + dialogues = super().compute(dialogues, *args, **kwargs) + + results: Dict[str, Dict[int, float]] = defaultdict(dict) + for i, dialogue in enumerate(dialogues): + results[dialogue.agent_id][i] = float( + dialogue.metadata.get("utility", {}).get( + "reward_per_dialogue_length", 0.0 + ) + ) + return results diff --git a/usersimcrs/nlu/llm/__init__.py b/usersimcrs/nlu/llm/__init__.py index be592d99..3c608547 100644 --- a/usersimcrs/nlu/llm/__init__.py +++ b/usersimcrs/nlu/llm/__init__.py @@ -1,9 +1,9 @@ """Module level init for LLM-based NLU components.""" -from usersimcrs.nlu.llm.llm_dialogue_act_extractor import ( - LLMDialogueActsExtractor, -) +"""Module level init for LLM-based NLU components. -__all__ = [ - "LLMDialogueActsExtractor", -] +Avoid importing heavy submodules at package import time to keep test +collection lightweight; import submodules explicitly when needed. +""" + +__all__ = ["LLMDialogueActsExtractor"] diff --git a/usersimcrs/utils/simulation_utils.py b/usersimcrs/utils/simulation_utils.py index 6121723e..b0ed0c9f 100644 --- a/usersimcrs/utils/simulation_utils.py +++ b/usersimcrs/utils/simulation_utils.py @@ -142,9 +142,12 @@ def _get_agenda_based_simulator_config( ratings = Ratings(item_collection) ratings.load_ratings_csv(file_path=config["ratings"].get()) - historical_ratings, _ = ratings.create_split( - config["historical_ratings_ratio"].get(0.8) - ) + raw = config["historical_ratings_ratio"].get() + if raw is None: + historical_ratio = 0.8 + else: + historical_ratio = float(raw) + historical_ratings, _ = ratings.create_split(historical_ratio) preference_model = SimplePreferenceModel( domain, From 950964c1533d96f05282536b6c868d641a72ee7f Mon Sep 17 00:00:00 2001 From: Ksenia Blokhina Date: Mon, 23 Feb 2026 22:55:25 +0100 Subject: [PATCH 04/38] #232 fix class --- scripts/evaluation/base_metric.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/scripts/evaluation/base_metric.py b/scripts/evaluation/base_metric.py index 5f4d2cc0..71edfac0 100644 --- a/scripts/evaluation/base_metric.py +++ b/scripts/evaluation/base_metric.py @@ -1,31 +1,35 @@ +"""Abstract base class for dialogue evaluation metrics.""" + from abc import ABC, abstractmethod -from typing import Any +from typing import Any, List from dialoguekit.core.dialogue import Dialogue class BaseMetric(ABC): - """Abstract base class for dialogue evaluation metrics.""" + def __init__(self, name: str) -> None: + """Initializes the metric. - def __init__(self) -> None: - """Initialize the metric.""" - pass + Args: + name: Metric name (e.g., 'quality', 'satisfaction', 'utility'). + """ + super().__init__() + self._name = name @property - @abstractmethod def name(self) -> str: """Metric name (e.g., 'quality', 'satisfaction', 'utility').""" - pass + return self._name @abstractmethod - def compute(self, dialogues: list[Dialogue], **kwargs: Any) -> Any: - """Compute the metric over the given dialogues. + def compute(self, dialogues: List[Dialogue], **kwargs: Any) -> Any: + """Computes the metric over the given dialogues. Args: dialogues: List of dialogues to compute the metric on. **kwargs: Additional arguments specific to the metric. Returns: - Metric scores. + Metric result; shape is defined by the concrete metric. """ - pass + raise NotImplementedError() From 33624b20890726e2b1b097eb7a232659bb5d5d7e Mon Sep 17 00:00:00 2001 From: Ksenia Blokhina Date: Tue, 24 Feb 2026 10:24:24 +0100 Subject: [PATCH 05/38] #232 fix class --- scripts/evaluation/base_metric.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/scripts/evaluation/base_metric.py b/scripts/evaluation/base_metric.py index 71edfac0..4bd6ee4d 100644 --- a/scripts/evaluation/base_metric.py +++ b/scripts/evaluation/base_metric.py @@ -7,19 +7,13 @@ class BaseMetric(ABC): - def __init__(self, name: str) -> None: + def __init__(self, name: str): """Initializes the metric. Args: name: Metric name (e.g., 'quality', 'satisfaction', 'utility'). """ - super().__init__() - self._name = name - - @property - def name(self) -> str: - """Metric name (e.g., 'quality', 'satisfaction', 'utility').""" - return self._name + self.name = name @abstractmethod def compute(self, dialogues: List[Dialogue], **kwargs: Any) -> Any: From 27f788866d0c5346e79be549b3123834011d7144 Mon Sep 17 00:00:00 2001 From: Ksenia Blokhina Date: Tue, 24 Feb 2026 11:25:50 +0100 Subject: [PATCH 06/38] #232 add aggregation --- scripts/evaluation/base_metric.py | 61 ++++++++++++++++++++++++++----- 1 file changed, 52 insertions(+), 9 deletions(-) diff --git a/scripts/evaluation/base_metric.py b/scripts/evaluation/base_metric.py index 4bd6ee4d..823f33c0 100644 --- a/scripts/evaluation/base_metric.py +++ b/scripts/evaluation/base_metric.py @@ -1,29 +1,72 @@ -"""Abstract base class for dialogue evaluation metrics.""" +"""Abstract base class for dialogue evaluation metrics. -from abc import ABC, abstractmethod -from typing import Any, List +Subclasses implement only compute_score(dialogue, **kwargs). The base class +provides aggregation at three levels: per dialogue, per dialogues, and per +agent. +""" +from abc import ABC, abstractmethod +from typing import Any, Dict from dialoguekit.core.dialogue import Dialogue class BaseMetric(ABC): - def __init__(self, name: str): + def __init__(self, name: str) -> None: """Initializes the metric. Args: - name: Metric name (e.g., 'quality', 'satisfaction', 'utility'). + name: Metric name. """ self.name = name @abstractmethod - def compute(self, dialogues: List[Dialogue], **kwargs: Any) -> Any: - """Computes the metric over the given dialogues. + def compute_score(self, dialogue: Dialogue, **kwargs: Any) -> float: + """Computes the metric for a single dialogue. + + Subclasses must implement this method. Args: - dialogues: List of dialogues to compute the metric on. + dialogue: Single dialogue to score. **kwargs: Additional arguments specific to the metric. Returns: - Metric result; shape is defined by the concrete metric. + Score for the dialogue. + + Raises: + NotImplementedError: When not implemented by a subclass. """ raise NotImplementedError() + + def compute_scores_for_dialogues( + self, dialogues: Dict[str, Dialogue], **kwargs: Any + ) -> Dict[str, float]: + """Computes the metric for each dialogue in a dict of dialogues. + + Args: + dialogues: Dict of dialogues + **kwargs: Passed through to compute_score. + + Returns: + Dict of scores per dialogue. + """ + return { + dialog_id: self.compute_score(dialogue, **kwargs) + for dialog_id, dialogue in dialogues.items() + } + + def compute_scores_per_agent( + self, dialogues_by_agent: Dict[str, Dict[str, Dialogue]], **kwargs: Any + ) -> Dict[str, Dict[str, float]]: + """Computes the metric per agent over their dialogues. + + Args: + dialogues_by_agent: Dict of dialogues per agent. + **kwargs: Passed through to compute_score. + + Returns: + Dict of scores per agent. + """ + return { + agent_id: self.compute_scores_for_dialogues(dialogues, **kwargs) + for agent_id, dialogues in dialogues_by_agent.items() + } From c38331a10cf5be336f8b30ea345f5c1addd91862 Mon Sep 17 00:00:00 2001 From: Ksenia Blokhina Date: Tue, 24 Feb 2026 13:17:33 +0100 Subject: [PATCH 07/38] #232 fix methods --- scripts/evaluation/base_metric.py | 55 +++++++++++++++---------------- 1 file changed, 26 insertions(+), 29 deletions(-) diff --git a/scripts/evaluation/base_metric.py b/scripts/evaluation/base_metric.py index 823f33c0..0ba09c61 100644 --- a/scripts/evaluation/base_metric.py +++ b/scripts/evaluation/base_metric.py @@ -1,12 +1,8 @@ -"""Abstract base class for dialogue evaluation metrics. - -Subclasses implement only compute_score(dialogue, **kwargs). The base class -provides aggregation at three levels: per dialogue, per dialogues, and per -agent. -""" +"""Abstract base class for dialogue evaluation metrics.""" from abc import ABC, abstractmethod -from typing import Any, Dict +from collections import defaultdict +from typing import Any, Dict, List from dialoguekit.core.dialogue import Dialogue @@ -20,53 +16,54 @@ def __init__(self, name: str) -> None: self.name = name @abstractmethod - def compute_score(self, dialogue: Dialogue, **kwargs: Any) -> float: + def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float: """Computes the metric for a single dialogue. - Subclasses must implement this method. - Args: dialogue: Single dialogue to score. **kwargs: Additional arguments specific to the metric. - Returns: - Score for the dialogue. - Raises: NotImplementedError: When not implemented by a subclass. + + Returns: + Score for the dialogue. """ raise NotImplementedError() - def compute_scores_for_dialogues( - self, dialogues: Dict[str, Dialogue], **kwargs: Any + def evaluate_dialogues( + self, dialogues: List[Dialogue], **kwargs: Any ) -> Dict[str, float]: - """Computes the metric for each dialogue in a dict of dialogues. + """Computes the metric for every dialogue in a given list. Args: - dialogues: Dict of dialogues - **kwargs: Passed through to compute_score. + dialogues: Dialogues. + **kwargs: Additional arguments specific to the metric. Returns: - Dict of scores per dialogue. + Dictionary with result per dialogue. """ return { - dialog_id: self.compute_score(dialogue, **kwargs) - for dialog_id, dialogue in dialogues.items() + dialogue.conversation_id: self.evaluate_dialogue(dialogue, **kwargs) + for dialogue in dialogues } - def compute_scores_per_agent( - self, dialogues_by_agent: Dict[str, Dict[str, Dialogue]], **kwargs: Any + def evaluate_agents( + self, dialogues: List[Dialogue], **kwargs: Any ) -> Dict[str, Dict[str, float]]: - """Computes the metric per agent over their dialogues. + """Computes the metric for every agent in a given list. Args: - dialogues_by_agent: Dict of dialogues per agent. - **kwargs: Passed through to compute_score. + dialogues: Dialogues. + **kwargs: Additional arguments specific to the metric. Returns: - Dict of scores per agent. + Dictionary with result per agent. """ + dialogues_by_agent: Dict[str, List[Dialogue]] = defaultdict(list) + for dialogue in dialogues: + dialogues_by_agent[dialogue.agent_id].append(dialogue) return { - agent_id: self.compute_scores_for_dialogues(dialogues, **kwargs) - for agent_id, dialogues in dialogues_by_agent.items() + agent_id: self.evaluate_dialogues(agent_dialogues, **kwargs) + for agent_id, agent_dialogues in dialogues_by_agent.items() } From 901dd5a7dbd96e34fd4bccfa840d50c634a22d10 Mon Sep 17 00:00:00 2001 From: Ksenia Blokhina Date: Tue, 24 Feb 2026 13:40:00 +0100 Subject: [PATCH 08/38] #232 fix nits --- scripts/evaluation/base_metric.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/evaluation/base_metric.py b/scripts/evaluation/base_metric.py index 0ba09c61..f08a3a05 100644 --- a/scripts/evaluation/base_metric.py +++ b/scripts/evaluation/base_metric.py @@ -6,7 +6,7 @@ from dialoguekit.core.dialogue import Dialogue -class BaseMetric(ABC): +class Metric(ABC): def __init__(self, name: str) -> None: """Initializes the metric. @@ -51,7 +51,7 @@ def evaluate_dialogues( def evaluate_agents( self, dialogues: List[Dialogue], **kwargs: Any ) -> Dict[str, Dict[str, float]]: - """Computes the metric for every agent in a given list. + """Computes the metric for every agent over their dialogues. Args: dialogues: Dialogues. From c4efd7994bd7ad44e68f0a2449a96b8f398c748c Mon Sep 17 00:00:00 2001 From: Ksenia Blokhina Date: Tue, 3 Mar 2026 11:30:22 +0100 Subject: [PATCH 09/38] improvement/233-create-classes-for-metrics add classes --- scripts/evaluation/base_metric.py | 58 +- scripts/evaluation/quality_evaluation.py | 20 +- scripts/evaluation/quality_metric.py | 174 ++---- scripts/evaluation/satisfaction_evaluation.py | 12 +- scripts/evaluation/satisfaction_metric.py | 103 ++-- scripts/evaluation/utility_evaluation.py | 45 +- scripts/evaluation/utility_metric.py | 542 +++++++++--------- tests/evaluation/test_quality_metric.py | 83 +++ tests/evaluation/test_satisfaction_metric.py | 68 +++ tests/evaluation/test_utility_metric.py | 77 +++ 10 files changed, 673 insertions(+), 509 deletions(-) create mode 100644 tests/evaluation/test_quality_metric.py create mode 100644 tests/evaluation/test_satisfaction_metric.py create mode 100644 tests/evaluation/test_utility_metric.py diff --git a/scripts/evaluation/base_metric.py b/scripts/evaluation/base_metric.py index 4bd6ee4d..f08a3a05 100644 --- a/scripts/evaluation/base_metric.py +++ b/scripts/evaluation/base_metric.py @@ -1,29 +1,69 @@ """Abstract base class for dialogue evaluation metrics.""" from abc import ABC, abstractmethod -from typing import Any, List - +from collections import defaultdict +from typing import Any, Dict, List from dialoguekit.core.dialogue import Dialogue -class BaseMetric(ABC): - def __init__(self, name: str): +class Metric(ABC): + def __init__(self, name: str) -> None: """Initializes the metric. Args: - name: Metric name (e.g., 'quality', 'satisfaction', 'utility'). + name: Metric name. """ self.name = name @abstractmethod - def compute(self, dialogues: List[Dialogue], **kwargs: Any) -> Any: - """Computes the metric over the given dialogues. + def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float: + """Computes the metric for a single dialogue. Args: - dialogues: List of dialogues to compute the metric on. + dialogue: Single dialogue to score. **kwargs: Additional arguments specific to the metric. + Raises: + NotImplementedError: When not implemented by a subclass. + Returns: - Metric result; shape is defined by the concrete metric. + Score for the dialogue. """ raise NotImplementedError() + + def evaluate_dialogues( + self, dialogues: List[Dialogue], **kwargs: Any + ) -> Dict[str, float]: + """Computes the metric for every dialogue in a given list. + + Args: + dialogues: Dialogues. + **kwargs: Additional arguments specific to the metric. + + Returns: + Dictionary with result per dialogue. + """ + return { + dialogue.conversation_id: self.evaluate_dialogue(dialogue, **kwargs) + for dialogue in dialogues + } + + def evaluate_agents( + self, dialogues: List[Dialogue], **kwargs: Any + ) -> Dict[str, Dict[str, float]]: + """Computes the metric for every agent over their dialogues. + + Args: + dialogues: Dialogues. + **kwargs: Additional arguments specific to the metric. + + Returns: + Dictionary with result per agent. + """ + dialogues_by_agent: Dict[str, List[Dialogue]] = defaultdict(list) + for dialogue in dialogues: + dialogues_by_agent[dialogue.agent_id].append(dialogue) + return { + agent_id: self.evaluate_dialogues(agent_dialogues, **kwargs) + for agent_id, agent_dialogues in dialogues_by_agent.items() + } diff --git a/scripts/evaluation/quality_evaluation.py b/scripts/evaluation/quality_evaluation.py index 162e1d56..082adde3 100644 --- a/scripts/evaluation/quality_evaluation.py +++ b/scripts/evaluation/quality_evaluation.py @@ -15,11 +15,10 @@ import json import os from statistics import mean, stdev -from typing import Dict, List from dialoguekit.utils.dialogue_reader import json_to_dialogues -from scripts.evaluation.quality_metric import QualityMetric, QualityScoreEncoder +from scripts.evaluation.quality_metric import QualityMetric def parse_args() -> argparse.Namespace: @@ -56,19 +55,18 @@ def parse_args() -> argparse.Namespace: dialogues = json_to_dialogues(args.dialogues) metric = QualityMetric(args.ollama_config) - scores: Dict[str, Dict[str, List]] = metric.compute(dialogues) + scores = metric.evaluate_agents(dialogues) - # Save scores + # Save scores (agent_id -> conversation_id -> score) if args.output: - os.makedirs(os.path.dirname(args.output), exist_ok=True) + os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True) with open(args.output, "w") as f: - json.dump(scores, f, indent=2, cls=QualityScoreEncoder) + json.dump(scores, f, indent=2) # Summary for agent_id, agent_scores in scores.items(): + score_values = list(agent_scores.values()) print(f"Scores for agent {agent_id}:") - for aspect_name, aspect_scores in agent_scores.items(): - print(f"Aspect: {aspect_name}") - avg_score = mean([score.score for score in aspect_scores]) - std_dev = stdev([score.score for score in aspect_scores]) - print(f"Average score: {avg_score:.2f} (std dev: {std_dev:.2f})") + avg_score = mean(score_values) + std_dev = stdev(score_values) if len(score_values) >= 2 else 0.0 + print(f"Average score: {avg_score:.2f} (std dev: {std_dev:.2f})") diff --git a/scripts/evaluation/quality_metric.py b/scripts/evaluation/quality_metric.py index 298f1eb1..44b78ed5 100644 --- a/scripts/evaluation/quality_metric.py +++ b/scripts/evaluation/quality_metric.py @@ -3,17 +3,14 @@ Extracted from the original CLI script in `quality_evaluation.py`. """ -from collections import defaultdict import json -from dataclasses import dataclass -from typing import Any, Dict, List, Optional - -from tqdm import tqdm +from statistics import mean +from typing import Any, List, Optional from dialoguekit.core.dialogue import Dialogue from dialoguekit.participant.participant import DialogueParticipant -from scripts.evaluation.base_metric import BaseMetric +from scripts.evaluation.base_metric import Metric from scripts.evaluation.rubrics.quality_rubrics import QualityRubrics from usersimcrs.llm_interfaces.ollama_interface import OllamaLLMInterface @@ -31,34 +28,10 @@ ) -@dataclass -class QualityScore: - conversation_id: str - score: int - explanation: str = "" - - def to_dict(self) -> Dict[str, Any]: - return { - "conversation_id": self.conversation_id, - "score": self.score, - "score_explanation": self.explanation, - } - - -class QualityScoreEncoder(json.JSONEncoder): - def default(self, o): - if isinstance(o, QualityScore): - return o.to_dict() - return super().default(o) - - -class QualityMetric(BaseMetric): +class QualityMetric(Metric): """Quality evaluation metric using an LLM backend. - The class wraps the prompt construction and LLM calls and returns the - same structure previously produced by the CLI script: - - { agent_id: { aspect_name: [QualityScore, ...], ... }, ... } + Returns scores as floats (average across aspects per dialogue). """ def __init__( @@ -66,15 +39,22 @@ def __init__( ollama_config_path: str, default_response: str = "", rubrics: Optional[List[QualityRubrics]] = None, + name: str = "quality", ) -> None: - super().__init__() + super().__init__(name) self.ollama_config_path = ollama_config_path self.default_response = default_response self.rubrics = rubrics or list(QualityRubrics) - - @property - def name(self) -> str: - return "quality" + self._ollama_interface: Optional[OllamaLLMInterface] = None + + def _get_ollama_interface(self) -> OllamaLLMInterface: + """Returns (cached) Ollama LLM interface.""" + if self._ollama_interface is None: + self._ollama_interface = OllamaLLMInterface( + self.ollama_config_path, + default_response=self.default_response, + ) + return self._ollama_interface def _get_prompt( self, grading_rubric: QualityRubrics, dialogue: Dialogue @@ -101,108 +81,28 @@ def _get_prompt( prompt += _PROMPT_EVAL_OUTPUT_FORMAT return prompt - def compute( - self, dialogues: List[Dialogue], aspects: Optional[List[str]] = None - ) -> Dict[str, Dict[str, List[QualityScore]]]: - """Compute quality scores for provided dialogues. - - Args: - dialogues: list of Dialogue objects - aspects: optional list of aspect names (strings) to evaluate - - Returns: - Nested dict: agent_id -> aspect_name -> list[QualityScore] - """ - ollama_interface = OllamaLLMInterface( - self.ollama_config_path, default_response=self.default_response - ) - + def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float: + """Returns average score across aspects for a single dialogue (1–5).""" + aspects = kwargs.get("aspects") if aspects: aspect_enums = [QualityRubrics[asp] for asp in aspects] else: aspect_enums = self.rubrics - scores: Dict[str, Dict[str, List[QualityScore]]] = defaultdict( - lambda: defaultdict(list) - ) - - for dialogue in tqdm(dialogues): - for aspect in aspect_enums: - prompt = self._get_prompt(aspect, dialogue) - response = ollama_interface.get_llm_api_response(prompt) - try: - response = response.replace("\\", "\\\\") - response_dict = json.loads(response) - score = QualityScore( - conversation_id=dialogue.conversation_id, - score=int(response_dict["score"]), - explanation=response_dict.get("score_explanation", ""), - ) - scores[dialogue.agent_id][aspect.name].append(score) - except Exception: - print( - f"Failed to get score for {aspect} dialogue " - f"{dialogue.conversation_id}: {response}" - ) - - return scores - - -class RecommendationRelevanceMetric(QualityMetric): - """Quality metric that evaluates only recommendation relevance.""" - - def __init__(self, ollama_config_path: str, default_response: str = ""): - super().__init__(ollama_config_path, default_response=default_response) - self.rubrics = [QualityRubrics.REC_RELEVANCE] - - @property - def name(self) -> str: - return "quality.recommendation_relevance" - - -class CommunicationStyleMetric(QualityMetric): - """Quality metric that evaluates communication style.""" - - def __init__(self, ollama_config_path: str, default_response: str = ""): - super().__init__(ollama_config_path, default_response=default_response) - self.rubrics = [QualityRubrics.COM_STYLE] - - @property - def name(self) -> str: - return "quality.communication_style" - - -class FluencyMetric(QualityMetric): - """Quality metric that evaluates fluency.""" - - def __init__(self, ollama_config_path: str, default_response: str = ""): - super().__init__(ollama_config_path, default_response=default_response) - self.rubrics = [QualityRubrics.FLUENCY] - - @property - def name(self) -> str: - return "quality.fluency" - - -class ConversationalFlowMetric(QualityMetric): - """Quality metric that evaluates conversational flow.""" - - def __init__(self, ollama_config_path: str, default_response: str = ""): - super().__init__(ollama_config_path, default_response=default_response) - self.rubrics = [QualityRubrics.CONV_FLOW] - - @property - def name(self) -> str: - return "quality.conversational_flow" - - -class OverallSatisfactionQualityMetric(QualityMetric): - """Quality metric that evaluates overall satisfaction aspect.""" - - def __init__(self, ollama_config_path: str, default_response: str = ""): - super().__init__(ollama_config_path, default_response=default_response) - self.rubrics = [QualityRubrics.OVERALL_SAT] - - @property - def name(self) -> str: - return "quality.overall_satisfaction" + ollama_interface = self._get_ollama_interface() + scores: List[float] = [] + + for aspect in aspect_enums: + prompt = self._get_prompt(aspect, dialogue) + response = ollama_interface.get_llm_api_response(prompt) + try: + response = response.replace("\\", "\\\\") + response_dict = json.loads(response) + scores.append(int(response_dict["score"])) + except Exception: + print( + f"Failed to get score for {aspect} dialogue " + f"{dialogue.conversation_id}: {response}" + ) + + return mean(scores) if scores else 0.0 diff --git a/scripts/evaluation/satisfaction_evaluation.py b/scripts/evaluation/satisfaction_evaluation.py index 21fb8e00..4c2d1890 100644 --- a/scripts/evaluation/satisfaction_evaluation.py +++ b/scripts/evaluation/satisfaction_evaluation.py @@ -5,8 +5,6 @@ """ import argparse -from statistics import mean, stdev -from typing import Dict from dialoguekit.utils.dialogue_reader import json_to_dialogues from scripts.evaluation.satisfaction_metric import SatisfactionMetric @@ -36,14 +34,14 @@ def parse_args() -> argparse.Namespace: print(f"Loaded {len(dialogues)} dialogues.") metric = SatisfactionMetric() - scores: Dict[str, Dict[int, float]] = metric.compute(dialogues) + scores = metric.evaluate_agents(dialogues) # Summary for agent, agent_scores in scores.items(): - avg_score = mean(agent_scores.values()) - stdev_score = stdev(agent_scores.values()) - max_score = max(agent_scores.values()) - min_score = min(agent_scores.values()) + avg_score = metric.get_average(agent_scores) + stdev_score = metric.get_stdev(agent_scores) + max_score = metric.get_max(agent_scores) + min_score = metric.get_min(agent_scores) print(f"Agent: {agent} / Num. dialogues: {len(agent_scores)}") print(f"Min score: {min_score}") print(f"Max score: {max_score}") diff --git a/scripts/evaluation/satisfaction_metric.py b/scripts/evaluation/satisfaction_metric.py index 8de9b8b1..e7125696 100644 --- a/scripts/evaluation/satisfaction_metric.py +++ b/scripts/evaluation/satisfaction_metric.py @@ -1,73 +1,52 @@ """Satisfaction metric class implementation. -Wraps DialogueKit's satisfaction classifier into a `BaseMetric` class. +Wraps DialogueKit's satisfaction classifier into a `Metric` class. """ -from collections import defaultdict -from typing import Any, Dict, List, Optional, TYPE_CHECKING +from statistics import mean, stdev +from typing import Any, Dict, Optional -if TYPE_CHECKING: - from dialoguekit.core.dialogue import Dialogue # type: ignore - from dialoguekit.nlu.models.satisfaction_classifier import ( - SatisfactionClassifierSVM, - ) # type: ignore -else: - try: - from dialoguekit.core.dialogue import Dialogue - from dialoguekit.nlu.models.satisfaction_classifier import ( - SatisfactionClassifierSVM, - ) - except Exception: - Dialogue = Any - SatisfactionClassifierSVM = Any +from dialoguekit.core.dialogue import Dialogue # type: ignore +from dialoguekit.nlu.models.satisfaction_classifier import ( + SatisfactionClassifierSVM, +) -from scripts.evaluation.base_metric import BaseMetric +from scripts.evaluation.base_metric import Metric -class SatisfactionMetric(BaseMetric): - """Wraps the `SatisfactionClassifierSVM` to compute satisfaction scores. +class SatisfactionMetric(Metric): + """Wraps the `SatisfactionClassifierSVM` to compute satisfaction scores.""" - Output format matches previous CLI script: { agent_id: { dialogue_index: - score, ... }, ... } - """ - - def __init__(self, classifier: Optional[SatisfactionClassifierSVM] = None): - super().__init__() + def __init__( + self, + classifier: Optional[SatisfactionClassifierSVM] = None, + name: str = "satisfaction", + ): + super().__init__(name) self.classifier = classifier or SatisfactionClassifierSVM() - @property - def name(self) -> str: - return "satisfaction" - - def compute(self, dialogues: List[Dialogue]) -> Dict[str, Dict[int, float]]: - """Compute satisfaction scores for dialogues. - - Matches the previous CLI output format: agent_id -> dialogue_index -> - score - """ - scores: Dict[str, Dict[int, float]] = defaultdict(dict) - for i, dialogue in enumerate(dialogues): - scores[dialogue.agent_id][ - i - ] = self.classifier.classify_last_n_dialogue(dialogue, last_n=None) - return scores - - -class SatisfactionAverageMetric(SatisfactionMetric): - """Aggregates satisfaction scores and returns average per agent.""" - - @property - def name(self) -> str: - return "satisfaction.average" - - def compute(self, dialogues: List[Dialogue]) -> Dict[str, float]: - raw = super().compute(dialogues) - averages: Dict[str, float] = {} - for agent_id, agent_scores in raw.items(): - if len(agent_scores) == 0: - averages[agent_id] = 0.0 - else: - averages[agent_id] = sum(agent_scores.values()) / len( - agent_scores - ) - return averages + def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float: + """Computes the satisfaction score for a single dialogue.""" + return self.classifier.classify_last_n_dialogue(dialogue, last_n=None) + + @staticmethod + def get_average(agent_scores: Dict[str, float]) -> float: + """Returns the average score for an agent's dialogues.""" + return mean(agent_scores.values()) if agent_scores else 0.0 + + @staticmethod + def get_stdev(agent_scores: Dict[str, float]) -> float: + """Returns the standard deviation of scores for an agent's dialogues.""" + if len(agent_scores) < 2: + return 0.0 + return stdev(agent_scores.values()) + + @staticmethod + def get_max(agent_scores: Dict[str, float]) -> float: + """Returns the maximum score for an agent's dialogues.""" + return max(agent_scores.values()) if agent_scores else 0.0 + + @staticmethod + def get_min(agent_scores: Dict[str, float]) -> float: + """Returns the minimum score for an agent's dialogues.""" + return min(agent_scores.values()) if agent_scores else 0.0 diff --git a/scripts/evaluation/utility_evaluation.py b/scripts/evaluation/utility_evaluation.py index ec898678..4106624c 100644 --- a/scripts/evaluation/utility_evaluation.py +++ b/scripts/evaluation/utility_evaluation.py @@ -16,11 +16,48 @@ import argparse import json +from collections import defaultdict +from typing import Dict from dialoguekit.utils.dialogue_reader import json_to_dialogues from scripts.evaluation.utility_metric import UtilityMetric +def get_summary( + scores: Dict[str, Dict[str, Dict[str, float]]], +) -> None: + """Displays a summary of the utility evaluation. + + Args: + scores: Agent_id -> conversation_id -> utility metrics dict. + """ + summary: dict = defaultdict( + lambda: {"total_dialogues": 0, "success_rate": 0, "srrr": 0, "rdl": 0} + ) + for agent_id, agent_scores in scores.items(): + for conv_metrics in agent_scores.values(): + summary[agent_id]["total_dialogues"] += 1 + summary[agent_id]["success_rate"] += conv_metrics["success"] + summary[agent_id]["srrr"] += conv_metrics[ + "successful_recommendation_round_ratio" + ] + summary[agent_id]["rdl"] += conv_metrics[ + "reward_per_dialogue_length" + ] + + for agent_id, stats in summary.items(): + total = stats["total_dialogues"] + print(f"Agent: {agent_id}") + print(f"\tTotal Dialogues: {total}") + print(f"\tSuccess Rate: {stats['success_rate'] / total:.4f}") + print( + "\tSuccessful Recommendation Round Ratio: " + f"{stats['srrr'] / total:.4f}" + ) + print(f"\tReward-per-Dialogue-Length: {stats['rdl'] / total:.4f}") + print() + + def parse_args() -> argparse.Namespace: """Parses command-line arguments. @@ -75,7 +112,7 @@ def parse_args() -> argparse.Namespace: dialogues = json_to_dialogues(args.annotated_dialogues) metric = UtilityMetric(args.user_nlu_config, args.agent_nlu_config) - dialogues = metric.compute( + scores = metric.evaluate_agents( dialogues, recommendation_intent_labels=args.recommendation_intent_labels, acceptance_intent_labels=args.accept_intent_labels, @@ -84,8 +121,6 @@ def parse_args() -> argparse.Namespace: if args.output: with open(args.output, "w") as f: - json.dump( - [dialogue.to_dict() for dialogue in dialogues], f, indent=2 - ) + json.dump(scores, f, indent=2) - metric.get_summary(dialogues) + get_summary(scores) diff --git a/scripts/evaluation/utility_metric.py b/scripts/evaluation/utility_metric.py index f59d92e6..cf17604e 100644 --- a/scripts/evaluation/utility_metric.py +++ b/scripts/evaluation/utility_metric.py @@ -1,10 +1,9 @@ """Utility metric class implementation. -Encapsulates the logic from `utility_evaluation.py` into a `BaseMetric`. +Encapsulates the logic from `utility_evaluation.py` into a `Metric`. """ -from collections import defaultdict -from typing import Dict, List, Tuple +from typing import Any, Dict, List, Optional, Tuple, cast from confuse import Configuration @@ -14,297 +13,284 @@ from dialoguekit.nlu.nlu import NLU from dialoguekit.participant.participant import DialogueParticipant from usersimcrs.utils.simulation_utils import get_NLU -from scripts.evaluation.base_metric import BaseMetric +from scripts.evaluation.base_metric import Metric -def annotate_dialogue( - dialogue: Dialogue, user_nlu: NLU, agent_nlu: NLU -) -> Dialogue: - """Annotates utterances with dialogue acts. - - Args: - dialogue: Dialogue to be annotated. - user_nlu: User NLU module. - agent_nlu: Agent NLU module. - - Returns: - Annotated dialogue. - """ - for i, utterance in enumerate(dialogue.utterances): - if not isinstance(utterance, AnnotatedUtterance): - dialogue.utterances[i] = AnnotatedUtterance.from_utterance( - utterance - ) - - if len(utterance.dialogue_acts) > 0: - continue - - if utterance.participant == DialogueParticipant.USER: - dialogue.utterances[ - i - ].dialogue_acts = user_nlu.extract_dialogue_acts(utterance) - elif utterance.participant == DialogueParticipant.AGENT: - dialogue.utterances[ - i - ].dialogue_acts = agent_nlu.extract_dialogue_acts(utterance) - else: - raise ValueError(f"Unknown participant: {utterance.participant}") - return dialogue - - -def annotate_dialogues( - dialogues: List[Dialogue], user_nlu: NLU, agent_nlu: NLU -) -> List[Dialogue]: - """Annotates dialogues with dialogue acts. - - Args: - dialogues: Dialogues. - user_nlu: User NLU module. - agent_nlu: Agent NLU module. - - Returns: - Annotated dialogues. - """ - # TODO: Move this to DialogueKit - # See: https://github.com/iai-group/UserSimCRS/issues/219 - return [ - annotate_dialogue(dialogue, user_nlu, agent_nlu) - for dialogue in dialogues - ] - - -def _get_recommendation_rounds( - dialogue: Dialogue, recommendation_intents: List[Intent] -) -> List[List[AnnotatedUtterance]]: - rounds: List[List[AnnotatedUtterance]] = [] - current_round: List[AnnotatedUtterance] = [] - for utterance in dialogue.utterances: - if any( - intent in utterance.get_intents() - for intent in recommendation_intents - ): - if current_round: - rounds.append(current_round) - current_round = [utterance] - else: - current_round.append(utterance) - return rounds - - -def _is_recommendation_accepted( - round: List[AnnotatedUtterance], - acceptance_intents: List[Intent], - rejection_intents: List[Intent], -) -> bool: - b_accepted = False - for utterance in round: - if utterance.participant == DialogueParticipant.USER: - intents = utterance.get_intents() - if any(intent in acceptance_intents for intent in intents): - b_accepted = True - elif any(intent in rejection_intents for intent in intents): - return False - return b_accepted - - -def assess_dialogue( - dialogue: Dialogue, - recommendation_intents: List[Intent], - acceptance_intents: List[Intent], - rejection_intents: List[Intent], -) -> Tuple[int, int, int]: - """Assesses the utility of the dialogue. - - Args: - dialogue: Dialogue. - recommendation_intents: Intents corresponding to recommendation. - acceptance_intents: Intents corresponding to acceptance. - rejection_intents: Intents corresponding to rejection. - - Returns: - Tuple of number of accepted recommendations, successful recommendation - rounds and total recommendation rounds. - """ - # TODO: Optimize overall assessment to avoid multiple iterations over - # utterances. - rounds = _get_recommendation_rounds(dialogue, recommendation_intents) - successful_rounds = 0 - for round in rounds: - if _is_recommendation_accepted( - round, acceptance_intents, rejection_intents - ): - successful_rounds += 1 - - nb_accepted_recommendations = sum( - 1 - for utterance in dialogue.utterances - if utterance.participant == DialogueParticipant.USER - and any( - intent in acceptance_intents for intent in utterance.get_intents() - ) - ) - return nb_accepted_recommendations, successful_rounds, len(rounds) - - -class UtilityMetric(BaseMetric): +class UtilityMetric(Metric): """Computes utility metrics for dialogues. Constructor takes paths to user and agent NLU configuration files. """ - def __init__(self, user_nlu_config_path: str, agent_nlu_config_path: str): - super().__init__() + def __init__( + self, + user_nlu_config_path: str, + agent_nlu_config_path: str, + name: str = "utility", + ): + super().__init__(name) self.user_nlu_config_path = user_nlu_config_path self.agent_nlu_config_path = agent_nlu_config_path + self._user_nlu: Optional[NLU] = None + self._agent_nlu: Optional[NLU] = None + + def _annotate_dialogue( + self, dialogue: Dialogue, user_nlu: NLU, agent_nlu: NLU + ) -> Dialogue: + """Annotates utterances with dialogue acts. + + Args: + dialogue: Dialogue to be annotated. + user_nlu: User NLU module. + agent_nlu: Agent NLU module. + + Returns: + Annotated dialogue. + """ + for i, utterance in enumerate(dialogue.utterances): + if not isinstance(utterance, AnnotatedUtterance): + dialogue.utterances[i] = AnnotatedUtterance.from_utterance( + utterance + ) - @property - def name(self) -> str: - return "utility" - - def _load_nlus(self) -> Tuple[NLU, NLU]: - user_nlu_config = Configuration("User NLU Configuration") - user_nlu_config.set_file(self.user_nlu_config_path) - user_nlu = get_NLU(user_nlu_config) - - agent_nlu_config = Configuration("Agent NLU Configuration") - agent_nlu_config.set_file(self.agent_nlu_config_path) - agent_nlu = get_NLU(agent_nlu_config) - - return user_nlu, agent_nlu + if len(utterance.dialogue_acts) > 0: + continue + + if utterance.participant == DialogueParticipant.USER: + dialogue.utterances[ + i + ].dialogue_acts = user_nlu.extract_dialogue_acts(utterance) + elif utterance.participant == DialogueParticipant.AGENT: + dialogue.utterances[ + i + ].dialogue_acts = agent_nlu.extract_dialogue_acts(utterance) + else: + raise ValueError( + f"Unknown participant: {utterance.participant}" + ) + return dialogue - def compute( - self, - dialogues: List[Dialogue], - recommendation_intent_labels: List[str] = ["REC-S", "REC-E"], - acceptance_intent_labels: List[str] = ["ACC"], - rejection_intent_labels: List[str] = ["REJ"], + def _annotate_dialogues( + self, dialogues: List[Dialogue], user_nlu: NLU, agent_nlu: NLU ) -> List[Dialogue]: - user_nlu, agent_nlu = self._load_nlus() - - dialogues = annotate_dialogues(dialogues, user_nlu, agent_nlu) - - recommendation_intents = [ - Intent(label) for label in recommendation_intent_labels + """Annotates dialogues with dialogue acts. + + Args: + dialogues: Dialogues. + user_nlu: User NLU module. + agent_nlu: Agent NLU module. + + Returns: + Annotated dialogues. + """ + # TODO: Move this to DialogueKit + # See: https://github.com/iai-group/UserSimCRS/issues/219 + return [ + self._annotate_dialogue(dialogue, user_nlu, agent_nlu) + for dialogue in dialogues ] - acceptance_intents = [ - Intent(label) for label in acceptance_intent_labels - ] - rejection_intents = [Intent(label) for label in rejection_intent_labels] - - for dialogue in dialogues: - ( - nb_accepted_recommendations, - successful_rounds, - total_rounds, - ) = assess_dialogue( - dialogue, - recommendation_intents, - acceptance_intents, - rejection_intents, - ) - dialogue.metadata["utility"] = { - "success": int(successful_rounds > 0), - "successful_recommendation_round_ratio": ( - successful_rounds / total_rounds - if total_rounds > 0 - else 0.0 - ), - "reward_per_dialogue_length": ( - nb_accepted_recommendations / len(dialogue.utterances) - if len(dialogue.utterances) > 0 - else 0.0 - ), - } - - return dialogues - - def get_summary(self, dialogues: List[Dialogue]) -> None: - summary: Dict[str, Dict[str, float]] = defaultdict( - lambda: { - "total_dialogues": 0, - "success_rate": 0, - "srrr": 0, - "rdl": 0, - } - ) - for dialogue in dialogues: - summary[dialogue.agent_id]["total_dialogues"] += 1 - summary[dialogue.agent_id]["success_rate"] += dialogue.metadata[ - "utility" - ]["success"] - summary[dialogue.agent_id]["srrr"] += dialogue.metadata["utility"][ - "successful_recommendation_round_ratio" - ] - summary[dialogue.agent_id]["rdl"] += dialogue.metadata["utility"][ - "reward_per_dialogue_length" - ] - - for agent_id, stats in summary.items(): - total = stats["total_dialogues"] - print(f"Agent: {agent_id}") - print(f"\tTotal Dialogues: {total}") - print(f"\tSuccess Rate: {stats['success_rate'] / total:.4f}") - print( - "\tSuccessful Recommendation Round Ratio: " - f"{stats['srrr'] / total:.4f}" - ) - print(f"\tReward-per-Dialogue-Length: {stats['rdl'] / total:.4f}") - print() - - -class UtilitySuccessMetric(UtilityMetric): - """Extracts per-dialogue success flag from utility analysis.""" - @property - def name(self) -> str: - return "utility.success" - - def compute(self, dialogues: List[Dialogue], *args, **kwargs): - dialogues = super().compute(dialogues, *args, **kwargs) - - results: Dict[str, Dict[int, int]] = defaultdict(dict) - for i, dialogue in enumerate(dialogues): - results[dialogue.agent_id][i] = int( - dialogue.metadata.get("utility", {}).get("success", 0) + def _get_recommendation_rounds( + self, dialogue: Dialogue, recommendation_intents: List[Intent] + ) -> List[List[AnnotatedUtterance]]: + """Gets utterances per recommendation round. + + Args: + dialogue: Dialogue. + recommendation_intents: Intents corresponding to recommendation. + + Returns: + Utterances per recommendation round. + """ + rounds: List[List[AnnotatedUtterance]] = [] + current_round: List[AnnotatedUtterance] = [] + for utterance in dialogue.utterances: + if any( + intent in utterance.get_intents() + for intent in recommendation_intents + ): + if current_round: + rounds.append(current_round) + current_round = [utterance] + else: + current_round.append(utterance) + return rounds + + def _is_recommendation_accepted( + self, + round: List[AnnotatedUtterance], + acceptance_intents: List[Intent], + rejection_intents: List[Intent], + ) -> bool: + """Assesses whether the recommendation was accepted. + + Args: + round: Utterances in recommendation round. + acceptance_intents: Intents corresponding to acceptance. + rejection_intents: Intents corresponding to rejection. + + Returns: + True if the recommendation was accepted, False otherwise. + """ + b_accepted = False + for utterance in round: + if utterance.participant == DialogueParticipant.USER: + intents = utterance.get_intents() + if any(intent in acceptance_intents for intent in intents): + b_accepted = True + elif any(intent in rejection_intents for intent in intents): + return False + return b_accepted + + def _assess_dialogue( + self, + dialogue: Dialogue, + recommendation_intents: List[Intent], + acceptance_intents: List[Intent], + rejection_intents: List[Intent], + ) -> Tuple[int, int, int]: + """Assesses the utility of the dialogue. + + Args: + dialogue: Dialogue. + recommendation_intents: Intents corresponding to recommendation. + acceptance_intents: Intents corresponding to acceptance. + rejection_intents: Intents corresponding to rejection. + + Returns: + Tuple of number of accepted recommendations, successful + recommendation rounds and total recommendation rounds. + """ + # TODO: Optimize overall assessment to avoid multiple iterations over + # utterances. + rounds = self._get_recommendation_rounds( + dialogue, recommendation_intents + ) + successful_rounds = 0 + for round in rounds: + if self._is_recommendation_accepted( + round, acceptance_intents, rejection_intents + ): + successful_rounds += 1 + + nb_accepted_recommendations = sum( + 1 + for utterance in dialogue.utterances + if utterance.participant == DialogueParticipant.USER + and any( + intent in acceptance_intents + for intent in utterance.get_intents() ) - return results - - -class UtilitySRRRMetric(UtilityMetric): - """Extracts successful recommendation round ratio per dialogue.""" - - @property - def name(self) -> str: - return "utility.successful_recommendation_round_ratio" + ) + return nb_accepted_recommendations, successful_rounds, len(rounds) - def compute(self, dialogues: List[Dialogue], *args, **kwargs): - dialogues = super().compute(dialogues, *args, **kwargs) + def _load_nlus(self) -> Tuple[NLU, NLU]: + """Returns (cached) user and agent NLU modules.""" + if self._user_nlu is None: + # NLU module for user utterances + user_nlu_config = Configuration("User NLU Configuration") + user_nlu_config.set_file(self.user_nlu_config_path) + self._user_nlu = get_NLU(user_nlu_config) + if self._agent_nlu is None: + # NLU module for agent utterances + agent_nlu_config = Configuration("Agent NLU Configuration") + agent_nlu_config.set_file(self.agent_nlu_config_path) + self._agent_nlu = get_NLU(agent_nlu_config) + return self._user_nlu, self._agent_nlu + + def _get_intent_lists(self, **kwargs: Any) -> Tuple[List[Intent], ...]: + """Builds intent lists from kwargs.""" + rec_labels = kwargs.get( + "recommendation_intent_labels", ["REC-S", "REC-E"] + ) + acc_labels = kwargs.get("acceptance_intent_labels", ["ACC"]) + rej_labels = kwargs.get("rejection_intent_labels", ["REJ"]) + return ( + [Intent(label) for label in rec_labels], + [Intent(label) for label in acc_labels], + [Intent(label) for label in rej_labels], + ) - results: Dict[str, Dict[int, float]] = defaultdict(dict) - for i, dialogue in enumerate(dialogues): - results[dialogue.agent_id][i] = float( - dialogue.metadata.get("utility", {}).get( - "successful_recommendation_round_ratio", 0.0 - ) + def evaluate_dialogues( + self, dialogues: List[Dialogue], **kwargs: Any + ) -> Dict[str, Dict[str, float]]: + """Computes all utility metrics for every dialogue. + + Overrides base to return full metrics dict per dialogue rather than + a single float, since utility evaluation aggregates SR, SRRR, and RDL. + + Returns: + conversation_id -> metrics dict with keys: success, + successful_recommendation_round_ratio, reward_per_dialogue_length. + """ + return { + dialogue.conversation_id: self._get_utility_metrics( + dialogue, **kwargs ) - return results - - -class UtilityRDLMetric(UtilityMetric): - """Extracts reward-per-dialogue-length per dialogue.""" - - @property - def name(self) -> str: - return "utility.reward_per_dialogue_length" - - def compute(self, dialogues: List[Dialogue], *args, **kwargs): - dialogues = super().compute(dialogues, *args, **kwargs) - - results: Dict[str, Dict[int, float]] = defaultdict(dict) - for i, dialogue in enumerate(dialogues): - results[dialogue.agent_id][i] = float( - dialogue.metadata.get("utility", {}).get( - "reward_per_dialogue_length", 0.0 - ) + for dialogue in dialogues + } + + def evaluate_agents( + self, dialogues: List[Dialogue], **kwargs: Any + ) -> Dict[str, Dict[str, Dict[str, float]]]: + """Computes utility metrics per agent, returning full metrics per + dialogue. + + Returns: + agent_id -> conversation_id -> metrics dict (success, srrr, rdl). + """ + result = super().evaluate_agents(dialogues, **kwargs) + return cast(Dict[str, Dict[str, Dict[str, float]]], result) + + def _get_utility_metrics( + self, dialogue: Dialogue, **kwargs: Any + ) -> Dict[str, float]: + """Returns full utility dict for one dialogue.""" + user_nlu, agent_nlu = self._load_nlus() + self._annotate_dialogue(dialogue, user_nlu, agent_nlu) + ( + recommendation_intents, + acceptance_intents, + rejection_intents, + ) = self._get_intent_lists(**kwargs) + ( + nb_accepted_recommendations, + successful_rounds, + total_rounds, + ) = self._assess_dialogue( + dialogue, + recommendation_intents, + acceptance_intents, + rejection_intents, + ) + return { + "success": float(successful_rounds > 0), + "successful_recommendation_round_ratio": ( + successful_rounds / total_rounds if total_rounds > 0 else 0.0 + ), + "reward_per_dialogue_length": ( + nb_accepted_recommendations / len(dialogue.utterances) + if dialogue.utterances + else 0.0 + ), + } + + def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float: + """Computes one utility metric for a single dialogue. + + Args: + dialogue: Dialogue to evaluate. + metric: One of "success", "successful_recommendation_round_ratio", + "reward_per_dialogue_length". Default "success". + + Returns: + The selected metric value as float. + """ + metrics = self._get_utility_metrics(dialogue, **kwargs) + metric = kwargs.get("metric", "success") + if metric not in metrics: + raise ValueError( + f"Unknown metric '{metric}'. " + f"Expected one of {list(metrics.keys())}" ) - return results + return metrics[metric] diff --git a/tests/evaluation/test_quality_metric.py b/tests/evaluation/test_quality_metric.py new file mode 100644 index 00000000..2c409806 --- /dev/null +++ b/tests/evaluation/test_quality_metric.py @@ -0,0 +1,83 @@ +"""Tests for QualityMetric.""" + +from unittest.mock import MagicMock, patch + +import pytest + +from dialoguekit.utils.dialogue_reader import json_to_dialogues + +from scripts.evaluation.quality_metric import QualityMetric + + +@pytest.fixture +def dialogues(): + """Load test dialogues.""" + return json_to_dialogues( + "tests/data/annotated_dialogues.json", + agent_ids=["Agent"], + user_ids=["User"], + ) + + +@pytest.fixture +def mock_ollama(): + """Mock Ollama LLM interface that returns fixed score JSON.""" + interface = MagicMock() + interface.get_llm_api_response.return_value = ( + '{"score": 4, "score_explanation": "good"}' + ) + return interface + + +@pytest.fixture +def metric(mock_ollama): + """QualityMetric with mocked Ollama interface.""" + with patch.object( + QualityMetric, "_get_ollama_interface", return_value=mock_ollama + ): + yield QualityMetric(ollama_config_path="dummy_config.json") + + +def test_evaluate_dialogue( + metric: QualityMetric, mock_ollama, dialogues +) -> None: + """Test evaluate_dialogue returns mean of aspect scores for a dialogue.""" + dialogue = dialogues[0] + score = metric.evaluate_dialogue(dialogue) + assert score == 4.0 + assert mock_ollama.get_llm_api_response.call_count == len(metric.rubrics) + + +def test_evaluate_dialogue_with_aspects( + metric: QualityMetric, mock_ollama, dialogues +) -> None: + """Test evaluate_dialogue with aspects kwarg calls LLM only for aspects.""" + dialogue = dialogues[0] + aspects = ["REC_RELEVANCE", "FLUENCY"] + score = metric.evaluate_dialogue(dialogue, aspects=aspects) + assert score == 4.0 + assert mock_ollama.get_llm_api_response.call_count == 2 + + +def test_evaluate_dialogues( + metric: QualityMetric, mock_ollama, dialogues +) -> None: + """Test evaluate_dialogues returns conversation_id -> score.""" + result = metric.evaluate_dialogues(dialogues) + assert len(result) == len(dialogues) + for dialogue in dialogues: + assert dialogue.conversation_id in result + assert result[dialogue.conversation_id] == 4.0 + expected_calls = len(dialogues) * len(metric.rubrics) + assert mock_ollama.get_llm_api_response.call_count == expected_calls + + +def test_evaluate_agents(metric: QualityMetric, dialogues) -> None: + """Test evaluate_agents returns agent_id -> {conversation_id -> score}.""" + result = metric.evaluate_agents(dialogues) + assert "Agent" in result + agent_scores = result["Agent"] + assert len(agent_scores) == len(dialogues) + for dialogue in dialogues: + assert dialogue.conversation_id in agent_scores + assert agent_scores[dialogue.conversation_id] == 4.0 diff --git a/tests/evaluation/test_satisfaction_metric.py b/tests/evaluation/test_satisfaction_metric.py new file mode 100644 index 00000000..71231505 --- /dev/null +++ b/tests/evaluation/test_satisfaction_metric.py @@ -0,0 +1,68 @@ +"""Tests for SatisfactionMetric.""" + +from unittest.mock import MagicMock + +import pytest + +from dialoguekit.utils.dialogue_reader import json_to_dialogues + +from scripts.evaluation.satisfaction_metric import SatisfactionMetric + + +@pytest.fixture +def dialogues(): + """Load test dialogues.""" + return json_to_dialogues( + "tests/data/annotated_dialogues.json", + agent_ids=["Agent"], + user_ids=["User"], + ) + + +@pytest.fixture +def mock_classifier(): + """Mock satisfaction classifier that returns fixed scores.""" + classifier = MagicMock() + classifier.classify_last_n_dialogue = MagicMock(return_value=3.5) + return classifier + + +@pytest.fixture +def metric(mock_classifier): + """SatisfactionMetric with mocked classifier.""" + return SatisfactionMetric(classifier=mock_classifier) + + +def test_evaluate_dialogue( + metric: SatisfactionMetric, mock_classifier, dialogues +) -> None: + """Test evaluate_dialogue returns classifier score for a single dialogue.""" + dialogue = dialogues[0] + score = metric.evaluate_dialogue(dialogue) + assert score == 3.5 + mock_classifier.classify_last_n_dialogue.assert_called_once_with( + dialogue, last_n=None + ) + + +def test_evaluate_dialogues( + metric: SatisfactionMetric, mock_classifier, dialogues +) -> None: + """Test evaluate_dialogues returns conversation_id -> score.""" + result = metric.evaluate_dialogues(dialogues) + assert len(result) == len(dialogues) + for dialogue in dialogues: + assert dialogue.conversation_id in result + assert result[dialogue.conversation_id] == 3.5 + assert mock_classifier.classify_last_n_dialogue.call_count == len(dialogues) + + +def test_evaluate_agents(metric: SatisfactionMetric, dialogues) -> None: + """Test evaluate_agents returns agent_id -> {conversation_id -> score}.""" + result = metric.evaluate_agents(dialogues) + assert "Agent" in result + agent_scores = result["Agent"] + assert len(agent_scores) == len(dialogues) + for dialogue in dialogues: + assert dialogue.conversation_id in agent_scores + assert agent_scores[dialogue.conversation_id] == 3.5 diff --git a/tests/evaluation/test_utility_metric.py b/tests/evaluation/test_utility_metric.py new file mode 100644 index 00000000..e12b13dc --- /dev/null +++ b/tests/evaluation/test_utility_metric.py @@ -0,0 +1,77 @@ +"""Tests for UtilityMetric.""" + +from unittest.mock import patch + +import pytest + +from dialoguekit.utils.dialogue_reader import json_to_dialogues + +from scripts.evaluation.utility_metric import UtilityMetric + + +@pytest.fixture +def dialogues(): + """Load test dialogues.""" + return json_to_dialogues( + "tests/data/annotated_dialogues.json", + agent_ids=["Agent"], + user_ids=["User"], + ) + + +FIXED_UTILITY = { + "success": 1.0, + "successful_recommendation_round_ratio": 0.5, + "reward_per_dialogue_length": 0.1, +} + + +@pytest.fixture +def metric(dialogues): + """UtilityMetric returning fixed metrics.""" + with patch.object( + UtilityMetric, "_get_utility_metrics", return_value=FIXED_UTILITY + ): + yield UtilityMetric( + user_nlu_config_path="dummy_user_nlu.yaml", + agent_nlu_config_path="dummy_agent_nlu.yaml", + ) + + +def test_evaluate_dialogue(metric: UtilityMetric, dialogues) -> None: + """Test evaluate_dialogue returns selected metric as float.""" + dialogue = dialogues[0] + assert metric.evaluate_dialogue(dialogue) == 1.0 + assert metric.evaluate_dialogue(dialogue, metric="success") == 1.0 + assert ( + metric.evaluate_dialogue( + dialogue, metric="successful_recommendation_round_ratio" + ) + == 0.5 + ) + assert ( + metric.evaluate_dialogue(dialogue, metric="reward_per_dialogue_length") + == 0.1 + ) + + +def test_evaluate_dialogues(metric: UtilityMetric, dialogues) -> None: + """Test evaluate_dialogues returns conversation_id -> full metrics dict.""" + result = metric.evaluate_dialogues(dialogues) + assert len(result) == len(dialogues) + for dialogue in dialogues: + assert dialogue.conversation_id in result + assert result[dialogue.conversation_id] == FIXED_UTILITY + + +def test_evaluate_agents(metric: UtilityMetric, dialogues) -> None: + """Test evaluate_agents returns agent_id -> {conversation_id -> metrics + dict}.""" + result = metric.evaluate_agents(dialogues) + assert "Agent" in result + agent_scores = result["Agent"] + assert len(agent_scores) == len(dialogues) + for dialogue in dialogues: + assert dialogue.conversation_id in agent_scores + conv_metrics = agent_scores[dialogue.conversation_id] + assert conv_metrics == FIXED_UTILITY From 800f8c0c22d307506dea8fcecbb8d16a3f6a5ec1 Mon Sep 17 00:00:00 2001 From: Ksenia Blokhina Date: Tue, 3 Mar 2026 11:37:04 +0100 Subject: [PATCH 10/38] improvement/232-create-abstract-class-for-metric fixes --- scripts/evaluation/base_metric.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts/evaluation/base_metric.py b/scripts/evaluation/base_metric.py index f08a3a05..e43e9fc6 100644 --- a/scripts/evaluation/base_metric.py +++ b/scripts/evaluation/base_metric.py @@ -6,7 +6,7 @@ from dialoguekit.core.dialogue import Dialogue -class Metric(ABC): +class BaseMetric(ABC): def __init__(self, name: str) -> None: """Initializes the metric. @@ -41,7 +41,7 @@ def evaluate_dialogues( **kwargs: Additional arguments specific to the metric. Returns: - Dictionary with result per dialogue. + Dictionary with result per dialogue. Keys are conversation IDs. """ return { dialogue.conversation_id: self.evaluate_dialogue(dialogue, **kwargs) @@ -58,7 +58,8 @@ def evaluate_agents( **kwargs: Additional arguments specific to the metric. Returns: - Dictionary with result per agent. + Dictionary with result per agent. Outer keys are agent IDs; + inner dict keys are conversation IDs. """ dialogues_by_agent: Dict[str, List[Dialogue]] = defaultdict(list) for dialogue in dialogues: From 2014e222c5ef3e43ffa54082fdbd9cb245f2a15a Mon Sep 17 00:00:00 2001 From: Ksenia Blokhina Date: Tue, 3 Mar 2026 11:45:32 +0100 Subject: [PATCH 11/38] improvement/232-create-abstract-class-for-metric fixes --- scripts/evaluation/base_metric.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/evaluation/base_metric.py b/scripts/evaluation/base_metric.py index e43e9fc6..9141972b 100644 --- a/scripts/evaluation/base_metric.py +++ b/scripts/evaluation/base_metric.py @@ -58,8 +58,8 @@ def evaluate_agents( **kwargs: Additional arguments specific to the metric. Returns: - Dictionary with result per agent. Outer keys are agent IDs; - inner dict keys are conversation IDs. + Dictionary with result per agent. + Keys are agent IDs and conversation IDs. """ dialogues_by_agent: Dict[str, List[Dialogue]] = defaultdict(list) for dialogue in dialogues: From 363551c268a28acc5cb2235cc24bbc223a33fc78 Mon Sep 17 00:00:00 2001 From: Ksenia Blokhina Date: Tue, 3 Mar 2026 12:16:34 +0100 Subject: [PATCH 12/38] improvement/232-create-abstract-class-for-metric remove agent evaluation --- scripts/evaluation/base_metric.py | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/scripts/evaluation/base_metric.py b/scripts/evaluation/base_metric.py index 9141972b..c99399a2 100644 --- a/scripts/evaluation/base_metric.py +++ b/scripts/evaluation/base_metric.py @@ -1,7 +1,6 @@ """Abstract base class for dialogue evaluation metrics.""" from abc import ABC, abstractmethod -from collections import defaultdict from typing import Any, Dict, List from dialoguekit.core.dialogue import Dialogue @@ -47,24 +46,3 @@ def evaluate_dialogues( dialogue.conversation_id: self.evaluate_dialogue(dialogue, **kwargs) for dialogue in dialogues } - - def evaluate_agents( - self, dialogues: List[Dialogue], **kwargs: Any - ) -> Dict[str, Dict[str, float]]: - """Computes the metric for every agent over their dialogues. - - Args: - dialogues: Dialogues. - **kwargs: Additional arguments specific to the metric. - - Returns: - Dictionary with result per agent. - Keys are agent IDs and conversation IDs. - """ - dialogues_by_agent: Dict[str, List[Dialogue]] = defaultdict(list) - for dialogue in dialogues: - dialogues_by_agent[dialogue.agent_id].append(dialogue) - return { - agent_id: self.evaluate_dialogues(agent_dialogues, **kwargs) - for agent_id, agent_dialogues in dialogues_by_agent.items() - } From a253f74dc053e0ea257443b8f8883a6f20dfb6bb Mon Sep 17 00:00:00 2001 From: Ksenia Blokhina Date: Tue, 3 Mar 2026 14:21:51 +0100 Subject: [PATCH 13/38] improvement/233-create-classes-for-metrics edit classes --- scripts/evaluation/base_metric.py | 25 +---- scripts/evaluation/quality_metric.py | 108 ++++++++++++------- scripts/evaluation/satisfaction_metric.py | 50 ++++----- scripts/evaluation/utility_metric.py | 90 +++++++++------- tests/evaluation/test_quality_metric.py | 39 ++----- tests/evaluation/test_satisfaction_metric.py | 28 +---- tests/evaluation/test_utility_metric.py | 26 +++-- 7 files changed, 175 insertions(+), 191 deletions(-) diff --git a/scripts/evaluation/base_metric.py b/scripts/evaluation/base_metric.py index f08a3a05..c99399a2 100644 --- a/scripts/evaluation/base_metric.py +++ b/scripts/evaluation/base_metric.py @@ -1,12 +1,11 @@ """Abstract base class for dialogue evaluation metrics.""" from abc import ABC, abstractmethod -from collections import defaultdict from typing import Any, Dict, List from dialoguekit.core.dialogue import Dialogue -class Metric(ABC): +class BaseMetric(ABC): def __init__(self, name: str) -> None: """Initializes the metric. @@ -41,29 +40,9 @@ def evaluate_dialogues( **kwargs: Additional arguments specific to the metric. Returns: - Dictionary with result per dialogue. + Dictionary with result per dialogue. Keys are conversation IDs. """ return { dialogue.conversation_id: self.evaluate_dialogue(dialogue, **kwargs) for dialogue in dialogues } - - def evaluate_agents( - self, dialogues: List[Dialogue], **kwargs: Any - ) -> Dict[str, Dict[str, float]]: - """Computes the metric for every agent over their dialogues. - - Args: - dialogues: Dialogues. - **kwargs: Additional arguments specific to the metric. - - Returns: - Dictionary with result per agent. - """ - dialogues_by_agent: Dict[str, List[Dialogue]] = defaultdict(list) - for dialogue in dialogues: - dialogues_by_agent[dialogue.agent_id].append(dialogue) - return { - agent_id: self.evaluate_dialogues(agent_dialogues, **kwargs) - for agent_id, agent_dialogues in dialogues_by_agent.items() - } diff --git a/scripts/evaluation/quality_metric.py b/scripts/evaluation/quality_metric.py index 44b78ed5..a6e0475f 100644 --- a/scripts/evaluation/quality_metric.py +++ b/scripts/evaluation/quality_metric.py @@ -1,16 +1,27 @@ -"""Quality metric class implementation. +"""Script to evaluate dialogue quality using an LLM. -Extracted from the original CLI script in `quality_evaluation.py`. +The script evaluates dialogue quality with regards to five aspects: +- Recommendation relevance +- Communication style +- Fluency +- Conversational flow +- Overall satisfaction + +Each aspect is scored between 1 and 5, where the scores are described in a +dedicated rubric. The scoring is done using a large language model. """ +import argparse import json -from statistics import mean -from typing import Any, List, Optional +from typing import Any, Optional, TYPE_CHECKING + +if TYPE_CHECKING: + pass from dialoguekit.core.dialogue import Dialogue from dialoguekit.participant.participant import DialogueParticipant -from scripts.evaluation.base_metric import Metric +from scripts.evaluation.base_metric import BaseMetric from scripts.evaluation.rubrics.quality_rubrics import QualityRubrics from usersimcrs.llm_interfaces.ollama_interface import OllamaLLMInterface @@ -28,27 +39,47 @@ ) -class QualityMetric(Metric): - """Quality evaluation metric using an LLM backend. - - Returns scores as floats (average across aspects per dialogue). - """ - +class QualityMetric(BaseMetric): def __init__( self, ollama_config_path: str, default_response: str = "", - rubrics: Optional[List[QualityRubrics]] = None, name: str = "quality", ) -> None: super().__init__(name) self.ollama_config_path = ollama_config_path self.default_response = default_response - self.rubrics = rubrics or list(QualityRubrics) self._ollama_interface: Optional[OllamaLLMInterface] = None + @staticmethod + def parse_args() -> argparse.Namespace: + """Parse command-line arguments. + + Returns: + Parsed arguments. + """ + parser = argparse.ArgumentParser() + parser.add_argument( + "--dialogues", + type=str, + required=True, + help="Path to the dialogues.", + ) + parser.add_argument( + "--ollama_config", + type=str, + required=True, + help="Path to the Ollama config file.", + ) + parser.add_argument( + "--output", + type=str, + help="(optional) Path to the output file.", + ) + return parser.parse_args() + def _get_ollama_interface(self) -> OllamaLLMInterface: - """Returns (cached) Ollama LLM interface.""" + """Returns Ollama LLM interface.""" if self._ollama_interface is None: self._ollama_interface = OllamaLLMInterface( self.ollama_config_path, @@ -81,28 +112,31 @@ def _get_prompt( prompt += _PROMPT_EVAL_OUTPUT_FORMAT return prompt - def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float: - """Returns average score across aspects for a single dialogue (1–5).""" - aspects = kwargs.get("aspects") - if aspects: - aspect_enums = [QualityRubrics[asp] for asp in aspects] - else: - aspect_enums = self.rubrics + def evaluate_dialogue( + self, dialogue: Dialogue, aspect: str, **kwargs: Any + ) -> float: + """Returns score for a single aspect of a dialogue. + + Args: + dialogue: Dialogue to evaluate. + aspect: Aspect to evaluate. Must be one of QualityRubrics enum names + + Returns: + Score (1-5) for the specified aspect. + Raises: + ValueError: When the LLM response cannot be parsed. + """ + aspect_enum = QualityRubrics[aspect] ollama_interface = self._get_ollama_interface() - scores: List[float] = [] - - for aspect in aspect_enums: - prompt = self._get_prompt(aspect, dialogue) - response = ollama_interface.get_llm_api_response(prompt) - try: - response = response.replace("\\", "\\\\") - response_dict = json.loads(response) - scores.append(int(response_dict["score"])) - except Exception: - print( - f"Failed to get score for {aspect} dialogue " - f"{dialogue.conversation_id}: {response}" - ) - - return mean(scores) if scores else 0.0 + prompt = self._get_prompt(aspect_enum, dialogue) + response = ollama_interface.get_llm_api_response(prompt) + try: + response = response.replace("\\", "\\\\") + response_dict = json.loads(response) + return float(response_dict["score"]) + except Exception as e: + raise ValueError( + f"Failed to get score for {aspect} dialogue " + f"{dialogue.conversation_id}: {response}" + ) from e diff --git a/scripts/evaluation/satisfaction_metric.py b/scripts/evaluation/satisfaction_metric.py index e7125696..ac5915e0 100644 --- a/scripts/evaluation/satisfaction_metric.py +++ b/scripts/evaluation/satisfaction_metric.py @@ -1,22 +1,20 @@ """Satisfaction metric class implementation. -Wraps DialogueKit's satisfaction classifier into a `Metric` class. +Wraps DialogueKit's satisfaction classifier into a `BaseMetric` class. """ -from statistics import mean, stdev -from typing import Any, Dict, Optional +from typing import Any, Optional -from dialoguekit.core.dialogue import Dialogue # type: ignore +from dialoguekit.core.dialogue import Dialogue from dialoguekit.nlu.models.satisfaction_classifier import ( SatisfactionClassifierSVM, ) +import argparse -from scripts.evaluation.base_metric import Metric +from scripts.evaluation.base_metric import BaseMetric -class SatisfactionMetric(Metric): - """Wraps the `SatisfactionClassifierSVM` to compute satisfaction scores.""" - +class SatisfactionMetric(BaseMetric): def __init__( self, classifier: Optional[SatisfactionClassifierSVM] = None, @@ -29,24 +27,18 @@ def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float: """Computes the satisfaction score for a single dialogue.""" return self.classifier.classify_last_n_dialogue(dialogue, last_n=None) - @staticmethod - def get_average(agent_scores: Dict[str, float]) -> float: - """Returns the average score for an agent's dialogues.""" - return mean(agent_scores.values()) if agent_scores else 0.0 - - @staticmethod - def get_stdev(agent_scores: Dict[str, float]) -> float: - """Returns the standard deviation of scores for an agent's dialogues.""" - if len(agent_scores) < 2: - return 0.0 - return stdev(agent_scores.values()) - - @staticmethod - def get_max(agent_scores: Dict[str, float]) -> float: - """Returns the maximum score for an agent's dialogues.""" - return max(agent_scores.values()) if agent_scores else 0.0 - - @staticmethod - def get_min(agent_scores: Dict[str, float]) -> float: - """Returns the minimum score for an agent's dialogues.""" - return min(agent_scores.values()) if agent_scores else 0.0 + @classmethod + def parse_args(self) -> argparse.Namespace: + """Parses command-line arguments. + + Returns: + Parsed arguments. + """ + parser = argparse.ArgumentParser() + parser.add_argument( + "--dialogues", + type=str, + required=True, + help="Path to the dialogues.", + ) + return parser.parse_args() diff --git a/scripts/evaluation/utility_metric.py b/scripts/evaluation/utility_metric.py index cf17604e..88617683 100644 --- a/scripts/evaluation/utility_metric.py +++ b/scripts/evaluation/utility_metric.py @@ -1,11 +1,12 @@ """Utility metric class implementation. -Encapsulates the logic from `utility_evaluation.py` into a `Metric`. +Encapsulates the logic from `utility_evaluation.py` into a `BaseMetric`. """ -from typing import Any, Dict, List, Optional, Tuple, cast +from typing import Any, Dict, List, Optional, Tuple from confuse import Configuration +import argparse from dialoguekit.core.annotated_utterance import AnnotatedUtterance from dialoguekit.core.dialogue import Dialogue @@ -13,10 +14,10 @@ from dialoguekit.nlu.nlu import NLU from dialoguekit.participant.participant import DialogueParticipant from usersimcrs.utils.simulation_utils import get_NLU -from scripts.evaluation.base_metric import Metric +from scripts.evaluation.base_metric import BaseMetric -class UtilityMetric(Metric): +class UtilityMetric(BaseMetric): """Computes utility metrics for dialogues. Constructor takes paths to user and agent NLU configuration files. @@ -34,6 +35,54 @@ def __init__( self._user_nlu: Optional[NLU] = None self._agent_nlu: Optional[NLU] = None + @classmethod + def parse_args(self) -> argparse.Namespace: + """Parses command-line arguments. + + Returns: + Parsed command-line arguments. + """ + parser = argparse.ArgumentParser(prog="utility_evaluation.py") + parser.add_argument( + "annotated_dialogues", + type=str, + help="Annotated dialogues JSON file.", + ) + parser.add_argument( + "user_nlu_config", + type=str, + help="User NLU configuration file.", + ) + parser.add_argument( + "agent_nlu_config", + type=str, + help="Agent NLU configuration file.", + ) + parser.add_argument( + "--reject_intent_labels", + nargs="+", + default=["REJ"], + help="Intent labels corresponding to rejection.", + ) + parser.add_argument( + "--accept_intent_labels", + nargs="+", + default=["ACC"], + help="Intent labels corresponding to acceptance.", + ) + parser.add_argument( + "--recommendation_intent_labels", + nargs="+", + default=["REC-S", "REC-E"], + help="Intent labels corresponding to recommendation.", + ) + parser.add_argument( + "--output", + type=str, + help="Output file to save annotated dialogues with utility metrics", + ) + return parser.parse_args() + def _annotate_dialogue( self, dialogue: Dialogue, user_nlu: NLU, agent_nlu: NLU ) -> Dialogue: @@ -211,37 +260,6 @@ def _get_intent_lists(self, **kwargs: Any) -> Tuple[List[Intent], ...]: [Intent(label) for label in rej_labels], ) - def evaluate_dialogues( - self, dialogues: List[Dialogue], **kwargs: Any - ) -> Dict[str, Dict[str, float]]: - """Computes all utility metrics for every dialogue. - - Overrides base to return full metrics dict per dialogue rather than - a single float, since utility evaluation aggregates SR, SRRR, and RDL. - - Returns: - conversation_id -> metrics dict with keys: success, - successful_recommendation_round_ratio, reward_per_dialogue_length. - """ - return { - dialogue.conversation_id: self._get_utility_metrics( - dialogue, **kwargs - ) - for dialogue in dialogues - } - - def evaluate_agents( - self, dialogues: List[Dialogue], **kwargs: Any - ) -> Dict[str, Dict[str, Dict[str, float]]]: - """Computes utility metrics per agent, returning full metrics per - dialogue. - - Returns: - agent_id -> conversation_id -> metrics dict (success, srrr, rdl). - """ - result = super().evaluate_agents(dialogues, **kwargs) - return cast(Dict[str, Dict[str, Dict[str, float]]], result) - def _get_utility_metrics( self, dialogue: Dialogue, **kwargs: Any ) -> Dict[str, float]: @@ -276,7 +294,7 @@ def _get_utility_metrics( } def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float: - """Computes one utility metric for a single dialogue. + """Computes one utility metric for a dialogue. Args: dialogue: Dialogue to evaluate. diff --git a/tests/evaluation/test_quality_metric.py b/tests/evaluation/test_quality_metric.py index 2c409806..d9882577 100644 --- a/tests/evaluation/test_quality_metric.py +++ b/tests/evaluation/test_quality_metric.py @@ -1,11 +1,8 @@ """Tests for QualityMetric.""" from unittest.mock import MagicMock, patch - import pytest - from dialoguekit.utils.dialogue_reader import json_to_dialogues - from scripts.evaluation.quality_metric import QualityMetric @@ -21,7 +18,7 @@ def dialogues(): @pytest.fixture def mock_ollama(): - """Mock Ollama LLM interface that returns fixed score JSON.""" + """Mock Ollama LLM interface.""" interface = MagicMock() interface.get_llm_api_response.return_value = ( '{"score": 4, "score_explanation": "good"}' @@ -31,7 +28,6 @@ def mock_ollama(): @pytest.fixture def metric(mock_ollama): - """QualityMetric with mocked Ollama interface.""" with patch.object( QualityMetric, "_get_ollama_interface", return_value=mock_ollama ): @@ -41,43 +37,30 @@ def metric(mock_ollama): def test_evaluate_dialogue( metric: QualityMetric, mock_ollama, dialogues ) -> None: - """Test evaluate_dialogue returns mean of aspect scores for a dialogue.""" + """Test evaluate_dialogue returns score for REC_RELEVANCE aspect.""" dialogue = dialogues[0] - score = metric.evaluate_dialogue(dialogue) + score = metric.evaluate_dialogue(dialogue, aspect="REC_RELEVANCE") assert score == 4.0 - assert mock_ollama.get_llm_api_response.call_count == len(metric.rubrics) + assert mock_ollama.get_llm_api_response.call_count == 1 -def test_evaluate_dialogue_with_aspects( +def test_evaluate_dialogue_different_aspect( metric: QualityMetric, mock_ollama, dialogues ) -> None: - """Test evaluate_dialogue with aspects kwarg calls LLM only for aspects.""" + """Test evaluate_dialogue with FLUENCY aspect.""" dialogue = dialogues[0] - aspects = ["REC_RELEVANCE", "FLUENCY"] - score = metric.evaluate_dialogue(dialogue, aspects=aspects) + score = metric.evaluate_dialogue(dialogue, aspect="FLUENCY") assert score == 4.0 - assert mock_ollama.get_llm_api_response.call_count == 2 + assert mock_ollama.get_llm_api_response.call_count == 1 def test_evaluate_dialogues( metric: QualityMetric, mock_ollama, dialogues ) -> None: - """Test evaluate_dialogues returns conversation_id -> score.""" - result = metric.evaluate_dialogues(dialogues) + """Test evaluate_dialogues with for COM_STYLE aspect.""" + result = metric.evaluate_dialogues(dialogues, aspect="COM_STYLE") assert len(result) == len(dialogues) for dialogue in dialogues: assert dialogue.conversation_id in result assert result[dialogue.conversation_id] == 4.0 - expected_calls = len(dialogues) * len(metric.rubrics) - assert mock_ollama.get_llm_api_response.call_count == expected_calls - - -def test_evaluate_agents(metric: QualityMetric, dialogues) -> None: - """Test evaluate_agents returns agent_id -> {conversation_id -> score}.""" - result = metric.evaluate_agents(dialogues) - assert "Agent" in result - agent_scores = result["Agent"] - assert len(agent_scores) == len(dialogues) - for dialogue in dialogues: - assert dialogue.conversation_id in agent_scores - assert agent_scores[dialogue.conversation_id] == 4.0 + assert mock_ollama.get_llm_api_response.call_count == len(dialogues) diff --git a/tests/evaluation/test_satisfaction_metric.py b/tests/evaluation/test_satisfaction_metric.py index 71231505..4d48c3dc 100644 --- a/tests/evaluation/test_satisfaction_metric.py +++ b/tests/evaluation/test_satisfaction_metric.py @@ -1,11 +1,8 @@ """Tests for SatisfactionMetric.""" from unittest.mock import MagicMock - import pytest - from dialoguekit.utils.dialogue_reader import json_to_dialogues - from scripts.evaluation.satisfaction_metric import SatisfactionMetric @@ -21,7 +18,7 @@ def dialogues(): @pytest.fixture def mock_classifier(): - """Mock satisfaction classifier that returns fixed scores.""" + """Mock satisfaction classifier.""" classifier = MagicMock() classifier.classify_last_n_dialogue = MagicMock(return_value=3.5) return classifier @@ -29,40 +26,23 @@ def mock_classifier(): @pytest.fixture def metric(mock_classifier): - """SatisfactionMetric with mocked classifier.""" return SatisfactionMetric(classifier=mock_classifier) -def test_evaluate_dialogue( - metric: SatisfactionMetric, mock_classifier, dialogues -) -> None: - """Test evaluate_dialogue returns classifier score for a single dialogue.""" +def test_evaluate_dialogue(metric: SatisfactionMetric, dialogues) -> None: + """Test evaluate_dialogue for a single dialogue.""" dialogue = dialogues[0] score = metric.evaluate_dialogue(dialogue) assert score == 3.5 - mock_classifier.classify_last_n_dialogue.assert_called_once_with( - dialogue, last_n=None - ) def test_evaluate_dialogues( metric: SatisfactionMetric, mock_classifier, dialogues ) -> None: - """Test evaluate_dialogues returns conversation_id -> score.""" + """Test evaluate_dialogues for list of dialogues.""" result = metric.evaluate_dialogues(dialogues) assert len(result) == len(dialogues) for dialogue in dialogues: assert dialogue.conversation_id in result assert result[dialogue.conversation_id] == 3.5 assert mock_classifier.classify_last_n_dialogue.call_count == len(dialogues) - - -def test_evaluate_agents(metric: SatisfactionMetric, dialogues) -> None: - """Test evaluate_agents returns agent_id -> {conversation_id -> score}.""" - result = metric.evaluate_agents(dialogues) - assert "Agent" in result - agent_scores = result["Agent"] - assert len(agent_scores) == len(dialogues) - for dialogue in dialogues: - assert dialogue.conversation_id in agent_scores - assert agent_scores[dialogue.conversation_id] == 3.5 diff --git a/tests/evaluation/test_utility_metric.py b/tests/evaluation/test_utility_metric.py index e12b13dc..ec16d66c 100644 --- a/tests/evaluation/test_utility_metric.py +++ b/tests/evaluation/test_utility_metric.py @@ -27,7 +27,7 @@ def dialogues(): @pytest.fixture -def metric(dialogues): +def metric(): """UtilityMetric returning fixed metrics.""" with patch.object( UtilityMetric, "_get_utility_metrics", return_value=FIXED_UTILITY @@ -39,7 +39,7 @@ def metric(dialogues): def test_evaluate_dialogue(metric: UtilityMetric, dialogues) -> None: - """Test evaluate_dialogue returns selected metric as float.""" + """Test evaluate_dialogue returns selected metric.""" dialogue = dialogues[0] assert metric.evaluate_dialogue(dialogue) == 1.0 assert metric.evaluate_dialogue(dialogue, metric="success") == 1.0 @@ -56,22 +56,20 @@ def test_evaluate_dialogue(metric: UtilityMetric, dialogues) -> None: def test_evaluate_dialogues(metric: UtilityMetric, dialogues) -> None: - """Test evaluate_dialogues returns conversation_id -> full metrics dict.""" + """Test evaluate_dialogues returns conversation_id -> metric value.""" result = metric.evaluate_dialogues(dialogues) assert len(result) == len(dialogues) for dialogue in dialogues: assert dialogue.conversation_id in result - assert result[dialogue.conversation_id] == FIXED_UTILITY + assert result[dialogue.conversation_id] == 1.0 -def test_evaluate_agents(metric: UtilityMetric, dialogues) -> None: - """Test evaluate_agents returns agent_id -> {conversation_id -> metrics - dict}.""" - result = metric.evaluate_agents(dialogues) - assert "Agent" in result - agent_scores = result["Agent"] - assert len(agent_scores) == len(dialogues) +def test_evaluate_dialogues_with_specified_metric( + metric: UtilityMetric, dialogues +) -> None: + """Test evaluate_dialogues with specified metric.""" + result = metric.evaluate_dialogues( + dialogues, metric="successful_recommendation_round_ratio" + ) for dialogue in dialogues: - assert dialogue.conversation_id in agent_scores - conv_metrics = agent_scores[dialogue.conversation_id] - assert conv_metrics == FIXED_UTILITY + assert result[dialogue.conversation_id] == 0.5 From 777e00e32a64ae8496ca1abc488d5faacd6f09cc Mon Sep 17 00:00:00 2001 From: Ksenia Blokhina Date: Tue, 3 Mar 2026 14:24:48 +0100 Subject: [PATCH 14/38] improvement/233-create-classes-for-metrics remove old files --- scripts/evaluation/base_metric.py | 48 ------- scripts/evaluation/quality_evaluation.py | 72 ---------- scripts/evaluation/satisfaction_evaluation.py | 48 ------- scripts/evaluation/satisfaction_metric.py | 4 +- scripts/evaluation/utility_evaluation.py | 126 ------------------ 5 files changed, 2 insertions(+), 296 deletions(-) delete mode 100644 scripts/evaluation/base_metric.py delete mode 100644 scripts/evaluation/quality_evaluation.py delete mode 100644 scripts/evaluation/satisfaction_evaluation.py delete mode 100644 scripts/evaluation/utility_evaluation.py diff --git a/scripts/evaluation/base_metric.py b/scripts/evaluation/base_metric.py deleted file mode 100644 index c99399a2..00000000 --- a/scripts/evaluation/base_metric.py +++ /dev/null @@ -1,48 +0,0 @@ -"""Abstract base class for dialogue evaluation metrics.""" - -from abc import ABC, abstractmethod -from typing import Any, Dict, List -from dialoguekit.core.dialogue import Dialogue - - -class BaseMetric(ABC): - def __init__(self, name: str) -> None: - """Initializes the metric. - - Args: - name: Metric name. - """ - self.name = name - - @abstractmethod - def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float: - """Computes the metric for a single dialogue. - - Args: - dialogue: Single dialogue to score. - **kwargs: Additional arguments specific to the metric. - - Raises: - NotImplementedError: When not implemented by a subclass. - - Returns: - Score for the dialogue. - """ - raise NotImplementedError() - - def evaluate_dialogues( - self, dialogues: List[Dialogue], **kwargs: Any - ) -> Dict[str, float]: - """Computes the metric for every dialogue in a given list. - - Args: - dialogues: Dialogues. - **kwargs: Additional arguments specific to the metric. - - Returns: - Dictionary with result per dialogue. Keys are conversation IDs. - """ - return { - dialogue.conversation_id: self.evaluate_dialogue(dialogue, **kwargs) - for dialogue in dialogues - } diff --git a/scripts/evaluation/quality_evaluation.py b/scripts/evaluation/quality_evaluation.py deleted file mode 100644 index 082adde3..00000000 --- a/scripts/evaluation/quality_evaluation.py +++ /dev/null @@ -1,72 +0,0 @@ -"""Script to evaluate dialogue quality using an LLM. - -The script evaluates dialogue quality with regards to five aspects: -- Recommendation relevance -- Communication style -- Fluency -- Conversational flow -- Overall satisfaction - -Each aspect is scored between 1 and 5, where the scores are described in a -dedicated rubric. The scoring is done using a large language model. -""" - -import argparse -import json -import os -from statistics import mean, stdev - -from dialoguekit.utils.dialogue_reader import json_to_dialogues - -from scripts.evaluation.quality_metric import QualityMetric - - -def parse_args() -> argparse.Namespace: - """Parse command-line arguments. - - Returns: - Parsed arguments. - """ - parser = argparse.ArgumentParser() - parser.add_argument( - "--dialogues", - type=str, - required=True, - help="Path to the dialogues.", - ) - parser.add_argument( - "--ollama_config", - type=str, - required=True, - help="Path to the Ollama config file.", - ) - parser.add_argument( - "--output", - type=str, - help="(optional) Path to the output file.", - ) - return parser.parse_args() - - -if __name__ == "__main__": - args = parse_args() - - # Load dialogues - dialogues = json_to_dialogues(args.dialogues) - - metric = QualityMetric(args.ollama_config) - scores = metric.evaluate_agents(dialogues) - - # Save scores (agent_id -> conversation_id -> score) - if args.output: - os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True) - with open(args.output, "w") as f: - json.dump(scores, f, indent=2) - - # Summary - for agent_id, agent_scores in scores.items(): - score_values = list(agent_scores.values()) - print(f"Scores for agent {agent_id}:") - avg_score = mean(score_values) - std_dev = stdev(score_values) if len(score_values) >= 2 else 0.0 - print(f"Average score: {avg_score:.2f} (std dev: {std_dev:.2f})") diff --git a/scripts/evaluation/satisfaction_evaluation.py b/scripts/evaluation/satisfaction_evaluation.py deleted file mode 100644 index 4c2d1890..00000000 --- a/scripts/evaluation/satisfaction_evaluation.py +++ /dev/null @@ -1,48 +0,0 @@ -"""Automatic evaluation of dialogues. - -This script evaluates dialogues with regards to user satisfaction. It uses -DialogueKit's satisfaction classifier, which assigns a score between 1 and 5. -""" - -import argparse - -from dialoguekit.utils.dialogue_reader import json_to_dialogues -from scripts.evaluation.satisfaction_metric import SatisfactionMetric - - -def parse_args() -> argparse.Namespace: - """Parse command-line arguments. - - Returns: - Parsed arguments. - """ - parser = argparse.ArgumentParser() - parser.add_argument( - "--dialogues", - type=str, - required=True, - help="Path to the dialogues.", - ) - return parser.parse_args() - - -if __name__ == "__main__": - args = parse_args() - - # Load dialogues - dialogues = json_to_dialogues(args.dialogues) - print(f"Loaded {len(dialogues)} dialogues.") - - metric = SatisfactionMetric() - scores = metric.evaluate_agents(dialogues) - - # Summary - for agent, agent_scores in scores.items(): - avg_score = metric.get_average(agent_scores) - stdev_score = metric.get_stdev(agent_scores) - max_score = metric.get_max(agent_scores) - min_score = metric.get_min(agent_scores) - print(f"Agent: {agent} / Num. dialogues: {len(agent_scores)}") - print(f"Min score: {min_score}") - print(f"Max score: {max_score}") - print(f"Average score: {avg_score:.3f} (stdev: {stdev_score:.3f})") diff --git a/scripts/evaluation/satisfaction_metric.py b/scripts/evaluation/satisfaction_metric.py index ac5915e0..e774ae11 100644 --- a/scripts/evaluation/satisfaction_metric.py +++ b/scripts/evaluation/satisfaction_metric.py @@ -27,8 +27,8 @@ def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float: """Computes the satisfaction score for a single dialogue.""" return self.classifier.classify_last_n_dialogue(dialogue, last_n=None) - @classmethod - def parse_args(self) -> argparse.Namespace: + @staticmethod + def parse_args() -> argparse.Namespace: """Parses command-line arguments. Returns: diff --git a/scripts/evaluation/utility_evaluation.py b/scripts/evaluation/utility_evaluation.py deleted file mode 100644 index 4106624c..00000000 --- a/scripts/evaluation/utility_evaluation.py +++ /dev/null @@ -1,126 +0,0 @@ -"""Automatic evaluation of dialogues with regards to utility. - -The script computes three user-centric utility metrics proposed by Bernard and -Balog (2025): - -- Success Rate (SR) -- Successful Recommendation Round Ratio (SRRR) -- Reward-per-Dialogue-Length (RDL) - -Reference: -Bernard, Nolwenn, and Krisztian Balog. "Limitations of Current Evaluation -Practices for Conversational Recommender Systems and the Potential of User -Simulation." arXiv preprint arXiv:2510.05624 (2025). -https://arxiv.org/abs/2510.05624 -""" - -import argparse -import json -from collections import defaultdict -from typing import Dict - -from dialoguekit.utils.dialogue_reader import json_to_dialogues -from scripts.evaluation.utility_metric import UtilityMetric - - -def get_summary( - scores: Dict[str, Dict[str, Dict[str, float]]], -) -> None: - """Displays a summary of the utility evaluation. - - Args: - scores: Agent_id -> conversation_id -> utility metrics dict. - """ - summary: dict = defaultdict( - lambda: {"total_dialogues": 0, "success_rate": 0, "srrr": 0, "rdl": 0} - ) - for agent_id, agent_scores in scores.items(): - for conv_metrics in agent_scores.values(): - summary[agent_id]["total_dialogues"] += 1 - summary[agent_id]["success_rate"] += conv_metrics["success"] - summary[agent_id]["srrr"] += conv_metrics[ - "successful_recommendation_round_ratio" - ] - summary[agent_id]["rdl"] += conv_metrics[ - "reward_per_dialogue_length" - ] - - for agent_id, stats in summary.items(): - total = stats["total_dialogues"] - print(f"Agent: {agent_id}") - print(f"\tTotal Dialogues: {total}") - print(f"\tSuccess Rate: {stats['success_rate'] / total:.4f}") - print( - "\tSuccessful Recommendation Round Ratio: " - f"{stats['srrr'] / total:.4f}" - ) - print(f"\tReward-per-Dialogue-Length: {stats['rdl'] / total:.4f}") - print() - - -def parse_args() -> argparse.Namespace: - """Parses command-line arguments. - - Returns: - Parsed command-line arguments. - """ - parser = argparse.ArgumentParser(prog="utility_evaluation.py") - parser.add_argument( - "annotated_dialogues", - type=str, - help="Annotated dialogues JSON file.", - ) - parser.add_argument( - "user_nlu_config", - type=str, - help="User NLU configuration file.", - ) - parser.add_argument( - "agent_nlu_config", - type=str, - help="Agent NLU configuration file.", - ) - parser.add_argument( - "--reject_intent_labels", - nargs="+", - default=["REJ"], - help="Intent labels corresponding to rejection.", - ) - parser.add_argument( - "--accept_intent_labels", - nargs="+", - default=["ACC"], - help="Intent labels corresponding to acceptance.", - ) - parser.add_argument( - "--recommendation_intent_labels", - nargs="+", - default=["REC-S", "REC-E"], - help="Intent labels corresponding to recommendation.", - ) - parser.add_argument( - "--output", - type=str, - help="Output file to save annotated dialogues with utility metrics.", - ) - return parser.parse_args() - - -if __name__ == "__main__": - args = parse_args() - - dialogues = json_to_dialogues(args.annotated_dialogues) - - metric = UtilityMetric(args.user_nlu_config, args.agent_nlu_config) - scores = metric.evaluate_agents( - dialogues, - recommendation_intent_labels=args.recommendation_intent_labels, - acceptance_intent_labels=args.accept_intent_labels, - rejection_intent_labels=args.reject_intent_labels, - ) - - if args.output: - with open(args.output, "w") as f: - json.dump(scores, f, indent=2) - - get_summary(scores) From 5a55b43be7a33958abd42098b7da5c8eb29e1c6c Mon Sep 17 00:00:00 2001 From: Ksenia Blokhina Date: Tue, 3 Mar 2026 14:33:54 +0100 Subject: [PATCH 15/38] improvement/233-create-classes-for-metrics remove extra files --- scripts/__init__.py | 1 - scripts/evaluation/__init__.py | 3 --- 2 files changed, 4 deletions(-) delete mode 100644 scripts/__init__.py delete mode 100644 scripts/evaluation/__init__.py diff --git a/scripts/__init__.py b/scripts/__init__.py deleted file mode 100644 index 5100bd2d..00000000 --- a/scripts/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Scripts package marker to avoid namespace package ambiguity for mypy.""" diff --git a/scripts/evaluation/__init__.py b/scripts/evaluation/__init__.py deleted file mode 100644 index ad40101c..00000000 --- a/scripts/evaluation/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -"""Evaluation helpers package to make imports explicit for type checking.""" - -__all__: list[str] = [] From c5bb1002177fad0fab58b4956f246964f482fae0 Mon Sep 17 00:00:00 2001 From: Ksenia Blokhina Date: Tue, 3 Mar 2026 14:38:46 +0100 Subject: [PATCH 16/38] improvement/233-create-classes-for-metrics remove changes --- usersimcrs/nlu/llm/__init__.py | 12 ++++++------ usersimcrs/utils/simulation_utils.py | 9 +++------ 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/usersimcrs/nlu/llm/__init__.py b/usersimcrs/nlu/llm/__init__.py index 3c608547..be592d99 100644 --- a/usersimcrs/nlu/llm/__init__.py +++ b/usersimcrs/nlu/llm/__init__.py @@ -1,9 +1,9 @@ """Module level init for LLM-based NLU components.""" -"""Module level init for LLM-based NLU components. +from usersimcrs.nlu.llm.llm_dialogue_act_extractor import ( + LLMDialogueActsExtractor, +) -Avoid importing heavy submodules at package import time to keep test -collection lightweight; import submodules explicitly when needed. -""" - -__all__ = ["LLMDialogueActsExtractor"] +__all__ = [ + "LLMDialogueActsExtractor", +] diff --git a/usersimcrs/utils/simulation_utils.py b/usersimcrs/utils/simulation_utils.py index b0ed0c9f..6121723e 100644 --- a/usersimcrs/utils/simulation_utils.py +++ b/usersimcrs/utils/simulation_utils.py @@ -142,12 +142,9 @@ def _get_agenda_based_simulator_config( ratings = Ratings(item_collection) ratings.load_ratings_csv(file_path=config["ratings"].get()) - raw = config["historical_ratings_ratio"].get() - if raw is None: - historical_ratio = 0.8 - else: - historical_ratio = float(raw) - historical_ratings, _ = ratings.create_split(historical_ratio) + historical_ratings, _ = ratings.create_split( + config["historical_ratings_ratio"].get(0.8) + ) preference_model = SimplePreferenceModel( domain, From a1fac9f593f508df97d8f20cea900c5f8878c081 Mon Sep 17 00:00:00 2001 From: Ksenia Blokhina Date: Tue, 3 Mar 2026 14:59:49 +0100 Subject: [PATCH 17/38] improvement/233-create-classes-for-metrics remove changes --- scripts/evaluation/quality_metric.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/evaluation/quality_metric.py b/scripts/evaluation/quality_metric.py index a6e0475f..5a2323fd 100644 --- a/scripts/evaluation/quality_metric.py +++ b/scripts/evaluation/quality_metric.py @@ -135,8 +135,8 @@ def evaluate_dialogue( response = response.replace("\\", "\\\\") response_dict = json.loads(response) return float(response_dict["score"]) - except Exception as e: + except Exception: raise ValueError( f"Failed to get score for {aspect} dialogue " f"{dialogue.conversation_id}: {response}" - ) from e + ) From 4ed5727280def14d42ddd615d3196d7e6d872879 Mon Sep 17 00:00:00 2001 From: Ksenia Blokhina Date: Tue, 3 Mar 2026 15:32:21 +0100 Subject: [PATCH 18/38] move file --- scripts/evaluation/satisfaction_metric.py | 44 +++++++++++++++++++ usersimcrs/evaluation/__init__.py | 5 +++ .../evaluation/base_metric.py | 0 3 files changed, 49 insertions(+) create mode 100644 scripts/evaluation/satisfaction_metric.py create mode 100644 usersimcrs/evaluation/__init__.py rename {scripts => usersimcrs}/evaluation/base_metric.py (100%) diff --git a/scripts/evaluation/satisfaction_metric.py b/scripts/evaluation/satisfaction_metric.py new file mode 100644 index 00000000..78bb6754 --- /dev/null +++ b/scripts/evaluation/satisfaction_metric.py @@ -0,0 +1,44 @@ +"""Satisfaction metric class implementation. + +Wraps DialogueKit's satisfaction classifier into a `BaseMetric` class. +""" + +from typing import Any, Optional + +from dialoguekit.core.dialogue import Dialogue +from dialoguekit.nlu.models.satisfaction_classifier import ( + SatisfactionClassifierSVM, +) +import argparse + +from evaluation.base_metric import BaseMetric + + +class SatisfactionMetric(BaseMetric): + def __init__( + self, + classifier: Optional[SatisfactionClassifierSVM] = None, + name: str = "satisfaction", + ): + super().__init__(name) + self.classifier = classifier or SatisfactionClassifierSVM() + + def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float: + """Computes the satisfaction score for a single dialogue.""" + return self.classifier.classify_last_n_dialogue(dialogue, last_n=None) + + @staticmethod + def parse_args() -> argparse.Namespace: + """Parses command-line arguments. + + Returns: + Parsed arguments. + """ + parser = argparse.ArgumentParser() + parser.add_argument( + "--dialogues", + type=str, + required=True, + help="Path to the dialogues.", + ) + return parser.parse_args() diff --git a/usersimcrs/evaluation/__init__.py b/usersimcrs/evaluation/__init__.py new file mode 100644 index 00000000..c55a4339 --- /dev/null +++ b/usersimcrs/evaluation/__init__.py @@ -0,0 +1,5 @@ +"""Evaluation metrics for dialogue systems.""" + +from usersimcrs.evaluation.base_metric import BaseMetric + +__all__ = ["BaseMetric"] diff --git a/scripts/evaluation/base_metric.py b/usersimcrs/evaluation/base_metric.py similarity index 100% rename from scripts/evaluation/base_metric.py rename to usersimcrs/evaluation/base_metric.py From e6f0ef37e7309c248f4d315da958b111dae97db8 Mon Sep 17 00:00:00 2001 From: Ksenia Blokhina Date: Tue, 3 Mar 2026 15:33:47 +0100 Subject: [PATCH 19/38] move file --- scripts/evaluation/satisfaction_metric.py | 44 ----------------------- 1 file changed, 44 deletions(-) delete mode 100644 scripts/evaluation/satisfaction_metric.py diff --git a/scripts/evaluation/satisfaction_metric.py b/scripts/evaluation/satisfaction_metric.py deleted file mode 100644 index 78bb6754..00000000 --- a/scripts/evaluation/satisfaction_metric.py +++ /dev/null @@ -1,44 +0,0 @@ -"""Satisfaction metric class implementation. - -Wraps DialogueKit's satisfaction classifier into a `BaseMetric` class. -""" - -from typing import Any, Optional - -from dialoguekit.core.dialogue import Dialogue -from dialoguekit.nlu.models.satisfaction_classifier import ( - SatisfactionClassifierSVM, -) -import argparse - -from evaluation.base_metric import BaseMetric - - -class SatisfactionMetric(BaseMetric): - def __init__( - self, - classifier: Optional[SatisfactionClassifierSVM] = None, - name: str = "satisfaction", - ): - super().__init__(name) - self.classifier = classifier or SatisfactionClassifierSVM() - - def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float: - """Computes the satisfaction score for a single dialogue.""" - return self.classifier.classify_last_n_dialogue(dialogue, last_n=None) - - @staticmethod - def parse_args() -> argparse.Namespace: - """Parses command-line arguments. - - Returns: - Parsed arguments. - """ - parser = argparse.ArgumentParser() - parser.add_argument( - "--dialogues", - type=str, - required=True, - help="Path to the dialogues.", - ) - return parser.parse_args() From 431ae4041630683b38bd8a989b1d380b80e74c04 Mon Sep 17 00:00:00 2001 From: Ksenia Blokhina Date: Tue, 3 Mar 2026 19:33:56 +0100 Subject: [PATCH 20/38] refactoring --- scripts/evaluation/satisfaction_metric.py | 44 --- tests/evaluation/test_quality_metric.py | 27 +- tests/evaluation/test_satisfaction_metric.py | 2 +- tests/evaluation/test_utility_metric.py | 131 +++++--- .../evaluation/quality_metric.py | 80 ++--- .../evaluation}/quality_rubrics.py | 0 usersimcrs/evaluation/satisfaction_metric.py | 27 ++ .../evaluation/utility_metric.py | 287 ++++++++---------- 8 files changed, 273 insertions(+), 325 deletions(-) delete mode 100644 scripts/evaluation/satisfaction_metric.py rename {scripts => usersimcrs}/evaluation/quality_metric.py (56%) rename {scripts/evaluation/rubrics => usersimcrs/evaluation}/quality_rubrics.py (100%) create mode 100644 usersimcrs/evaluation/satisfaction_metric.py rename {scripts => usersimcrs}/evaluation/utility_metric.py (55%) diff --git a/scripts/evaluation/satisfaction_metric.py b/scripts/evaluation/satisfaction_metric.py deleted file mode 100644 index e774ae11..00000000 --- a/scripts/evaluation/satisfaction_metric.py +++ /dev/null @@ -1,44 +0,0 @@ -"""Satisfaction metric class implementation. - -Wraps DialogueKit's satisfaction classifier into a `BaseMetric` class. -""" - -from typing import Any, Optional - -from dialoguekit.core.dialogue import Dialogue -from dialoguekit.nlu.models.satisfaction_classifier import ( - SatisfactionClassifierSVM, -) -import argparse - -from scripts.evaluation.base_metric import BaseMetric - - -class SatisfactionMetric(BaseMetric): - def __init__( - self, - classifier: Optional[SatisfactionClassifierSVM] = None, - name: str = "satisfaction", - ): - super().__init__(name) - self.classifier = classifier or SatisfactionClassifierSVM() - - def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float: - """Computes the satisfaction score for a single dialogue.""" - return self.classifier.classify_last_n_dialogue(dialogue, last_n=None) - - @staticmethod - def parse_args() -> argparse.Namespace: - """Parses command-line arguments. - - Returns: - Parsed arguments. - """ - parser = argparse.ArgumentParser() - parser.add_argument( - "--dialogues", - type=str, - required=True, - help="Path to the dialogues.", - ) - return parser.parse_args() diff --git a/tests/evaluation/test_quality_metric.py b/tests/evaluation/test_quality_metric.py index d9882577..c6dac1b3 100644 --- a/tests/evaluation/test_quality_metric.py +++ b/tests/evaluation/test_quality_metric.py @@ -1,9 +1,9 @@ """Tests for QualityMetric.""" -from unittest.mock import MagicMock, patch +from unittest.mock import MagicMock import pytest from dialoguekit.utils.dialogue_reader import json_to_dialogues -from scripts.evaluation.quality_metric import QualityMetric +from usersimcrs.evaluation.quality_metric import QualityMetric @pytest.fixture @@ -17,8 +17,8 @@ def dialogues(): @pytest.fixture -def mock_ollama(): - """Mock Ollama LLM interface.""" +def mock_llm_interface(): + """Mock LLM interface.""" interface = MagicMock() interface.get_llm_api_response.return_value = ( '{"score": 4, "score_explanation": "good"}' @@ -27,35 +27,32 @@ def mock_ollama(): @pytest.fixture -def metric(mock_ollama): - with patch.object( - QualityMetric, "_get_ollama_interface", return_value=mock_ollama - ): - yield QualityMetric(ollama_config_path="dummy_config.json") +def metric(mock_llm_interface): + return QualityMetric(llm_interface=mock_llm_interface) def test_evaluate_dialogue( - metric: QualityMetric, mock_ollama, dialogues + metric: QualityMetric, mock_llm_interface, dialogues ) -> None: """Test evaluate_dialogue returns score for REC_RELEVANCE aspect.""" dialogue = dialogues[0] score = metric.evaluate_dialogue(dialogue, aspect="REC_RELEVANCE") assert score == 4.0 - assert mock_ollama.get_llm_api_response.call_count == 1 + assert mock_llm_interface.get_llm_api_response.call_count == 1 def test_evaluate_dialogue_different_aspect( - metric: QualityMetric, mock_ollama, dialogues + metric: QualityMetric, mock_llm_interface, dialogues ) -> None: """Test evaluate_dialogue with FLUENCY aspect.""" dialogue = dialogues[0] score = metric.evaluate_dialogue(dialogue, aspect="FLUENCY") assert score == 4.0 - assert mock_ollama.get_llm_api_response.call_count == 1 + assert mock_llm_interface.get_llm_api_response.call_count == 1 def test_evaluate_dialogues( - metric: QualityMetric, mock_ollama, dialogues + metric: QualityMetric, mock_llm_interface, dialogues ) -> None: """Test evaluate_dialogues with for COM_STYLE aspect.""" result = metric.evaluate_dialogues(dialogues, aspect="COM_STYLE") @@ -63,4 +60,4 @@ def test_evaluate_dialogues( for dialogue in dialogues: assert dialogue.conversation_id in result assert result[dialogue.conversation_id] == 4.0 - assert mock_ollama.get_llm_api_response.call_count == len(dialogues) + assert mock_llm_interface.get_llm_api_response.call_count == len(dialogues) diff --git a/tests/evaluation/test_satisfaction_metric.py b/tests/evaluation/test_satisfaction_metric.py index 4d48c3dc..787c175b 100644 --- a/tests/evaluation/test_satisfaction_metric.py +++ b/tests/evaluation/test_satisfaction_metric.py @@ -3,7 +3,7 @@ from unittest.mock import MagicMock import pytest from dialoguekit.utils.dialogue_reader import json_to_dialogues -from scripts.evaluation.satisfaction_metric import SatisfactionMetric +from usersimcrs.evaluation.satisfaction_metric import SatisfactionMetric @pytest.fixture diff --git a/tests/evaluation/test_utility_metric.py b/tests/evaluation/test_utility_metric.py index ec16d66c..37eedb25 100644 --- a/tests/evaluation/test_utility_metric.py +++ b/tests/evaluation/test_utility_metric.py @@ -1,4 +1,4 @@ -"""Tests for UtilityMetric.""" +"""Tests for utility metric classes.""" from unittest.mock import patch @@ -6,7 +6,11 @@ from dialoguekit.utils.dialogue_reader import json_to_dialogues -from scripts.evaluation.utility_metric import UtilityMetric +from usersimcrs.evaluation.utility_metric import ( + RewardPerDialogueLengthMetric, + SuccessRateMetric, + SuccessfulRecommendationRoundRatioMetric, +) @pytest.fixture @@ -19,57 +23,96 @@ def dialogues(): ) -FIXED_UTILITY = { - "success": 1.0, - "successful_recommendation_round_ratio": 0.5, - "reward_per_dialogue_length": 0.1, -} +@pytest.fixture +def success_rate_metric(): + return SuccessRateMetric( + user_nlu_config_path="dummy_user_nlu.yaml", + agent_nlu_config_path="dummy_agent_nlu.yaml", + ) + + +@pytest.fixture +def successful_round_ratio_metric(): + return SuccessfulRecommendationRoundRatioMetric( + user_nlu_config_path="dummy_user_nlu.yaml", + agent_nlu_config_path="dummy_agent_nlu.yaml", + ) @pytest.fixture -def metric(): - """UtilityMetric returning fixed metrics.""" - with patch.object( - UtilityMetric, "_get_utility_metrics", return_value=FIXED_UTILITY +def reward_per_dialogue_length_metric(): + return RewardPerDialogueLengthMetric( + user_nlu_config_path="dummy_user_nlu.yaml", + agent_nlu_config_path="dummy_agent_nlu.yaml", + ) + + +def test_success_rate_evaluate_dialogue( + success_rate_metric: SuccessRateMetric, dialogues +) -> None: + """Test SuccessRateMetric.evaluate_dialogue.""" + dialogue = dialogues[0] + with ( + patch.object( + SuccessRateMetric, "_prepare", return_value=(dialogue, [], [], []) + ), + patch.object(SuccessRateMetric, "_assess_dialogue", return_value=1), ): - yield UtilityMetric( - user_nlu_config_path="dummy_user_nlu.yaml", - agent_nlu_config_path="dummy_agent_nlu.yaml", - ) + assert success_rate_metric.evaluate_dialogue(dialogue) == 1.0 -def test_evaluate_dialogue(metric: UtilityMetric, dialogues) -> None: - """Test evaluate_dialogue returns selected metric.""" +def test_success_rate_evaluate_dialogue_unsuccessful( + success_rate_metric: SuccessRateMetric, dialogues +) -> None: + """Test SuccessRateMetric.evaluate_dialogue for failed dialogue.""" dialogue = dialogues[0] - assert metric.evaluate_dialogue(dialogue) == 1.0 - assert metric.evaluate_dialogue(dialogue, metric="success") == 1.0 - assert ( - metric.evaluate_dialogue( - dialogue, metric="successful_recommendation_round_ratio" - ) - == 0.5 - ) - assert ( - metric.evaluate_dialogue(dialogue, metric="reward_per_dialogue_length") - == 0.1 - ) + with ( + patch.object( + SuccessRateMetric, "_prepare", return_value=(dialogue, [], [], []) + ), + patch.object(SuccessRateMetric, "_assess_dialogue", return_value=0), + ): + assert success_rate_metric.evaluate_dialogue(dialogue) == 0.0 -def test_evaluate_dialogues(metric: UtilityMetric, dialogues) -> None: - """Test evaluate_dialogues returns conversation_id -> metric value.""" - result = metric.evaluate_dialogues(dialogues) - assert len(result) == len(dialogues) - for dialogue in dialogues: - assert dialogue.conversation_id in result - assert result[dialogue.conversation_id] == 1.0 +def test_successful_recommendation_round_ratio_evaluate_dialogue( + successful_round_ratio_metric: SuccessfulRecommendationRoundRatioMetric, + dialogues, +) -> None: + """Test SuccessfulRecommendationRoundRatioMetric.evaluate_dialogue.""" + dialogue = dialogues[0] + with ( + patch.object( + SuccessfulRecommendationRoundRatioMetric, + "_prepare", + return_value=(dialogue, [], [], []), + ), + patch.object( + SuccessfulRecommendationRoundRatioMetric, + "_assess_dialogue", + return_value=(1, 2), + ), + ): + assert successful_round_ratio_metric.evaluate_dialogue(dialogue) == 0.5 -def test_evaluate_dialogues_with_specified_metric( - metric: UtilityMetric, dialogues +def test_reward_per_dialogue_length_evaluate_dialogue( + reward_per_dialogue_length_metric: RewardPerDialogueLengthMetric, dialogues ) -> None: - """Test evaluate_dialogues with specified metric.""" - result = metric.evaluate_dialogues( - dialogues, metric="successful_recommendation_round_ratio" - ) - for dialogue in dialogues: - assert result[dialogue.conversation_id] == 0.5 + """Test RewardPerDialogueLengthMetric.evaluate_dialogue.""" + dialogue = dialogues[0] + with ( + patch.object( + RewardPerDialogueLengthMetric, + "_prepare", + return_value=(dialogue, [], [], []), + ), + patch.object( + RewardPerDialogueLengthMetric, + "_assess_dialogue", + return_value=(1, 10), + ), + ): + assert ( + reward_per_dialogue_length_metric.evaluate_dialogue(dialogue) == 0.1 + ) diff --git a/scripts/evaluation/quality_metric.py b/usersimcrs/evaluation/quality_metric.py similarity index 56% rename from scripts/evaluation/quality_metric.py rename to usersimcrs/evaluation/quality_metric.py index 5a2323fd..04e7bf94 100644 --- a/scripts/evaluation/quality_metric.py +++ b/usersimcrs/evaluation/quality_metric.py @@ -1,4 +1,4 @@ -"""Script to evaluate dialogue quality using an LLM. +"""LLM-based dialogue quality evaluation. The script evaluates dialogue quality with regards to five aspects: - Recommendation relevance @@ -11,19 +11,16 @@ dedicated rubric. The scoring is done using a large language model. """ -import argparse import json -from typing import Any, Optional, TYPE_CHECKING - -if TYPE_CHECKING: - pass +import logging +from typing import Any from dialoguekit.core.dialogue import Dialogue from dialoguekit.participant.participant import DialogueParticipant -from scripts.evaluation.base_metric import BaseMetric -from scripts.evaluation.rubrics.quality_rubrics import QualityRubrics -from usersimcrs.llm_interfaces.ollama_interface import OllamaLLMInterface +from usersimcrs.evaluation.base_metric import BaseMetric +from usersimcrs.evaluation.quality_rubrics import QualityRubrics +from usersimcrs.llm_interfaces.llm_interface import LLMInterface _PROMPT_EVAL_INTRO = ( @@ -42,50 +39,11 @@ class QualityMetric(BaseMetric): def __init__( self, - ollama_config_path: str, - default_response: str = "", + llm_interface: LLMInterface, name: str = "quality", ) -> None: super().__init__(name) - self.ollama_config_path = ollama_config_path - self.default_response = default_response - self._ollama_interface: Optional[OllamaLLMInterface] = None - - @staticmethod - def parse_args() -> argparse.Namespace: - """Parse command-line arguments. - - Returns: - Parsed arguments. - """ - parser = argparse.ArgumentParser() - parser.add_argument( - "--dialogues", - type=str, - required=True, - help="Path to the dialogues.", - ) - parser.add_argument( - "--ollama_config", - type=str, - required=True, - help="Path to the Ollama config file.", - ) - parser.add_argument( - "--output", - type=str, - help="(optional) Path to the output file.", - ) - return parser.parse_args() - - def _get_ollama_interface(self) -> OllamaLLMInterface: - """Returns Ollama LLM interface.""" - if self._ollama_interface is None: - self._ollama_interface = OllamaLLMInterface( - self.ollama_config_path, - default_response=self.default_response, - ) - return self._ollama_interface + self.llm_interface = llm_interface def _get_prompt( self, grading_rubric: QualityRubrics, dialogue: Dialogue @@ -125,18 +83,26 @@ def evaluate_dialogue( Score (1-5) for the specified aspect. Raises: - ValueError: When the LLM response cannot be parsed. + KeyError: When the aspect does not exist in QualityRubrics. """ - aspect_enum = QualityRubrics[aspect] - ollama_interface = self._get_ollama_interface() + try: + aspect_enum = QualityRubrics[aspect] + except KeyError: + supported = [e.name for e in QualityRubrics] + raise KeyError( + f"Unknown aspect '{aspect}'. Supported aspects: {supported}" + ) prompt = self._get_prompt(aspect_enum, dialogue) - response = ollama_interface.get_llm_api_response(prompt) + response = self.llm_interface.get_llm_api_response(prompt) try: response = response.replace("\\", "\\\\") response_dict = json.loads(response) return float(response_dict["score"]) except Exception: - raise ValueError( - f"Failed to get score for {aspect} dialogue " - f"{dialogue.conversation_id}: {response}" + logging.warning( + "Failed to parse LLM response for %s dialogue %s: %s", + aspect, + dialogue.conversation_id, + response, ) + return 0.0 diff --git a/scripts/evaluation/rubrics/quality_rubrics.py b/usersimcrs/evaluation/quality_rubrics.py similarity index 100% rename from scripts/evaluation/rubrics/quality_rubrics.py rename to usersimcrs/evaluation/quality_rubrics.py diff --git a/usersimcrs/evaluation/satisfaction_metric.py b/usersimcrs/evaluation/satisfaction_metric.py new file mode 100644 index 00000000..d05fdbbb --- /dev/null +++ b/usersimcrs/evaluation/satisfaction_metric.py @@ -0,0 +1,27 @@ +"""Satisfaction metric class implementation. + +Satisfaction assessment based on DialogueKit classifier. +""" + +from typing import Any + +from dialoguekit.core.dialogue import Dialogue +from dialoguekit.nlu.models.satisfaction_classifier import ( + SatisfactionClassifier, +) + +from usersimcrs.evaluation.base_metric import BaseMetric + + +class SatisfactionMetric(BaseMetric): + def __init__( + self, + classifier: SatisfactionClassifier, + name: str = "satisfaction", + ): + super().__init__(name) + self.classifier = classifier + + def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float: + """Computes the satisfaction score for a single dialogue.""" + return self.classifier.classify_last_n_dialogue(dialogue, last_n=None) diff --git a/scripts/evaluation/utility_metric.py b/usersimcrs/evaluation/utility_metric.py similarity index 55% rename from scripts/evaluation/utility_metric.py rename to usersimcrs/evaluation/utility_metric.py index 88617683..c9932d50 100644 --- a/scripts/evaluation/utility_metric.py +++ b/usersimcrs/evaluation/utility_metric.py @@ -1,33 +1,32 @@ """Utility metric class implementation. -Encapsulates the logic from `utility_evaluation.py` into a `BaseMetric`. +Computes three utility metrics: + +- Success Rate (SR) +- Successful Recommendation Round Ratio (SRRR) +- Reward-per-Dialogue-Length (RDL) """ -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, List, Optional, Tuple from confuse import Configuration -import argparse from dialoguekit.core.annotated_utterance import AnnotatedUtterance from dialoguekit.core.dialogue import Dialogue from dialoguekit.core.intent import Intent from dialoguekit.nlu.nlu import NLU from dialoguekit.participant.participant import DialogueParticipant -from usersimcrs.utils.simulation_utils import get_NLU -from scripts.evaluation.base_metric import BaseMetric +from usersimcrs.evaluation.base_metric import BaseMetric +from usersimcrs.utils.simulation_utils import get_NLU -class UtilityMetric(BaseMetric): - """Computes utility metrics for dialogues. - - Constructor takes paths to user and agent NLU configuration files. - """ +class UtilityMetricBase(BaseMetric): def __init__( self, user_nlu_config_path: str, agent_nlu_config_path: str, - name: str = "utility", + name: str, ): super().__init__(name) self.user_nlu_config_path = user_nlu_config_path @@ -35,54 +34,6 @@ def __init__( self._user_nlu: Optional[NLU] = None self._agent_nlu: Optional[NLU] = None - @classmethod - def parse_args(self) -> argparse.Namespace: - """Parses command-line arguments. - - Returns: - Parsed command-line arguments. - """ - parser = argparse.ArgumentParser(prog="utility_evaluation.py") - parser.add_argument( - "annotated_dialogues", - type=str, - help="Annotated dialogues JSON file.", - ) - parser.add_argument( - "user_nlu_config", - type=str, - help="User NLU configuration file.", - ) - parser.add_argument( - "agent_nlu_config", - type=str, - help="Agent NLU configuration file.", - ) - parser.add_argument( - "--reject_intent_labels", - nargs="+", - default=["REJ"], - help="Intent labels corresponding to rejection.", - ) - parser.add_argument( - "--accept_intent_labels", - nargs="+", - default=["ACC"], - help="Intent labels corresponding to acceptance.", - ) - parser.add_argument( - "--recommendation_intent_labels", - nargs="+", - default=["REC-S", "REC-E"], - help="Intent labels corresponding to recommendation.", - ) - parser.add_argument( - "--output", - type=str, - help="Output file to save annotated dialogues with utility metrics", - ) - return parser.parse_args() - def _annotate_dialogue( self, dialogue: Dialogue, user_nlu: NLU, agent_nlu: NLU ) -> Dialogue: @@ -119,26 +70,6 @@ def _annotate_dialogue( ) return dialogue - def _annotate_dialogues( - self, dialogues: List[Dialogue], user_nlu: NLU, agent_nlu: NLU - ) -> List[Dialogue]: - """Annotates dialogues with dialogue acts. - - Args: - dialogues: Dialogues. - user_nlu: User NLU module. - agent_nlu: Agent NLU module. - - Returns: - Annotated dialogues. - """ - # TODO: Move this to DialogueKit - # See: https://github.com/iai-group/UserSimCRS/issues/219 - return [ - self._annotate_dialogue(dialogue, user_nlu, agent_nlu) - for dialogue in dialogues - ] - def _get_recommendation_rounds( self, dialogue: Dialogue, recommendation_intents: List[Intent] ) -> List[List[AnnotatedUtterance]]: @@ -191,48 +122,6 @@ def _is_recommendation_accepted( return False return b_accepted - def _assess_dialogue( - self, - dialogue: Dialogue, - recommendation_intents: List[Intent], - acceptance_intents: List[Intent], - rejection_intents: List[Intent], - ) -> Tuple[int, int, int]: - """Assesses the utility of the dialogue. - - Args: - dialogue: Dialogue. - recommendation_intents: Intents corresponding to recommendation. - acceptance_intents: Intents corresponding to acceptance. - rejection_intents: Intents corresponding to rejection. - - Returns: - Tuple of number of accepted recommendations, successful - recommendation rounds and total recommendation rounds. - """ - # TODO: Optimize overall assessment to avoid multiple iterations over - # utterances. - rounds = self._get_recommendation_rounds( - dialogue, recommendation_intents - ) - successful_rounds = 0 - for round in rounds: - if self._is_recommendation_accepted( - round, acceptance_intents, rejection_intents - ): - successful_rounds += 1 - - nb_accepted_recommendations = sum( - 1 - for utterance in dialogue.utterances - if utterance.participant == DialogueParticipant.USER - and any( - intent in acceptance_intents - for intent in utterance.get_intents() - ) - ) - return nb_accepted_recommendations, successful_rounds, len(rounds) - def _load_nlus(self) -> Tuple[NLU, NLU]: """Returns (cached) user and agent NLU modules.""" if self._user_nlu is None: @@ -260,55 +149,125 @@ def _get_intent_lists(self, **kwargs: Any) -> Tuple[List[Intent], ...]: [Intent(label) for label in rej_labels], ) - def _get_utility_metrics( + def _prepare( self, dialogue: Dialogue, **kwargs: Any - ) -> Dict[str, float]: - """Returns full utility dict for one dialogue.""" + ) -> Tuple[Dialogue, List[Intent], List[Intent], List[Intent]]: + """Annotates dialogue. + + Returns: + dialogue + rec_intents: Recommendation intents. + acc_intents: Acceptance intents. + rej_intents: Rejection intents. + """ user_nlu, agent_nlu = self._load_nlus() self._annotate_dialogue(dialogue, user_nlu, agent_nlu) - ( - recommendation_intents, - acceptance_intents, - rejection_intents, - ) = self._get_intent_lists(**kwargs) - ( - nb_accepted_recommendations, - successful_rounds, - total_rounds, - ) = self._assess_dialogue( - dialogue, - recommendation_intents, - acceptance_intents, - rejection_intents, + rec, acc, rej = self._get_intent_lists(**kwargs) + return dialogue, rec, acc, rej + + def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float: + """Computes the metric for a single dialogue.""" + raise NotImplementedError() + + +class SuccessRateMetric(UtilityMetricBase): + def __init__( + self, + user_nlu_config_path: str, + agent_nlu_config_path: str, + name: str = "success_rate", + ): + super().__init__(user_nlu_config_path, agent_nlu_config_path, name) + + def _assess_dialogue( + self, + dialogue: Dialogue, + recommendation_intents: List[Intent], + acceptance_intents: List[Intent], + rejection_intents: List[Intent], + ) -> int: + """Returns number of successful recommendation rounds.""" + rounds = self._get_recommendation_rounds( + dialogue, recommendation_intents + ) + return sum( + 1 + for round_utterances in rounds + if self._is_recommendation_accepted( + round_utterances, acceptance_intents, rejection_intents + ) ) - return { - "success": float(successful_rounds > 0), - "successful_recommendation_round_ratio": ( - successful_rounds / total_rounds if total_rounds > 0 else 0.0 - ), - "reward_per_dialogue_length": ( - nb_accepted_recommendations / len(dialogue.utterances) - if dialogue.utterances - else 0.0 - ), - } def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float: - """Computes one utility metric for a dialogue. + dlg, rec, acc, rej = self._prepare(dialogue, **kwargs) + successful_rounds = self._assess_dialogue(dlg, rec, acc, rej) + return float(successful_rounds > 0) - Args: - dialogue: Dialogue to evaluate. - metric: One of "success", "successful_recommendation_round_ratio", - "reward_per_dialogue_length". Default "success". - Returns: - The selected metric value as float. - """ - metrics = self._get_utility_metrics(dialogue, **kwargs) - metric = kwargs.get("metric", "success") - if metric not in metrics: - raise ValueError( - f"Unknown metric '{metric}'. " - f"Expected one of {list(metrics.keys())}" +class SuccessfulRecommendationRoundRatioMetric(UtilityMetricBase): + def __init__( + self, + user_nlu_config_path: str, + agent_nlu_config_path: str, + name: str = "successful_recommendation_round_ratio", + ): + super().__init__(user_nlu_config_path, agent_nlu_config_path, name) + + def _assess_dialogue( + self, + dialogue: Dialogue, + recommendation_intents: List[Intent], + acceptance_intents: List[Intent], + rejection_intents: List[Intent], + ) -> Tuple[int, int]: + """Returns successful rounds and total rounds.""" + rounds = self._get_recommendation_rounds( + dialogue, recommendation_intents + ) + successful_rounds = sum( + 1 + for round_utterances in rounds + if self._is_recommendation_accepted( + round_utterances, acceptance_intents, rejection_intents ) - return metrics[metric] + ) + return successful_rounds, len(rounds) + + def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float: + dlg, rec, acc, rej = self._prepare(dialogue, **kwargs) + successful_rounds, total_rounds = self._assess_dialogue( + dlg, rec, acc, rej + ) + return successful_rounds / total_rounds if total_rounds > 0 else 0.0 + + +class RewardPerDialogueLengthMetric(UtilityMetricBase): + def __init__( + self, + user_nlu_config_path: str, + agent_nlu_config_path: str, + name: str = "reward_per_dialogue_length", + ): + super().__init__(user_nlu_config_path, agent_nlu_config_path, name) + + def _assess_dialogue( + self, dialogue: Dialogue, acceptance_intents: List[Intent] + ) -> Tuple[int, int]: + """Returns accepted recommendations and dialogue length.""" + nb_accepted_recommendations = sum( + 1 + for utterance in dialogue.utterances + if utterance.participant == DialogueParticipant.USER + and any( + intent in acceptance_intents + for intent in utterance.get_intents() + ) + ) + return nb_accepted_recommendations, len(dialogue.utterances) + + def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float: + dlg, _, acc, _ = self._prepare(dialogue, **kwargs) + nb_accepted_recommendations, dialogue_length = self._assess_dialogue( + dlg, acc + ) + return nb_accepted_recommendations / dialogue_length From 773822d8d3a4c7fe9ca6ea46c05c2edd18832ddc Mon Sep 17 00:00:00 2001 From: Ksenia Blokhina Date: Tue, 10 Mar 2026 13:42:41 +0100 Subject: [PATCH 21/38] fixes --- tests/evaluation/test_utility_metric.py | 36 ++- usersimcrs/evaluation/dialogue_annotation.py | 211 ++++++++++++++ usersimcrs/evaluation/quality_metric.py | 33 ++- .../reward_per_dialogue_length_metric.py | 87 ++++++ usersimcrs/evaluation/satisfaction_metric.py | 12 +- usersimcrs/evaluation/success_rate_metric.py | 93 ++++++ ...ssful_recommendation_round_ratio_metric.py | 97 +++++++ usersimcrs/evaluation/utility_metric.py | 273 ------------------ 8 files changed, 542 insertions(+), 300 deletions(-) create mode 100644 usersimcrs/evaluation/dialogue_annotation.py create mode 100644 usersimcrs/evaluation/reward_per_dialogue_length_metric.py create mode 100644 usersimcrs/evaluation/success_rate_metric.py create mode 100644 usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py delete mode 100644 usersimcrs/evaluation/utility_metric.py diff --git a/tests/evaluation/test_utility_metric.py b/tests/evaluation/test_utility_metric.py index 37eedb25..4862ba6c 100644 --- a/tests/evaluation/test_utility_metric.py +++ b/tests/evaluation/test_utility_metric.py @@ -1,17 +1,21 @@ """Tests for utility metric classes.""" -from unittest.mock import patch +from unittest.mock import MagicMock, patch import pytest from dialoguekit.utils.dialogue_reader import json_to_dialogues -from usersimcrs.evaluation.utility_metric import ( +from usersimcrs.evaluation.reward_per_dialogue_length_metric import ( RewardPerDialogueLengthMetric, - SuccessRateMetric, +) +from usersimcrs.evaluation.success_rate_metric import SuccessRateMetric +from usersimcrs.evaluation.successful_recommendation_round_ratio_metric import ( SuccessfulRecommendationRoundRatioMetric, ) +_MOCK_NLU = MagicMock() + @pytest.fixture def dialogues(): @@ -53,8 +57,9 @@ def test_success_rate_evaluate_dialogue( """Test SuccessRateMetric.evaluate_dialogue.""" dialogue = dialogues[0] with ( - patch.object( - SuccessRateMetric, "_prepare", return_value=(dialogue, [], [], []) + patch( + "usersimcrs.evaluation.success_rate_metric.prepare_dialogue", + return_value=(dialogue, [], [], [], _MOCK_NLU, _MOCK_NLU), ), patch.object(SuccessRateMetric, "_assess_dialogue", return_value=1), ): @@ -67,8 +72,9 @@ def test_success_rate_evaluate_dialogue_unsuccessful( """Test SuccessRateMetric.evaluate_dialogue for failed dialogue.""" dialogue = dialogues[0] with ( - patch.object( - SuccessRateMetric, "_prepare", return_value=(dialogue, [], [], []) + patch( + "usersimcrs.evaluation.success_rate_metric.prepare_dialogue", + return_value=(dialogue, [], [], [], _MOCK_NLU, _MOCK_NLU), ), patch.object(SuccessRateMetric, "_assess_dialogue", return_value=0), ): @@ -82,10 +88,10 @@ def test_successful_recommendation_round_ratio_evaluate_dialogue( """Test SuccessfulRecommendationRoundRatioMetric.evaluate_dialogue.""" dialogue = dialogues[0] with ( - patch.object( - SuccessfulRecommendationRoundRatioMetric, - "_prepare", - return_value=(dialogue, [], [], []), + patch( + "usersimcrs.evaluation.successful_recommendation_round_ratio_metric" + ".prepare_dialogue", + return_value=(dialogue, [], [], [], _MOCK_NLU, _MOCK_NLU), ), patch.object( SuccessfulRecommendationRoundRatioMetric, @@ -102,10 +108,10 @@ def test_reward_per_dialogue_length_evaluate_dialogue( """Test RewardPerDialogueLengthMetric.evaluate_dialogue.""" dialogue = dialogues[0] with ( - patch.object( - RewardPerDialogueLengthMetric, - "_prepare", - return_value=(dialogue, [], [], []), + patch( + "usersimcrs.evaluation.reward_per_dialogue_length_metric" + ".prepare_dialogue", + return_value=(dialogue, [], [], [], _MOCK_NLU, _MOCK_NLU), ), patch.object( RewardPerDialogueLengthMetric, diff --git a/usersimcrs/evaluation/dialogue_annotation.py b/usersimcrs/evaluation/dialogue_annotation.py new file mode 100644 index 00000000..06bc2572 --- /dev/null +++ b/usersimcrs/evaluation/dialogue_annotation.py @@ -0,0 +1,211 @@ +"""Dialogue annotation and recommendation round utilities. + +Provides functions for annotating dialogues with dialogue acts using NLU +modules, parsing intent labels, and extracting recommendation rounds from +annotated dialogues. +""" + +from typing import Any, List, Optional, Tuple + +from confuse import Configuration + +from dialoguekit.core.annotated_utterance import AnnotatedUtterance +from dialoguekit.core.dialogue import Dialogue +from dialoguekit.core.intent import Intent +from dialoguekit.nlu.nlu import NLU +from dialoguekit.participant.participant import DialogueParticipant + +from usersimcrs.utils.simulation_utils import get_NLU + + +def annotate_dialogue( + dialogue: Dialogue, user_nlu: NLU, agent_nlu: NLU +) -> Dialogue: + """Annotates utterances with dialogue acts. + + Each utterance that is not already an AnnotatedUtterance is converted to + one. Utterances that already carry dialogue acts are left untouched. + + Args: + dialogue: Dialogue to be annotated. + user_nlu: NLU module for user utterances. + agent_nlu: NLU module for agent utterances. + + Raises: + ValueError: If an utterance has an unknown participant. + + Returns: + The same dialogue object with annotated utterances. + """ + for i, utterance in enumerate(dialogue.utterances): + if not isinstance(utterance, AnnotatedUtterance): + dialogue.utterances[i] = AnnotatedUtterance.from_utterance( + utterance + ) + + if len(utterance.dialogue_acts) > 0: + continue + + if utterance.participant == DialogueParticipant.USER: + dialogue.utterances[ + i + ].dialogue_acts = user_nlu.extract_dialogue_acts(utterance) + elif utterance.participant == DialogueParticipant.AGENT: + dialogue.utterances[ + i + ].dialogue_acts = agent_nlu.extract_dialogue_acts(utterance) + else: + raise ValueError(f"Unknown participant: {utterance.participant}") + return dialogue + + +def load_nlus( + user_nlu_config_path: str, + agent_nlu_config_path: str, + cached_user_nlu: Optional[NLU] = None, + cached_agent_nlu: Optional[NLU] = None, +) -> Tuple[NLU, NLU]: + """Loads user and agent NLU modules. + + Returns cached instances when provided, otherwise creates new ones + from the given configuration files. + + Args: + user_nlu_config_path: Path to user NLU configuration file. + agent_nlu_config_path: Path to agent NLU configuration file. + cached_user_nlu: Previously loaded user NLU module. + cached_agent_nlu: Previously loaded agent NLU module. + + Returns: + Tuple of (user_nlu, agent_nlu) modules. + """ + if cached_user_nlu is None: + user_nlu_config = Configuration("User NLU Configuration") + user_nlu_config.set_file(user_nlu_config_path) + cached_user_nlu = get_NLU(user_nlu_config) + if cached_agent_nlu is None: + agent_nlu_config = Configuration("Agent NLU Configuration") + agent_nlu_config.set_file(agent_nlu_config_path) + cached_agent_nlu = get_NLU(agent_nlu_config) + return cached_user_nlu, cached_agent_nlu + + +def get_intent_lists( + **kwargs: Any, +) -> Tuple[List[Intent], List[Intent], List[Intent]]: + """Builds recommendation, acceptance, and rejection intent lists. + + Args: + **kwargs: Optional intent label overrides: + - recommendation_intent_labels: Labels for recommendation intents. + Defaults to ``["REC-S", "REC-E"]``. + - acceptance_intent_labels: Labels for acceptance intents. + Defaults to ``["ACC"]``. + - rejection_intent_labels: Labels for rejection intents. + Defaults to ``["REJ"]``. + + Returns: + Tuple of (recommendation_intents, acceptance_intents, + rejection_intents). + """ + rec_labels = kwargs.get("recommendation_intent_labels", ["REC-S", "REC-E"]) + acc_labels = kwargs.get("acceptance_intent_labels", ["ACC"]) + rej_labels = kwargs.get("rejection_intent_labels", ["REJ"]) + return ( + [Intent(label) for label in rec_labels], + [Intent(label) for label in acc_labels], + [Intent(label) for label in rej_labels], + ) + + +def get_recommendation_rounds( + dialogue: Dialogue, recommendation_intents: List[Intent] +) -> List[List[AnnotatedUtterance]]: + """Splits a dialogue into recommendation rounds. + + A new round begins each time an utterance contains a recommendation + intent. + + Args: + dialogue: Annotated dialogue. + recommendation_intents: Intents that signal a recommendation. + + Returns: + List of utterance groups, one per recommendation round. + """ + rounds: List[List[AnnotatedUtterance]] = [] + current_round: List[AnnotatedUtterance] = [] + for utterance in dialogue.utterances: + if any( + intent in utterance.get_intents() + for intent in recommendation_intents + ): + if current_round: + rounds.append(current_round) + current_round = [utterance] + else: + current_round.append(utterance) + return rounds + + +def prepare_dialogue( + dialogue: Dialogue, + user_nlu_config_path: str, + agent_nlu_config_path: str, + cached_user_nlu: Optional[NLU] = None, + cached_agent_nlu: Optional[NLU] = None, + **kwargs: Any, +) -> Tuple[Dialogue, List[Intent], List[Intent], List[Intent], NLU, NLU]: + """Loads NLU modules, annotates a dialogue, and builds intent lists. + + Combines :func:`load_nlus`, :func:`annotate_dialogue`, and + :func:`get_intent_lists` into a single convenience call. + + Args: + dialogue: Dialogue to prepare. + user_nlu_config_path: Path to user NLU configuration file. + agent_nlu_config_path: Path to agent NLU configuration file. + cached_user_nlu: Previously loaded user NLU module (avoids reload). + cached_agent_nlu: Previously loaded agent NLU module (avoids reload). + **kwargs: Optional intent label overrides forwarded to + :func:`get_intent_lists`. + + Returns: + Tuple of (annotated dialogue, recommendation intents, + acceptance intents, rejection intents, user NLU, agent NLU). + """ + user_nlu, agent_nlu = load_nlus( + user_nlu_config_path, + agent_nlu_config_path, + cached_user_nlu, + cached_agent_nlu, + ) + annotate_dialogue(dialogue, user_nlu, agent_nlu) + rec, acc, rej = get_intent_lists(**kwargs) + return dialogue, rec, acc, rej, user_nlu, agent_nlu + + +def is_recommendation_accepted( + round_utterances: List[AnnotatedUtterance], + acceptance_intents: List[Intent], + rejection_intents: List[Intent], +) -> bool: + """Assesses whether a recommendation round was accepted. + + Args: + round_utterances: Utterances in the recommendation round. + acceptance_intents: Intents corresponding to acceptance. + rejection_intents: Intents corresponding to rejection. + + Returns: + True if the recommendation was accepted, False otherwise. + """ + b_accepted = False + for utterance in round_utterances: + if utterance.participant == DialogueParticipant.USER: + intents = utterance.get_intents() + if any(intent in acceptance_intents for intent in intents): + b_accepted = True + elif any(intent in rejection_intents for intent in intents): + return False + return b_accepted diff --git a/usersimcrs/evaluation/quality_metric.py b/usersimcrs/evaluation/quality_metric.py index 04e7bf94..3198c77b 100644 --- a/usersimcrs/evaluation/quality_metric.py +++ b/usersimcrs/evaluation/quality_metric.py @@ -13,7 +13,7 @@ import json import logging -from typing import Any +from typing import Any, Literal from dialoguekit.core.dialogue import Dialogue from dialoguekit.participant.participant import DialogueParticipant @@ -42,6 +42,12 @@ def __init__( llm_interface: LLMInterface, name: str = "quality", ) -> None: + """Initializes the quality metric. + + Args: + llm_interface: LLM interface used for scoring. + name: Metric name. + """ super().__init__(name) self.llm_interface = llm_interface @@ -71,19 +77,28 @@ def _get_prompt( return prompt def evaluate_dialogue( - self, dialogue: Dialogue, aspect: str, **kwargs: Any + self, + dialogue: Dialogue, + aspect: Literal[ + "REC_RELEVANCE", + "COM_STYLE", + "FLUENCY", + "CONV_FLOW", + "OVERALL_SAT", + ], + **kwargs: Any, ) -> float: """Returns score for a single aspect of a dialogue. Args: dialogue: Dialogue to evaluate. - aspect: Aspect to evaluate. Must be one of QualityRubrics enum names - - Returns: - Score (1-5) for the specified aspect. + aspect: Aspect to evaluate. One of QualityRubrics enum names. Raises: KeyError: When the aspect does not exist in QualityRubrics. + + Returns: + Score (1-5) for the specified aspect. """ try: aspect_enum = QualityRubrics[aspect] @@ -100,9 +115,7 @@ def evaluate_dialogue( return float(response_dict["score"]) except Exception: logging.warning( - "Failed to parse LLM response for %s dialogue %s: %s", - aspect, - dialogue.conversation_id, - response, + f"Failed to parse LLM response for {aspect} dialogue " + f"{dialogue.conversation_id}: {response}", ) return 0.0 diff --git a/usersimcrs/evaluation/reward_per_dialogue_length_metric.py b/usersimcrs/evaluation/reward_per_dialogue_length_metric.py new file mode 100644 index 00000000..c094e7a7 --- /dev/null +++ b/usersimcrs/evaluation/reward_per_dialogue_length_metric.py @@ -0,0 +1,87 @@ +"""Reward-per-Dialogue-Length (RDL) metric implementation. + +Evaluates the ratio of accepted recommendations to total dialogue length. +""" + +from typing import Any, List, Optional, Tuple + +from dialoguekit.core.dialogue import Dialogue +from dialoguekit.core.intent import Intent +from dialoguekit.nlu.nlu import NLU +from dialoguekit.participant.participant import DialogueParticipant + +from usersimcrs.evaluation.base_metric import BaseMetric +from usersimcrs.evaluation.dialogue_annotation import prepare_dialogue + + +class RewardPerDialogueLengthMetric(BaseMetric): + """Measures accepted recommendations relative to dialogue length. + + Returns the number of accepted recommendations divided by the total number + of utterances. + """ + + def __init__( + self, + user_nlu_config_path: str, + agent_nlu_config_path: str, + name: str = "reward_per_dialogue_length", + ) -> None: + """Initializes the reward-per-dialogue-length metric. + + Args: + user_nlu_config_path: Path to user NLU configuration. + agent_nlu_config_path: Path to agent NLU configuration. + name: Metric name. + """ + super().__init__(name) + self._user_nlu_config_path = user_nlu_config_path + self._agent_nlu_config_path = agent_nlu_config_path + self._user_nlu: Optional[NLU] = None + self._agent_nlu: Optional[NLU] = None + + def _assess_dialogue( + self, dialogue: Dialogue, acceptance_intents: List[Intent] + ) -> Tuple[int, int]: + """Returns accepted recommendations and dialogue length. + + Args: + dialogue: Annotated dialogue. + acceptance_intents: Intents that signal acceptance. + + Returns: + Tuple of (accepted_recommendations, dialogue_length). + """ + nb_accepted_recommendations = sum( + 1 + for utterance in dialogue.utterances + if utterance.participant == DialogueParticipant.USER + and any( + intent in acceptance_intents + for intent in utterance.get_intents() + ) + ) + return nb_accepted_recommendations, len(dialogue.utterances) + + def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float: + """Computes the reward-per-dialogue-length score. + + Args: + dialogue: Dialogue to evaluate. + **kwargs: Optional intent label overrides. + + Returns: + Ratio of accepted recommendations to total utterances. + """ + dlg, _, acc, _, self._user_nlu, self._agent_nlu = prepare_dialogue( + dialogue, + self._user_nlu_config_path, + self._agent_nlu_config_path, + self._user_nlu, + self._agent_nlu, + **kwargs, + ) + nb_accepted_recommendations, dialogue_length = self._assess_dialogue( + dlg, acc + ) + return nb_accepted_recommendations / dialogue_length diff --git a/usersimcrs/evaluation/satisfaction_metric.py b/usersimcrs/evaluation/satisfaction_metric.py index d05fdbbb..664f91c0 100644 --- a/usersimcrs/evaluation/satisfaction_metric.py +++ b/usersimcrs/evaluation/satisfaction_metric.py @@ -18,10 +18,18 @@ def __init__( self, classifier: SatisfactionClassifier, name: str = "satisfaction", - ): + ) -> None: + """Initializes the satisfaction metric. + + Args: + classifier: Satisfaction classifier instance. + name: Metric name. + """ super().__init__(name) self.classifier = classifier def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float: """Computes the satisfaction score for a single dialogue.""" - return self.classifier.classify_last_n_dialogue(dialogue, last_n=None) + return float( + self.classifier.classify_last_n_dialogue(dialogue, last_n=None) + ) diff --git a/usersimcrs/evaluation/success_rate_metric.py b/usersimcrs/evaluation/success_rate_metric.py new file mode 100644 index 00000000..3c689c71 --- /dev/null +++ b/usersimcrs/evaluation/success_rate_metric.py @@ -0,0 +1,93 @@ +"""Success Rate (SR) metric implementation. + +Evaluates whether at least one recommendation was accepted during a dialogue. +""" + +from typing import Any, List, Optional + +from dialoguekit.core.dialogue import Dialogue +from dialoguekit.core.intent import Intent +from dialoguekit.nlu.nlu import NLU + +from usersimcrs.evaluation.base_metric import BaseMetric +from usersimcrs.evaluation.dialogue_annotation import ( + get_recommendation_rounds, + is_recommendation_accepted, + prepare_dialogue, +) + + +class SuccessRateMetric(BaseMetric): + """Measures whether a dialogue contains at least one accepted + recommendation. + + Returns 1.0 if at least one recommendation round was successful, + 0.0 otherwise. + """ + + def __init__( + self, + user_nlu_config_path: str, + agent_nlu_config_path: str, + name: str = "success_rate", + ) -> None: + """Initializes the success rate metric. + + Args: + user_nlu_config_path: Path to user NLU configuration. + agent_nlu_config_path: Path to agent NLU configuration. + name: Metric name. + """ + super().__init__(name) + self._user_nlu_config_path = user_nlu_config_path + self._agent_nlu_config_path = agent_nlu_config_path + self._user_nlu: Optional[NLU] = None + self._agent_nlu: Optional[NLU] = None + + def _assess_dialogue( + self, + dialogue: Dialogue, + recommendation_intents: List[Intent], + acceptance_intents: List[Intent], + rejection_intents: List[Intent], + ) -> int: + """Returns number of successful recommendation rounds. + + Args: + dialogue: Annotated dialogue. + recommendation_intents: Intents that signal a recommendation. + acceptance_intents: Intents that signal acceptance. + rejection_intents: Intents that signal rejection. + + Returns: + Number of recommendation rounds that were accepted. + """ + rounds = get_recommendation_rounds(dialogue, recommendation_intents) + return sum( + 1 + for round_utterances in rounds + if is_recommendation_accepted( + round_utterances, acceptance_intents, rejection_intents + ) + ) + + def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float: + """Computes the success rate for a single dialogue. + + Args: + dialogue: Dialogue to evaluate. + **kwargs: Optional intent label overrides. + + Returns: + 1.0 if at least one recommendation was accepted, 0.0 otherwise. + """ + dlg, rec, acc, rej, self._user_nlu, self._agent_nlu = prepare_dialogue( + dialogue, + self._user_nlu_config_path, + self._agent_nlu_config_path, + self._user_nlu, + self._agent_nlu, + **kwargs, + ) + successful_rounds = self._assess_dialogue(dlg, rec, acc, rej) + return float(successful_rounds > 0) diff --git a/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py b/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py new file mode 100644 index 00000000..b8de7013 --- /dev/null +++ b/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py @@ -0,0 +1,97 @@ +"""Successful Recommendation Round Ratio (SRRR) metric implementation. + +Evaluates the ratio of accepted recommendation rounds to total recommendation +rounds in a dialogue. +""" + +from typing import Any, List, Optional, Tuple + +from dialoguekit.core.dialogue import Dialogue +from dialoguekit.core.intent import Intent +from dialoguekit.nlu.nlu import NLU + +from usersimcrs.evaluation.base_metric import BaseMetric +from usersimcrs.evaluation.dialogue_annotation import ( + get_recommendation_rounds, + is_recommendation_accepted, + prepare_dialogue, +) + + +class SuccessfulRecommendationRoundRatioMetric(BaseMetric): + """Measures the fraction of recommendation rounds that were accepted. + + Returns a value between 0.0 and 1.0 (or 0.0 when there are no recommendation + rounds). + """ + + def __init__( + self, + user_nlu_config_path: str, + agent_nlu_config_path: str, + name: str = "successful_recommendation_round_ratio", + ) -> None: + """Initializes the successful recommendation round ratio metric. + + Args: + user_nlu_config_path: Path to user NLU configuration. + agent_nlu_config_path: Path to agent NLU configuration. + name: Metric name. + """ + super().__init__(name) + self._user_nlu_config_path = user_nlu_config_path + self._agent_nlu_config_path = agent_nlu_config_path + self._user_nlu: Optional[NLU] = None + self._agent_nlu: Optional[NLU] = None + + def _assess_dialogue( + self, + dialogue: Dialogue, + recommendation_intents: List[Intent], + acceptance_intents: List[Intent], + rejection_intents: List[Intent], + ) -> Tuple[int, int]: + """Returns successful and total recommendation rounds. + + Args: + dialogue: Annotated dialogue. + recommendation_intents: Intents that signal a recommendation. + acceptance_intents: Intents that signal acceptance. + rejection_intents: Intents that signal rejection. + + Returns: + Tuple of (successful_rounds, total_rounds). + """ + rounds = get_recommendation_rounds(dialogue, recommendation_intents) + successful_rounds = sum( + 1 + for round_utterances in rounds + if is_recommendation_accepted( + round_utterances, acceptance_intents, rejection_intents + ) + ) + return successful_rounds, len(rounds) + + def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float: + """Computes the successful recommendation round ratio. + + Args: + dialogue: Dialogue to evaluate. + **kwargs: Optional intent label overrides. + + Returns: + Ratio of accepted recommendation rounds to total rounds, + or 0.0 if there are no recommendation rounds. + """ + dlg, rec, acc, rej, self._user_nlu, self._agent_nlu = prepare_dialogue( + dialogue, + self._user_nlu_config_path, + self._agent_nlu_config_path, + self._user_nlu, + self._agent_nlu, + **kwargs, + ) + successful_rounds, total_rounds = self._assess_dialogue( + dlg, rec, acc, rej + ) + return successful_rounds / total_rounds if total_rounds > 0 else 0.0 diff --git a/usersimcrs/evaluation/utility_metric.py b/usersimcrs/evaluation/utility_metric.py deleted file mode 100644 index c9932d50..00000000 --- a/usersimcrs/evaluation/utility_metric.py +++ /dev/null @@ -1,273 +0,0 @@ -"""Utility metric class implementation. - -Computes three utility metrics: - -- Success Rate (SR) -- Successful Recommendation Round Ratio (SRRR) -- Reward-per-Dialogue-Length (RDL) -""" - -from typing import Any, List, Optional, Tuple - -from confuse import Configuration - -from dialoguekit.core.annotated_utterance import AnnotatedUtterance -from dialoguekit.core.dialogue import Dialogue -from dialoguekit.core.intent import Intent -from dialoguekit.nlu.nlu import NLU -from dialoguekit.participant.participant import DialogueParticipant - -from usersimcrs.evaluation.base_metric import BaseMetric -from usersimcrs.utils.simulation_utils import get_NLU - - -class UtilityMetricBase(BaseMetric): - def __init__( - self, - user_nlu_config_path: str, - agent_nlu_config_path: str, - name: str, - ): - super().__init__(name) - self.user_nlu_config_path = user_nlu_config_path - self.agent_nlu_config_path = agent_nlu_config_path - self._user_nlu: Optional[NLU] = None - self._agent_nlu: Optional[NLU] = None - - def _annotate_dialogue( - self, dialogue: Dialogue, user_nlu: NLU, agent_nlu: NLU - ) -> Dialogue: - """Annotates utterances with dialogue acts. - - Args: - dialogue: Dialogue to be annotated. - user_nlu: User NLU module. - agent_nlu: Agent NLU module. - - Returns: - Annotated dialogue. - """ - for i, utterance in enumerate(dialogue.utterances): - if not isinstance(utterance, AnnotatedUtterance): - dialogue.utterances[i] = AnnotatedUtterance.from_utterance( - utterance - ) - - if len(utterance.dialogue_acts) > 0: - continue - - if utterance.participant == DialogueParticipant.USER: - dialogue.utterances[ - i - ].dialogue_acts = user_nlu.extract_dialogue_acts(utterance) - elif utterance.participant == DialogueParticipant.AGENT: - dialogue.utterances[ - i - ].dialogue_acts = agent_nlu.extract_dialogue_acts(utterance) - else: - raise ValueError( - f"Unknown participant: {utterance.participant}" - ) - return dialogue - - def _get_recommendation_rounds( - self, dialogue: Dialogue, recommendation_intents: List[Intent] - ) -> List[List[AnnotatedUtterance]]: - """Gets utterances per recommendation round. - - Args: - dialogue: Dialogue. - recommendation_intents: Intents corresponding to recommendation. - - Returns: - Utterances per recommendation round. - """ - rounds: List[List[AnnotatedUtterance]] = [] - current_round: List[AnnotatedUtterance] = [] - for utterance in dialogue.utterances: - if any( - intent in utterance.get_intents() - for intent in recommendation_intents - ): - if current_round: - rounds.append(current_round) - current_round = [utterance] - else: - current_round.append(utterance) - return rounds - - def _is_recommendation_accepted( - self, - round: List[AnnotatedUtterance], - acceptance_intents: List[Intent], - rejection_intents: List[Intent], - ) -> bool: - """Assesses whether the recommendation was accepted. - - Args: - round: Utterances in recommendation round. - acceptance_intents: Intents corresponding to acceptance. - rejection_intents: Intents corresponding to rejection. - - Returns: - True if the recommendation was accepted, False otherwise. - """ - b_accepted = False - for utterance in round: - if utterance.participant == DialogueParticipant.USER: - intents = utterance.get_intents() - if any(intent in acceptance_intents for intent in intents): - b_accepted = True - elif any(intent in rejection_intents for intent in intents): - return False - return b_accepted - - def _load_nlus(self) -> Tuple[NLU, NLU]: - """Returns (cached) user and agent NLU modules.""" - if self._user_nlu is None: - # NLU module for user utterances - user_nlu_config = Configuration("User NLU Configuration") - user_nlu_config.set_file(self.user_nlu_config_path) - self._user_nlu = get_NLU(user_nlu_config) - if self._agent_nlu is None: - # NLU module for agent utterances - agent_nlu_config = Configuration("Agent NLU Configuration") - agent_nlu_config.set_file(self.agent_nlu_config_path) - self._agent_nlu = get_NLU(agent_nlu_config) - return self._user_nlu, self._agent_nlu - - def _get_intent_lists(self, **kwargs: Any) -> Tuple[List[Intent], ...]: - """Builds intent lists from kwargs.""" - rec_labels = kwargs.get( - "recommendation_intent_labels", ["REC-S", "REC-E"] - ) - acc_labels = kwargs.get("acceptance_intent_labels", ["ACC"]) - rej_labels = kwargs.get("rejection_intent_labels", ["REJ"]) - return ( - [Intent(label) for label in rec_labels], - [Intent(label) for label in acc_labels], - [Intent(label) for label in rej_labels], - ) - - def _prepare( - self, dialogue: Dialogue, **kwargs: Any - ) -> Tuple[Dialogue, List[Intent], List[Intent], List[Intent]]: - """Annotates dialogue. - - Returns: - dialogue - rec_intents: Recommendation intents. - acc_intents: Acceptance intents. - rej_intents: Rejection intents. - """ - user_nlu, agent_nlu = self._load_nlus() - self._annotate_dialogue(dialogue, user_nlu, agent_nlu) - rec, acc, rej = self._get_intent_lists(**kwargs) - return dialogue, rec, acc, rej - - def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float: - """Computes the metric for a single dialogue.""" - raise NotImplementedError() - - -class SuccessRateMetric(UtilityMetricBase): - def __init__( - self, - user_nlu_config_path: str, - agent_nlu_config_path: str, - name: str = "success_rate", - ): - super().__init__(user_nlu_config_path, agent_nlu_config_path, name) - - def _assess_dialogue( - self, - dialogue: Dialogue, - recommendation_intents: List[Intent], - acceptance_intents: List[Intent], - rejection_intents: List[Intent], - ) -> int: - """Returns number of successful recommendation rounds.""" - rounds = self._get_recommendation_rounds( - dialogue, recommendation_intents - ) - return sum( - 1 - for round_utterances in rounds - if self._is_recommendation_accepted( - round_utterances, acceptance_intents, rejection_intents - ) - ) - - def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float: - dlg, rec, acc, rej = self._prepare(dialogue, **kwargs) - successful_rounds = self._assess_dialogue(dlg, rec, acc, rej) - return float(successful_rounds > 0) - - -class SuccessfulRecommendationRoundRatioMetric(UtilityMetricBase): - def __init__( - self, - user_nlu_config_path: str, - agent_nlu_config_path: str, - name: str = "successful_recommendation_round_ratio", - ): - super().__init__(user_nlu_config_path, agent_nlu_config_path, name) - - def _assess_dialogue( - self, - dialogue: Dialogue, - recommendation_intents: List[Intent], - acceptance_intents: List[Intent], - rejection_intents: List[Intent], - ) -> Tuple[int, int]: - """Returns successful rounds and total rounds.""" - rounds = self._get_recommendation_rounds( - dialogue, recommendation_intents - ) - successful_rounds = sum( - 1 - for round_utterances in rounds - if self._is_recommendation_accepted( - round_utterances, acceptance_intents, rejection_intents - ) - ) - return successful_rounds, len(rounds) - - def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float: - dlg, rec, acc, rej = self._prepare(dialogue, **kwargs) - successful_rounds, total_rounds = self._assess_dialogue( - dlg, rec, acc, rej - ) - return successful_rounds / total_rounds if total_rounds > 0 else 0.0 - - -class RewardPerDialogueLengthMetric(UtilityMetricBase): - def __init__( - self, - user_nlu_config_path: str, - agent_nlu_config_path: str, - name: str = "reward_per_dialogue_length", - ): - super().__init__(user_nlu_config_path, agent_nlu_config_path, name) - - def _assess_dialogue( - self, dialogue: Dialogue, acceptance_intents: List[Intent] - ) -> Tuple[int, int]: - """Returns accepted recommendations and dialogue length.""" - nb_accepted_recommendations = sum( - 1 - for utterance in dialogue.utterances - if utterance.participant == DialogueParticipant.USER - and any( - intent in acceptance_intents - for intent in utterance.get_intents() - ) - ) - return nb_accepted_recommendations, len(dialogue.utterances) - - def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float: - dlg, _, acc, _ = self._prepare(dialogue, **kwargs) - nb_accepted_recommendations, dialogue_length = self._assess_dialogue( - dlg, acc - ) - return nb_accepted_recommendations / dialogue_length From 5e1b3a6c4d25e0530f5cc13b44d10bcf63efbed6 Mon Sep 17 00:00:00 2001 From: Ksenia Blokhina Date: Tue, 10 Mar 2026 13:47:38 +0100 Subject: [PATCH 22/38] fixes --- .../evaluation/reward_per_dialogue_length_metric.py | 8 +------- usersimcrs/evaluation/success_rate_metric.py | 9 +-------- .../successful_recommendation_round_ratio_metric.py | 8 +------- 3 files changed, 3 insertions(+), 22 deletions(-) diff --git a/usersimcrs/evaluation/reward_per_dialogue_length_metric.py b/usersimcrs/evaluation/reward_per_dialogue_length_metric.py index c094e7a7..463a5508 100644 --- a/usersimcrs/evaluation/reward_per_dialogue_length_metric.py +++ b/usersimcrs/evaluation/reward_per_dialogue_length_metric.py @@ -1,4 +1,4 @@ -"""Reward-per-Dialogue-Length (RDL) metric implementation. +"""Reward-per-Dialogue-Length metric implementation. Evaluates the ratio of accepted recommendations to total dialogue length. """ @@ -15,12 +15,6 @@ class RewardPerDialogueLengthMetric(BaseMetric): - """Measures accepted recommendations relative to dialogue length. - - Returns the number of accepted recommendations divided by the total number - of utterances. - """ - def __init__( self, user_nlu_config_path: str, diff --git a/usersimcrs/evaluation/success_rate_metric.py b/usersimcrs/evaluation/success_rate_metric.py index 3c689c71..478a7afa 100644 --- a/usersimcrs/evaluation/success_rate_metric.py +++ b/usersimcrs/evaluation/success_rate_metric.py @@ -1,4 +1,4 @@ -"""Success Rate (SR) metric implementation. +"""Success Rate metric implementation. Evaluates whether at least one recommendation was accepted during a dialogue. """ @@ -18,13 +18,6 @@ class SuccessRateMetric(BaseMetric): - """Measures whether a dialogue contains at least one accepted - recommendation. - - Returns 1.0 if at least one recommendation round was successful, - 0.0 otherwise. - """ - def __init__( self, user_nlu_config_path: str, diff --git a/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py b/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py index b8de7013..9d2156c0 100644 --- a/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py +++ b/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py @@ -1,4 +1,4 @@ -"""Successful Recommendation Round Ratio (SRRR) metric implementation. +"""Successful Recommendation Round Ratio metric implementation. Evaluates the ratio of accepted recommendation rounds to total recommendation rounds in a dialogue. @@ -19,12 +19,6 @@ class SuccessfulRecommendationRoundRatioMetric(BaseMetric): - """Measures the fraction of recommendation rounds that were accepted. - - Returns a value between 0.0 and 1.0 (or 0.0 when there are no recommendation - rounds). - """ - def __init__( self, user_nlu_config_path: str, From 558b5885907908b8957df473d38e944f17583075 Mon Sep 17 00:00:00 2001 From: Ksenia Blokhina Date: Tue, 10 Mar 2026 15:09:58 +0100 Subject: [PATCH 23/38] made fixes --- tests/evaluation/test_utility_metric.py | 4 +- usersimcrs/evaluation/dialogue_annotation.py | 21 ++++++++ .../reward_per_dialogue_length_metric.py | 46 ++++++++++------ usersimcrs/evaluation/success_rate_metric.py | 52 ++++++++++++------- ...ssful_recommendation_round_ratio_metric.py | 38 ++++++++++---- 5 files changed, 114 insertions(+), 47 deletions(-) diff --git a/tests/evaluation/test_utility_metric.py b/tests/evaluation/test_utility_metric.py index 4862ba6c..e15f7bc3 100644 --- a/tests/evaluation/test_utility_metric.py +++ b/tests/evaluation/test_utility_metric.py @@ -61,7 +61,7 @@ def test_success_rate_evaluate_dialogue( "usersimcrs.evaluation.success_rate_metric.prepare_dialogue", return_value=(dialogue, [], [], [], _MOCK_NLU, _MOCK_NLU), ), - patch.object(SuccessRateMetric, "_assess_dialogue", return_value=1), + patch.object(SuccessRateMetric, "_assess_dialogue", return_value=True), ): assert success_rate_metric.evaluate_dialogue(dialogue) == 1.0 @@ -76,7 +76,7 @@ def test_success_rate_evaluate_dialogue_unsuccessful( "usersimcrs.evaluation.success_rate_metric.prepare_dialogue", return_value=(dialogue, [], [], [], _MOCK_NLU, _MOCK_NLU), ), - patch.object(SuccessRateMetric, "_assess_dialogue", return_value=0), + patch.object(SuccessRateMetric, "_assess_dialogue", return_value=False), ): assert success_rate_metric.evaluate_dialogue(dialogue) == 0.0 diff --git a/usersimcrs/evaluation/dialogue_annotation.py b/usersimcrs/evaluation/dialogue_annotation.py index 06bc2572..e431beab 100644 --- a/usersimcrs/evaluation/dialogue_annotation.py +++ b/usersimcrs/evaluation/dialogue_annotation.py @@ -118,6 +118,27 @@ def get_intent_lists( ) +def annotate_dialogues( + dialogues: List[Dialogue], + user_nlu_config_path: str, + agent_nlu_config_path: str, +) -> List[Dialogue]: + """Annotates a batch of dialogues, loading NLU modules once. + + Args: + dialogues: Dialogues to annotate. + user_nlu_config_path: Path to user NLU configuration file. + agent_nlu_config_path: Path to agent NLU configuration file. + + Returns: + The same dialogue objects with annotated utterances. + """ + user_nlu, agent_nlu = load_nlus(user_nlu_config_path, agent_nlu_config_path) + for dialogue in dialogues: + annotate_dialogue(dialogue, user_nlu, agent_nlu) + return dialogues + + def get_recommendation_rounds( dialogue: Dialogue, recommendation_intents: List[Intent] ) -> List[List[AnnotatedUtterance]]: diff --git a/usersimcrs/evaluation/reward_per_dialogue_length_metric.py b/usersimcrs/evaluation/reward_per_dialogue_length_metric.py index 463a5508..5d6eab26 100644 --- a/usersimcrs/evaluation/reward_per_dialogue_length_metric.py +++ b/usersimcrs/evaluation/reward_per_dialogue_length_metric.py @@ -11,18 +11,25 @@ from dialoguekit.participant.participant import DialogueParticipant from usersimcrs.evaluation.base_metric import BaseMetric -from usersimcrs.evaluation.dialogue_annotation import prepare_dialogue +from usersimcrs.evaluation.dialogue_annotation import ( + get_intent_lists, + prepare_dialogue, +) class RewardPerDialogueLengthMetric(BaseMetric): def __init__( self, - user_nlu_config_path: str, - agent_nlu_config_path: str, + user_nlu_config_path: Optional[str] = None, + agent_nlu_config_path: Optional[str] = None, name: str = "reward_per_dialogue_length", ) -> None: """Initializes the reward-per-dialogue-length metric. + When NLU config paths are provided, dialogues are annotated + automatically. When omitted, dialogues must be pre-annotated + (e.g., via :func:`annotate_dialogues`). + Args: user_nlu_config_path: Path to user NLU configuration. agent_nlu_config_path: Path to agent NLU configuration. @@ -67,15 +74,24 @@ def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float: Returns: Ratio of accepted recommendations to total utterances. """ - dlg, _, acc, _, self._user_nlu, self._agent_nlu = prepare_dialogue( - dialogue, - self._user_nlu_config_path, - self._agent_nlu_config_path, - self._user_nlu, - self._agent_nlu, - **kwargs, - ) - nb_accepted_recommendations, dialogue_length = self._assess_dialogue( - dlg, acc - ) - return nb_accepted_recommendations / dialogue_length + if self._user_nlu_config_path and self._agent_nlu_config_path: + ( + dialogue, + _, + acc, + _, + self._user_nlu, + self._agent_nlu, + ) = prepare_dialogue( + dialogue, + self._user_nlu_config_path, + self._agent_nlu_config_path, + self._user_nlu, + self._agent_nlu, + **kwargs, + ) + else: + _, acc, _ = get_intent_lists(**kwargs) + + nb_accepted, dialogue_length = self._assess_dialogue(dialogue, acc) + return nb_accepted / dialogue_length diff --git a/usersimcrs/evaluation/success_rate_metric.py b/usersimcrs/evaluation/success_rate_metric.py index 478a7afa..cc9c192c 100644 --- a/usersimcrs/evaluation/success_rate_metric.py +++ b/usersimcrs/evaluation/success_rate_metric.py @@ -11,6 +11,7 @@ from usersimcrs.evaluation.base_metric import BaseMetric from usersimcrs.evaluation.dialogue_annotation import ( + get_intent_lists, get_recommendation_rounds, is_recommendation_accepted, prepare_dialogue, @@ -20,12 +21,16 @@ class SuccessRateMetric(BaseMetric): def __init__( self, - user_nlu_config_path: str, - agent_nlu_config_path: str, + user_nlu_config_path: Optional[str] = None, + agent_nlu_config_path: Optional[str] = None, name: str = "success_rate", ) -> None: """Initializes the success rate metric. + When NLU config paths are provided, dialogues are annotated + automatically. When omitted, dialogues must be pre-annotated + (e.g., via :func:`annotate_dialogues`). + Args: user_nlu_config_path: Path to user NLU configuration. agent_nlu_config_path: Path to agent NLU configuration. @@ -43,8 +48,8 @@ def _assess_dialogue( recommendation_intents: List[Intent], acceptance_intents: List[Intent], rejection_intents: List[Intent], - ) -> int: - """Returns number of successful recommendation rounds. + ) -> bool: + """Checks whether at least one recommendation round was accepted. Args: dialogue: Annotated dialogue. @@ -53,15 +58,14 @@ def _assess_dialogue( rejection_intents: Intents that signal rejection. Returns: - Number of recommendation rounds that were accepted. + True if at least one round was accepted, False otherwise. """ rounds = get_recommendation_rounds(dialogue, recommendation_intents) - return sum( - 1 - for round_utterances in rounds - if is_recommendation_accepted( + return any( + is_recommendation_accepted( round_utterances, acceptance_intents, rejection_intents ) + for round_utterances in rounds ) def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float: @@ -74,13 +78,23 @@ def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float: Returns: 1.0 if at least one recommendation was accepted, 0.0 otherwise. """ - dlg, rec, acc, rej, self._user_nlu, self._agent_nlu = prepare_dialogue( - dialogue, - self._user_nlu_config_path, - self._agent_nlu_config_path, - self._user_nlu, - self._agent_nlu, - **kwargs, - ) - successful_rounds = self._assess_dialogue(dlg, rec, acc, rej) - return float(successful_rounds > 0) + if self._user_nlu_config_path and self._agent_nlu_config_path: + ( + dialogue, + rec, + acc, + rej, + self._user_nlu, + self._agent_nlu, + ) = prepare_dialogue( + dialogue, + self._user_nlu_config_path, + self._agent_nlu_config_path, + self._user_nlu, + self._agent_nlu, + **kwargs, + ) + else: + rec, acc, rej = get_intent_lists(**kwargs) + + return float(self._assess_dialogue(dialogue, rec, acc, rej)) diff --git a/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py b/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py index 9d2156c0..00fa5c0b 100644 --- a/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py +++ b/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py @@ -12,6 +12,7 @@ from usersimcrs.evaluation.base_metric import BaseMetric from usersimcrs.evaluation.dialogue_annotation import ( + get_intent_lists, get_recommendation_rounds, is_recommendation_accepted, prepare_dialogue, @@ -21,12 +22,16 @@ class SuccessfulRecommendationRoundRatioMetric(BaseMetric): def __init__( self, - user_nlu_config_path: str, - agent_nlu_config_path: str, + user_nlu_config_path: Optional[str] = None, + agent_nlu_config_path: Optional[str] = None, name: str = "successful_recommendation_round_ratio", ) -> None: """Initializes the successful recommendation round ratio metric. + When NLU config paths are provided, dialogues are annotated + automatically. When omitted, dialogues must be pre-annotated + (e.g., via :func:`annotate_dialogues`). + Args: user_nlu_config_path: Path to user NLU configuration. agent_nlu_config_path: Path to agent NLU configuration. @@ -77,15 +82,26 @@ def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float: Ratio of accepted recommendation rounds to total rounds, or 0.0 if there are no recommendation rounds. """ - dlg, rec, acc, rej, self._user_nlu, self._agent_nlu = prepare_dialogue( - dialogue, - self._user_nlu_config_path, - self._agent_nlu_config_path, - self._user_nlu, - self._agent_nlu, - **kwargs, - ) + if self._user_nlu_config_path and self._agent_nlu_config_path: + ( + dialogue, + rec, + acc, + rej, + self._user_nlu, + self._agent_nlu, + ) = prepare_dialogue( + dialogue, + self._user_nlu_config_path, + self._agent_nlu_config_path, + self._user_nlu, + self._agent_nlu, + **kwargs, + ) + else: + rec, acc, rej = get_intent_lists(**kwargs) + successful_rounds, total_rounds = self._assess_dialogue( - dlg, rec, acc, rej + dialogue, rec, acc, rej ) return successful_rounds / total_rounds if total_rounds > 0 else 0.0 From 9146421211be4931af36588230e55c38c644051d Mon Sep 17 00:00:00 2001 From: Ksenia Blokhina Date: Tue, 10 Mar 2026 15:57:18 +0100 Subject: [PATCH 24/38] remove class from other pr --- usersimcrs/evaluation/__init__.py | 5 --- usersimcrs/evaluation/base_metric.py | 48 ---------------------------- 2 files changed, 53 deletions(-) delete mode 100644 usersimcrs/evaluation/__init__.py delete mode 100644 usersimcrs/evaluation/base_metric.py diff --git a/usersimcrs/evaluation/__init__.py b/usersimcrs/evaluation/__init__.py deleted file mode 100644 index c55a4339..00000000 --- a/usersimcrs/evaluation/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -"""Evaluation metrics for dialogue systems.""" - -from usersimcrs.evaluation.base_metric import BaseMetric - -__all__ = ["BaseMetric"] diff --git a/usersimcrs/evaluation/base_metric.py b/usersimcrs/evaluation/base_metric.py deleted file mode 100644 index c99399a2..00000000 --- a/usersimcrs/evaluation/base_metric.py +++ /dev/null @@ -1,48 +0,0 @@ -"""Abstract base class for dialogue evaluation metrics.""" - -from abc import ABC, abstractmethod -from typing import Any, Dict, List -from dialoguekit.core.dialogue import Dialogue - - -class BaseMetric(ABC): - def __init__(self, name: str) -> None: - """Initializes the metric. - - Args: - name: Metric name. - """ - self.name = name - - @abstractmethod - def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float: - """Computes the metric for a single dialogue. - - Args: - dialogue: Single dialogue to score. - **kwargs: Additional arguments specific to the metric. - - Raises: - NotImplementedError: When not implemented by a subclass. - - Returns: - Score for the dialogue. - """ - raise NotImplementedError() - - def evaluate_dialogues( - self, dialogues: List[Dialogue], **kwargs: Any - ) -> Dict[str, float]: - """Computes the metric for every dialogue in a given list. - - Args: - dialogues: Dialogues. - **kwargs: Additional arguments specific to the metric. - - Returns: - Dictionary with result per dialogue. Keys are conversation IDs. - """ - return { - dialogue.conversation_id: self.evaluate_dialogue(dialogue, **kwargs) - for dialogue in dialogues - } From fe8ea9c30343a29c55558df6d62c13619b2bd288 Mon Sep 17 00:00:00 2001 From: Ksenia Blokhina Date: Tue, 10 Mar 2026 16:23:33 +0100 Subject: [PATCH 25/38] changes after new structure --- usersimcrs/evaluation/main.py | 58 +++++++++++++++++------------------ 1 file changed, 28 insertions(+), 30 deletions(-) diff --git a/usersimcrs/evaluation/main.py b/usersimcrs/evaluation/main.py index 3696b1f9..fa1cf048 100644 --- a/usersimcrs/evaluation/main.py +++ b/usersimcrs/evaluation/main.py @@ -5,7 +5,7 @@ import os from collections import defaultdict from statistics import mean, stdev -from typing import Dict, List, Mapping, Sequence +from typing import Any, Dict, List, Mapping, Sequence from dialoguekit.core.dialogue import Dialogue from dialoguekit.nlu.models.satisfaction_classifier import ( @@ -14,16 +14,25 @@ from dialoguekit.utils.dialogue_reader import json_to_dialogues from usersimcrs.evaluation.base_metric import BaseMetric +from usersimcrs.evaluation.dialogue_annotation import annotate_dialogues from usersimcrs.evaluation.quality_metric import QualityMetric from usersimcrs.evaluation.quality_rubrics import QualityRubrics from usersimcrs.evaluation.satisfaction_metric import SatisfactionMetric -from usersimcrs.evaluation.utility_metric import ( +from usersimcrs.evaluation.reward_per_dialogue_length_metric import ( RewardPerDialogueLengthMetric, - SuccessRateMetric, +) +from usersimcrs.evaluation.success_rate_metric import SuccessRateMetric +from usersimcrs.evaluation.successful_recommendation_round_ratio_metric import ( SuccessfulRecommendationRoundRatioMetric, ) from usersimcrs.llm_interfaces.ollama_interface import OllamaLLMInterface +UTILITY_METRICS = { + "success_rate", + "successful_recommendation_round_ratio", + "reward_per_dialogue_length", +} + SUPPORTED_METRICS = [ "quality", "satisfaction", @@ -115,27 +124,23 @@ def _validate_args(args: argparse.Namespace) -> None: "The --ollama_config argument is required when using quality." ) - utility_metrics = { - "success_rate", - "successful_recommendation_round_ratio", - "reward_per_dialogue_length", - } - if utility_metrics.intersection(set(args.metrics)): + if UTILITY_METRICS.intersection(set(args.metrics)): if not args.user_nlu_config or not args.agent_nlu_config: raise ValueError( "Both --user_nlu_config and --agent_nlu_config are required " "for utility metrics." ) + supported_aspect_names = [aspect.name for aspect in QualityRubrics] invalid_aspects = [ aspect for aspect in args.quality_aspects - if aspect not in [enum_aspect.name for enum_aspect in QualityRubrics] + if aspect not in supported_aspect_names ] if invalid_aspects: raise ValueError( f"Unknown quality aspect(s): {invalid_aspects}. " - f"Supported aspects: {[aspect.name for aspect in QualityRubrics]}" + f"Supported aspects: {supported_aspect_names}" ) @@ -153,22 +158,13 @@ def _build_metric_registry(args: argparse.Namespace) -> Dict[str, BaseMetric]: classifier=SatisfactionClassifierSVM() ) if "success_rate" in args.metrics: - registry["success_rate"] = SuccessRateMetric( - user_nlu_config_path=args.user_nlu_config, - agent_nlu_config_path=args.agent_nlu_config, - ) + registry["success_rate"] = SuccessRateMetric() if "successful_recommendation_round_ratio" in args.metrics: registry[ "successful_recommendation_round_ratio" - ] = SuccessfulRecommendationRoundRatioMetric( - user_nlu_config_path=args.user_nlu_config, - agent_nlu_config_path=args.agent_nlu_config, - ) + ] = SuccessfulRecommendationRoundRatioMetric() if "reward_per_dialogue_length" in args.metrics: - registry["reward_per_dialogue_length"] = RewardPerDialogueLengthMetric( - user_nlu_config_path=args.user_nlu_config, - agent_nlu_config_path=args.agent_nlu_config, - ) + registry["reward_per_dialogue_length"] = RewardPerDialogueLengthMetric() return registry @@ -204,7 +200,7 @@ def _evaluate_metric( ) -> Dict[str, object]: """Runs one metric and returns per-dialogue scores and summary.""" if metric_name == "quality": - per_aspect: Dict[str, Dict[str, Dict[str, float]]] = {} + per_aspect: Dict[str, Dict[str, Any]] = {} for aspect in args.quality_aspects: per_dialogue = metric.evaluate_dialogues( list(dialogues), @@ -219,11 +215,7 @@ def _evaluate_metric( return {"aspects": per_aspect} eval_kwargs = {} - if metric_name in { - "success_rate", - "successful_recommendation_round_ratio", - "reward_per_dialogue_length", - }: + if metric_name in UTILITY_METRICS: eval_kwargs = { "recommendation_intent_labels": args.recommendation_intent_labels, "acceptance_intent_labels": args.accept_intent_labels, @@ -271,9 +263,15 @@ def main() -> None: _validate_args(args) dialogues = json_to_dialogues(args.dialogues) + + if UTILITY_METRICS.intersection(set(args.metrics)): + annotate_dialogues( + dialogues, args.user_nlu_config, args.agent_nlu_config + ) + metric_registry = _build_metric_registry(args) - results: Dict[str, object] = { + results: Dict[str, Any] = { "dialogues_path": args.dialogues, "metrics_requested": args.metrics, "metrics": {}, From d749156c0a225b039452792698f47ef53d6e475a Mon Sep 17 00:00:00 2001 From: Ksenia Blokhina Date: Tue, 10 Mar 2026 16:25:07 +0100 Subject: [PATCH 26/38] remove main --- usersimcrs/evaluation/main.py | 303 ---------------------------------- 1 file changed, 303 deletions(-) delete mode 100644 usersimcrs/evaluation/main.py diff --git a/usersimcrs/evaluation/main.py b/usersimcrs/evaluation/main.py deleted file mode 100644 index 3372093e..00000000 --- a/usersimcrs/evaluation/main.py +++ /dev/null @@ -1,303 +0,0 @@ -"""Unified script for evaluating dialogues with selected metrics.""" - -import argparse -import json -import os -from collections import defaultdict -from statistics import mean, stdev -from typing import Dict, List, Mapping, Sequence - -from dialoguekit.core.dialogue import Dialogue -from dialoguekit.nlu.models.satisfaction_classifier import ( - SatisfactionClassifierSVM, -) -from dialoguekit.utils.dialogue_reader import json_to_dialogues - -from usersimcrs.evaluation.base_metric import BaseMetric -from usersimcrs.evaluation.quality_metric import QualityMetric -from usersimcrs.evaluation.quality_rubrics import QualityRubrics -from usersimcrs.evaluation.satisfaction_metric import SatisfactionMetric -from usersimcrs.evaluation.reward_per_dialogue_length_metric import ( - RewardPerDialogueLengthMetric, -) -from usersimcrs.evaluation.success_rate_metric import SuccessRateMetric -from usersimcrs.evaluation.successful_recommendation_round_ratio_metric import ( - SuccessfulRecommendationRoundRatioMetric, -) -from usersimcrs.llm_interfaces.ollama_interface import OllamaLLMInterface - -SUPPORTED_METRICS = [ - "quality", - "satisfaction", - "success_rate", - "successful_recommendation_round_ratio", - "reward_per_dialogue_length", -] - - -def parse_args() -> argparse.Namespace: - """Parses command-line arguments.""" - parser = argparse.ArgumentParser(prog="usersimcrs.evaluation.main") - parser.add_argument( - "--dialogues", - type=str, - required=True, - help="Path to the dialogues JSON file.", - ) - parser.add_argument( - "--metrics", - nargs="+", - required=True, - choices=SUPPORTED_METRICS, - help="List of metrics to compute.", - ) - parser.add_argument( - "--output", - type=str, - required=True, - help="Path to save evaluation results as JSON.", - ) - parser.add_argument( - "--ollama_config", - type=str, - help="Path to Ollama config file (required when quality is selected).", - ) - parser.add_argument( - "--quality_aspects", - nargs="+", - default=[aspect.name for aspect in QualityRubrics], - help=( - "Quality aspects to evaluate. " - "Defaults to all aspects in QualityRubrics." - ), - ) - parser.add_argument( - "--user_nlu_config", - type=str, - help=( - "Path to user NLU config (required for utility metrics: " - "success_rate, successful_recommendation_round_ratio, " - "reward_per_dialogue_length)." - ), - ) - parser.add_argument( - "--agent_nlu_config", - type=str, - help=( - "Path to agent NLU config (required for utility metrics: " - "success_rate, successful_recommendation_round_ratio, " - "reward_per_dialogue_length)." - ), - ) - parser.add_argument( - "--reject_intent_labels", - nargs="+", - default=["REJ"], - help="Intent labels corresponding to rejection.", - ) - parser.add_argument( - "--accept_intent_labels", - nargs="+", - default=["ACC"], - help="Intent labels corresponding to acceptance.", - ) - parser.add_argument( - "--recommendation_intent_labels", - nargs="+", - default=["REC-S", "REC-E"], - help="Intent labels corresponding to recommendation.", - ) - return parser.parse_args() - - -def _validate_args(args: argparse.Namespace) -> None: - """Validates metric-specific CLI requirements.""" - if "quality" in args.metrics and not args.ollama_config: - raise ValueError( - "The --ollama_config argument is required when using quality." - ) - - utility_metrics = { - "success_rate", - "successful_recommendation_round_ratio", - "reward_per_dialogue_length", - } - if utility_metrics.intersection(set(args.metrics)): - if not args.user_nlu_config or not args.agent_nlu_config: - raise ValueError( - "Both --user_nlu_config and --agent_nlu_config are required " - "for utility metrics." - ) - - invalid_aspects = [ - aspect - for aspect in args.quality_aspects - if aspect not in [enum_aspect.name for enum_aspect in QualityRubrics] - ] - if invalid_aspects: - raise ValueError( - f"Unknown quality aspect(s): {invalid_aspects}. " - f"Supported aspects: {[aspect.name for aspect in QualityRubrics]}" - ) - - -def _build_metric_registry(args: argparse.Namespace) -> Dict[str, BaseMetric]: - """Builds metric instances keyed by metric name.""" - registry: Dict[str, BaseMetric] = {} - if "quality" in args.metrics: - llm_interface = OllamaLLMInterface( - configuration_path=args.ollama_config, - default_response="", - ) - registry["quality"] = QualityMetric(llm_interface=llm_interface) - if "satisfaction" in args.metrics: - registry["satisfaction"] = SatisfactionMetric( - classifier=SatisfactionClassifierSVM() - ) - if "success_rate" in args.metrics: - registry["success_rate"] = SuccessRateMetric( - user_nlu_config_path=args.user_nlu_config, - agent_nlu_config_path=args.agent_nlu_config, - ) - if "successful_recommendation_round_ratio" in args.metrics: - registry[ - "successful_recommendation_round_ratio" - ] = SuccessfulRecommendationRoundRatioMetric( - user_nlu_config_path=args.user_nlu_config, - agent_nlu_config_path=args.agent_nlu_config, - ) - if "reward_per_dialogue_length" in args.metrics: - registry["reward_per_dialogue_length"] = RewardPerDialogueLengthMetric( - user_nlu_config_path=args.user_nlu_config, - agent_nlu_config_path=args.agent_nlu_config, - ) - return registry - - -def _summarize_by_agent( - dialogues: Sequence[Dialogue], scores: Mapping[str, float] -) -> Dict[str, Dict[str, float]]: - """Returns aggregate statistics by agent.""" - conversation_to_agent = { - dialogue.conversation_id: dialogue.agent_id for dialogue in dialogues - } - grouped_scores: Dict[str, List[float]] = defaultdict(list) - for conversation_id, score in scores.items(): - agent_id = conversation_to_agent.get(conversation_id, "unknown") - grouped_scores[agent_id].append(score) - - summary: Dict[str, Dict[str, float]] = {} - for agent_id, agent_scores in grouped_scores.items(): - summary[agent_id] = { - "count": float(len(agent_scores)), - "min": min(agent_scores), - "max": max(agent_scores), - "mean": mean(agent_scores), - "stdev": stdev(agent_scores) if len(agent_scores) > 1 else 0.0, - } - return summary - - -def _evaluate_metric( - metric_name: str, - metric: BaseMetric, - dialogues: Sequence[Dialogue], - args: argparse.Namespace, -) -> Dict[str, object]: - """Runs one metric and returns per-dialogue scores and summary.""" - if metric_name == "quality": - per_aspect: Dict[str, Dict[str, Dict[str, float]]] = {} - for aspect in args.quality_aspects: - per_dialogue = metric.evaluate_dialogues( - list(dialogues), - aspect=aspect, - ) - per_aspect[aspect] = { - "per_dialogue": per_dialogue, - "summary_by_agent": _summarize_by_agent( - dialogues, per_dialogue - ), - } - return {"aspects": per_aspect} - - eval_kwargs = {} - if metric_name in { - "success_rate", - "successful_recommendation_round_ratio", - "reward_per_dialogue_length", - }: - eval_kwargs = { - "recommendation_intent_labels": args.recommendation_intent_labels, - "acceptance_intent_labels": args.accept_intent_labels, - "rejection_intent_labels": args.reject_intent_labels, - } - - per_dialogue_scores = metric.evaluate_dialogues( - list(dialogues), **eval_kwargs - ) - return { - "per_dialogue": per_dialogue_scores, - "summary_by_agent": _summarize_by_agent(dialogues, per_dialogue_scores), - } - - -def _print_brief_summary(results: Mapping[str, object]) -> None: - """Prints a concise summary in the terminal.""" - metric_results = results.get("metrics", {}) - if not isinstance(metric_results, dict): - return - for metric_name, metric_result in metric_results.items(): - print(f"Metric: {metric_name}") - if metric_name == "quality": - aspects = metric_result.get("aspects", {}) - for aspect_name, aspect_result in aspects.items(): - print(f" Aspect: {aspect_name}") - for agent_id, stats in aspect_result[ - "summary_by_agent" - ].items(): - print( - f" Agent: {agent_id} | mean={stats['mean']:.3f} " - f"stdev={stats['stdev']:.3f}" - ) - continue - - for agent_id, stats in metric_result["summary_by_agent"].items(): - print( - f" Agent: {agent_id} | mean={stats['mean']:.3f} " - f"stdev={stats['stdev']:.3f}" - ) - - -def main() -> None: - args = parse_args() - _validate_args(args) - - dialogues = json_to_dialogues(args.dialogues) - metric_registry = _build_metric_registry(args) - - results: Dict[str, object] = { - "dialogues_path": args.dialogues, - "metrics_requested": args.metrics, - "metrics": {}, - } - - for metric_name in args.metrics: - metric = metric_registry[metric_name] - results["metrics"][metric_name] = _evaluate_metric( - metric_name, - metric, - dialogues, - args, - ) - - output_dir = os.path.dirname(args.output) - if output_dir: - os.makedirs(output_dir, exist_ok=True) - with open(args.output, "w") as f: - json.dump(results, f, indent=2) - - _print_brief_summary(results) - - -if __name__ == "__main__": - main() From 8f333e74d01184f83c3999da985d6a82248bc415 Mon Sep 17 00:00:00 2001 From: Ksenia Blokhina Date: Tue, 10 Mar 2026 17:10:42 +0100 Subject: [PATCH 27/38] changes --- usersimcrs/evaluation/dialogue_annotation.py | 57 +++++++++---------- usersimcrs/evaluation/quality_metric.py | 2 +- .../reward_per_dialogue_length_metric.py | 14 ++++- usersimcrs/evaluation/success_rate_metric.py | 19 ++++++- ...ssful_recommendation_round_ratio_metric.py | 15 ++++- usersimcrs/evaluation/utility_base_metric.py | 5 +- 6 files changed, 68 insertions(+), 44 deletions(-) diff --git a/usersimcrs/evaluation/dialogue_annotation.py b/usersimcrs/evaluation/dialogue_annotation.py index f0d75fea..31c06018 100644 --- a/usersimcrs/evaluation/dialogue_annotation.py +++ b/usersimcrs/evaluation/dialogue_annotation.py @@ -24,7 +24,7 @@ def annotate_dialogue( """Annotates utterances with dialogue acts. Each utterance that is not already an AnnotatedUtterance is converted to - one. Utterances that already carry dialogue acts are left untouched. + one. Utterances that already carry dialogue acts are left untouched. Args: dialogue: Dialogue to be annotated. @@ -59,35 +59,29 @@ def annotate_dialogue( return dialogue -def load_nlus( - user_nlu_config_path: str, - agent_nlu_config_path: str, - cached_user_nlu: Optional[NLU] = None, - cached_agent_nlu: Optional[NLU] = None, -) -> Tuple[NLU, NLU]: - """Loads user and agent NLU modules. +def load_nlu( + nlu_config_path: str, + config_name: str = "NLU Configuration", + cached_nlu: Optional[NLU] = None, +) -> NLU: + """Loads a single NLU module. - Returns cached instances when provided, otherwise creates new ones - from the given configuration files. + Returns the cached instance when provided, otherwise creates a new one + from the given configuration file. Args: - user_nlu_config_path: Path to user NLU configuration file. - agent_nlu_config_path: Path to agent NLU configuration file. - cached_user_nlu: Previously loaded user NLU module. - cached_agent_nlu: Previously loaded agent NLU module. + nlu_config_path: Path to the NLU configuration file. + config_name: Name for the Configuration instance. + cached_nlu: Previously loaded NLU module. Returns: - Tuple of (user_nlu, agent_nlu) modules. + NLU module. """ - if cached_user_nlu is None: - user_nlu_config = Configuration("User NLU Configuration") - user_nlu_config.set_file(user_nlu_config_path) - cached_user_nlu = get_NLU(user_nlu_config) - if cached_agent_nlu is None: - agent_nlu_config = Configuration("Agent NLU Configuration") - agent_nlu_config.set_file(agent_nlu_config_path) - cached_agent_nlu = get_NLU(agent_nlu_config) - return cached_user_nlu, cached_agent_nlu + if cached_nlu is not None: + return cached_nlu + nlu_config = Configuration(config_name) + nlu_config.set_file(nlu_config_path) + return get_NLU(nlu_config) def get_intent_lists( @@ -130,7 +124,8 @@ def annotate_dialogues( user_nlu_config_path: Path to user NLU configuration file. agent_nlu_config_path: Path to agent NLU configuration file. """ - user_nlu, agent_nlu = load_nlus(user_nlu_config_path, agent_nlu_config_path) + user_nlu = load_nlu(user_nlu_config_path, "User NLU Configuration") + agent_nlu = load_nlu(agent_nlu_config_path, "Agent NLU Configuration") for dialogue in dialogues: annotate_dialogue(dialogue, user_nlu, agent_nlu) @@ -175,7 +170,7 @@ def prepare_dialogue( ) -> Tuple[Dialogue, List[Intent], List[Intent], List[Intent], NLU, NLU]: """Loads NLU modules, annotates a dialogue, and builds intent lists. - Combines :func:`load_nlus`, :func:`annotate_dialogue`, and + Combines :func:`load_nlu`, :func:`annotate_dialogue`, and :func:`get_intent_lists` into a single convenience call. Args: @@ -191,11 +186,11 @@ def prepare_dialogue( Tuple of (annotated dialogue, recommendation intents, acceptance intents, rejection intents, user NLU, agent NLU). """ - user_nlu, agent_nlu = load_nlus( - user_nlu_config_path, - agent_nlu_config_path, - cached_user_nlu, - cached_agent_nlu, + user_nlu = load_nlu( + user_nlu_config_path, "User NLU Configuration", cached_user_nlu + ) + agent_nlu = load_nlu( + agent_nlu_config_path, "Agent NLU Configuration", cached_agent_nlu ) annotate_dialogue(dialogue, user_nlu, agent_nlu) rec, acc, rej = get_intent_lists(**kwargs) diff --git a/usersimcrs/evaluation/quality_metric.py b/usersimcrs/evaluation/quality_metric.py index 3198c77b..acbfc595 100644 --- a/usersimcrs/evaluation/quality_metric.py +++ b/usersimcrs/evaluation/quality_metric.py @@ -46,7 +46,7 @@ def __init__( Args: llm_interface: LLM interface used for scoring. - name: Metric name. + name: Metric name. Defaults to "quality". """ super().__init__(name) self.llm_interface = llm_interface diff --git a/usersimcrs/evaluation/reward_per_dialogue_length_metric.py b/usersimcrs/evaluation/reward_per_dialogue_length_metric.py index a630b7cd..1727cc32 100644 --- a/usersimcrs/evaluation/reward_per_dialogue_length_metric.py +++ b/usersimcrs/evaluation/reward_per_dialogue_length_metric.py @@ -60,11 +60,19 @@ def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float: Args: dialogue: Dialogue to evaluate. - **kwargs: Optional intent label overrides. + **kwargs: Optional intent label overrides: + - recommendation_intent_labels: Labels for recommendation + intents. Defaults to ["REC-S", "REC-E"]. + - acceptance_intent_labels: Labels for acceptance intents. + Defaults to ["ACC"]. + - rejection_intent_labels: Labels for rejection intents. + Defaults to ["REJ"]. Returns: Ratio of accepted recommendations to total utterances. """ - _, acc, _ = self._resolve_intents(dialogue, **kwargs) - nb_accepted, dialogue_length = self._assess_dialogue(dialogue, acc) + _, acc, _ = self._resolve_intents(dialogue=dialogue, **kwargs) + nb_accepted, dialogue_length = self._assess_dialogue( + dialogue=dialogue, acceptance_intents=acc + ) return nb_accepted / dialogue_length diff --git a/usersimcrs/evaluation/success_rate_metric.py b/usersimcrs/evaluation/success_rate_metric.py index c4735111..e0bb9566 100644 --- a/usersimcrs/evaluation/success_rate_metric.py +++ b/usersimcrs/evaluation/success_rate_metric.py @@ -66,10 +66,23 @@ def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float: Args: dialogue: Dialogue to evaluate. - **kwargs: Optional intent label overrides. + **kwargs: Optional intent label overrides: + - recommendation_intent_labels: Labels for recommendation + intents. Defaults to ["REC-S", "REC-E"]. + - acceptance_intent_labels: Labels for acceptance intents. + Defaults to ["ACC"]. + - rejection_intent_labels: Labels for rejection intents. + Defaults to ["REJ"]. Returns: 1.0 if at least one recommendation was accepted, 0.0 otherwise. """ - rec, acc, rej = self._resolve_intents(dialogue, **kwargs) - return float(self._assess_dialogue(dialogue, rec, acc, rej)) + rec, acc, rej = self._resolve_intents(dialogue=dialogue, **kwargs) + return float( + self._assess_dialogue( + dialogue=dialogue, + recommendation_intents=rec, + acceptance_intents=acc, + rejection_intents=rej, + ) + ) diff --git a/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py b/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py index dbd6c9f6..8f79a16b 100644 --- a/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py +++ b/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py @@ -69,14 +69,23 @@ def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float: Args: dialogue: Dialogue to evaluate. - **kwargs: Optional intent label overrides. + **kwargs: Optional intent label overrides: + - recommendation_intent_labels: Labels for recommendation + intents. Defaults to ``["REC-S", "REC-E"]``. + - acceptance_intent_labels: Labels for acceptance intents. + Defaults to ``["ACC"]``. + - rejection_intent_labels: Labels for rejection intents. + Defaults to ``["REJ"]``. Returns: Ratio of accepted recommendation rounds to total rounds, or 0.0 if there are no recommendation rounds. """ - rec, acc, rej = self._resolve_intents(dialogue, **kwargs) + rec, acc, rej = self._resolve_intents(dialogue=dialogue, **kwargs) successful_rounds, total_rounds = self._assess_dialogue( - dialogue, rec, acc, rej + dialogue=dialogue, + recommendation_intents=rec, + acceptance_intents=acc, + rejection_intents=rej, ) return successful_rounds / total_rounds if total_rounds > 0 else 0.0 diff --git a/usersimcrs/evaluation/utility_base_metric.py b/usersimcrs/evaluation/utility_base_metric.py index f4ca09e2..17618b65 100644 --- a/usersimcrs/evaluation/utility_base_metric.py +++ b/usersimcrs/evaluation/utility_base_metric.py @@ -1,4 +1,4 @@ -"""Base class for utility metrics that require NLU annotation.""" +"""Base class for dialogue annotation support.""" from abc import ABC from typing import Any, List, Optional, Tuple @@ -18,8 +18,7 @@ class UtilityBaseMetric(BaseMetric, ABC): """Shared base for metrics that optionally annotate dialogues via NLU. When NLU config paths are provided, dialogues are annotated automatically. - When omitted, dialogues must be pre-annotated (e.g., via - :func:`annotate_dialogues`). + When omitted, dialogues must be pre-annotated. """ def __init__( From 6e6e8b2b197b88b63be17efddf344383ea01a96d Mon Sep 17 00:00:00 2001 From: Ksenia Blokhina Date: Tue, 10 Mar 2026 17:23:35 +0100 Subject: [PATCH 28/38] changes --- tests/evaluation/__init__.py | 0 tests/evaluation/test_quality_metric.py | 115 +++++++++++++++++++ tests/evaluation/test_satisfaction_metric.py | 80 +++++++++++++ 3 files changed, 195 insertions(+) create mode 100644 tests/evaluation/__init__.py create mode 100644 tests/evaluation/test_quality_metric.py create mode 100644 tests/evaluation/test_satisfaction_metric.py diff --git a/tests/evaluation/__init__.py b/tests/evaluation/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/evaluation/test_quality_metric.py b/tests/evaluation/test_quality_metric.py new file mode 100644 index 00000000..3fdf40ec --- /dev/null +++ b/tests/evaluation/test_quality_metric.py @@ -0,0 +1,115 @@ +"""Tests for QualityMetric.""" + +from unittest.mock import MagicMock + +import pytest + +from dialoguekit.utils.dialogue_reader import json_to_dialogues + +from usersimcrs.evaluation.quality_metric import QualityMetric +from usersimcrs.evaluation.quality_rubrics import QualityRubrics +from usersimcrs.llm_interfaces.llm_interface import LLMInterface + + +@pytest.fixture +def dialogues(): + """Load test dialogues.""" + return json_to_dialogues( + "tests/data/annotated_dialogues.json", + agent_ids=["Agent"], + user_ids=["User"], + ) + + +@pytest.fixture +def mock_llm_interface(): + """Mock LLM interface.""" + return MagicMock(spec=LLMInterface) + + +@pytest.fixture +def quality_metric(mock_llm_interface): + """QualityMetric instance with mocked LLM.""" + return QualityMetric(llm_interface=mock_llm_interface) + + +def test_quality_metric_init(mock_llm_interface) -> None: + """Test QualityMetric initializes with correct name and LLM.""" + metric = QualityMetric(llm_interface=mock_llm_interface) + assert metric.name == "quality" + assert metric.llm_interface is mock_llm_interface + + +def test_quality_metric_custom_name(mock_llm_interface) -> None: + """Test QualityMetric accepts custom name.""" + metric = QualityMetric(llm_interface=mock_llm_interface, name="custom") + assert metric.name == "custom" + + +def test_get_prompt(quality_metric, dialogues) -> None: + """Test _get_prompt builds prompt with dialogue and rubric.""" + dialogue = dialogues[0] + prompt = quality_metric._get_prompt(QualityRubrics.FLUENCY, dialogue) + + assert "CONVERSATION HISTORY" in prompt + assert "USER: Utterance 1" in prompt + assert "ASSISTANT: Utterance 2" in prompt + assert "GRADING RUBRIC" in prompt + assert "Fluency" in prompt + assert '{"score"' in prompt + + +def test_get_prompt_all_aspects(quality_metric, dialogues) -> None: + """Test _get_prompt works for every quality aspect.""" + dialogue = dialogues[0] + for aspect in QualityRubrics: + prompt = quality_metric._get_prompt(aspect, dialogue) + assert aspect.value in prompt + + +def test_evaluate_dialogue_valid_response(quality_metric, dialogues) -> None: + """Test evaluate_dialogue parses a valid LLM JSON response.""" + quality_metric.llm_interface.get_llm_api_response.return_value = ( + '{"score": 4, "score_explanation": "Good fluency."}' + ) + score = quality_metric.evaluate_dialogue(dialogues[0], aspect="FLUENCY") + assert score == 4.0 + + +def test_evaluate_dialogue_all_aspects(quality_metric, dialogues) -> None: + """Test evaluate_dialogue succeeds for each aspect name.""" + quality_metric.llm_interface.get_llm_api_response.return_value = ( + '{"score": 3, "score_explanation": "Average."}' + ) + for aspect in QualityRubrics: + score = quality_metric.evaluate_dialogue( + dialogues[0], aspect=aspect.name + ) + assert score == 3.0 + + +def test_evaluate_dialogue_missing_score_key(quality_metric, dialogues) -> None: + """Test evaluate_dialogue returns 0.0 when 'score' key is missing.""" + quality_metric.llm_interface.get_llm_api_response.return_value = ( + '{"explanation": "No score field."}' + ) + score = quality_metric.evaluate_dialogue(dialogues[0], aspect="FLUENCY") + assert score == 0.0 + + +def test_evaluate_dialogue_unknown_aspect(quality_metric, dialogues) -> None: + """Test evaluate_dialogue raises KeyError for unsupported aspect.""" + with pytest.raises(KeyError, match="Unknown aspect"): + quality_metric.evaluate_dialogue(dialogues[0], aspect="NONEXISTENT") + + +def test_evaluate_dialogues(quality_metric, dialogues) -> None: + """Test evaluate_dialogues returns scores keyed by conversation ID.""" + quality_metric.llm_interface.get_llm_api_response.return_value = ( + '{"score": 5, "score_explanation": "Excellent."}' + ) + results = quality_metric.evaluate_dialogues(dialogues, aspect="OVERALL_SAT") + assert len(results) == len(dialogues) + for dialogue in dialogues: + assert dialogue.conversation_id in results + assert results[dialogue.conversation_id] == 5.0 diff --git a/tests/evaluation/test_satisfaction_metric.py b/tests/evaluation/test_satisfaction_metric.py new file mode 100644 index 00000000..bd857060 --- /dev/null +++ b/tests/evaluation/test_satisfaction_metric.py @@ -0,0 +1,80 @@ +"""Tests for SatisfactionMetric.""" + +from unittest.mock import MagicMock + +import pytest + +from dialoguekit.nlu.models.satisfaction_classifier import ( + SatisfactionClassifier, +) +from dialoguekit.utils.dialogue_reader import json_to_dialogues + +from usersimcrs.evaluation.satisfaction_metric import SatisfactionMetric + + +@pytest.fixture +def dialogues(): + """Load test dialogues.""" + return json_to_dialogues( + "tests/data/annotated_dialogues.json", + agent_ids=["Agent"], + user_ids=["User"], + ) + + +@pytest.fixture +def mock_classifier(): + """Mock satisfaction classifier.""" + return MagicMock(spec=SatisfactionClassifier) + + +@pytest.fixture +def satisfaction_metric(mock_classifier): + """SatisfactionMetric instance with mocked classifier.""" + return SatisfactionMetric(classifier=mock_classifier) + + +def test_satisfaction_metric_init(mock_classifier) -> None: + """Test SatisfactionMetric initializes with correct name.""" + metric = SatisfactionMetric(classifier=mock_classifier) + assert metric.name == "satisfaction" + assert metric.classifier is mock_classifier + + +def test_satisfaction_metric_custom_name(mock_classifier) -> None: + """Test SatisfactionMetric accepts custom name.""" + metric = SatisfactionMetric(classifier=mock_classifier, name="sat_v2") + assert metric.name == "sat_v2" + + +def test_evaluate_dialogue(satisfaction_metric, dialogues) -> None: + """Test evaluate_dialogue returns classifier score as float.""" + satisfaction_metric.classifier.classify_last_n_dialogue.return_value = 4 + score = satisfaction_metric.evaluate_dialogue(dialogues[0]) + assert score == 4.0 + classify = satisfaction_metric.classifier.classify_last_n_dialogue + classify.assert_called_once_with(dialogues[0], last_n=None) + + +def test_evaluate_dialogue_low_score(satisfaction_metric, dialogues) -> None: + """Test evaluate_dialogue with a low satisfaction score.""" + satisfaction_metric.classifier.classify_last_n_dialogue.return_value = 1 + score = satisfaction_metric.evaluate_dialogue(dialogues[0]) + assert score == 1.0 + + +def test_evaluate_dialogue_float_score(satisfaction_metric, dialogues) -> None: + """Test evaluate_dialogue handles fractional classifier output.""" + satisfaction_metric.classifier.classify_last_n_dialogue.return_value = 3.7 + score = satisfaction_metric.evaluate_dialogue(dialogues[0]) + assert score == pytest.approx(3.7) + + +def test_evaluate_dialogues(satisfaction_metric, dialogues) -> None: + """Test evaluate_dialogues returns scores keyed by conversation ID.""" + satisfaction_metric.classifier.classify_last_n_dialogue.return_value = 3 + results = satisfaction_metric.evaluate_dialogues(dialogues) + assert len(results) == len(dialogues) + for dialogue in dialogues: + assert dialogue.conversation_id in results + assert results[dialogue.conversation_id] == 3.0 From 1c802ffd1f39636fc696f8d4a415c8552f5da14c Mon Sep 17 00:00:00 2001 From: Ksenia Blokhina Date: Tue, 10 Mar 2026 17:40:02 +0100 Subject: [PATCH 29/38] changes --- tests/evaluation/test_utility_metric.py | 152 ++++++++++-------- usersimcrs/evaluation/dialogue_annotation.py | 37 ----- .../reward_per_dialogue_length_metric.py | 10 +- ...ssful_recommendation_round_ratio_metric.py | 12 +- usersimcrs/evaluation/utility_base_metric.py | 22 ++- 5 files changed, 101 insertions(+), 132 deletions(-) diff --git a/tests/evaluation/test_utility_metric.py b/tests/evaluation/test_utility_metric.py index 72e221fc..aff046bc 100644 --- a/tests/evaluation/test_utility_metric.py +++ b/tests/evaluation/test_utility_metric.py @@ -14,10 +14,8 @@ SuccessfulRecommendationRoundRatioMetric, ) -_MOCK_NLU = MagicMock() -_PREPARE_DIALOGUE_PATH = ( - "usersimcrs.evaluation.utility_base_metric.prepare_dialogue" -) +_LOAD_NLU_PATH = "usersimcrs.evaluation.utility_base_metric.load_nlu" +_ANNOTATE_PATH = "usersimcrs.evaluation.utility_base_metric.annotate_dialogue" @pytest.fixture @@ -38,88 +36,110 @@ def success_rate_metric(): ) -@pytest.fixture -def successful_round_ratio_metric(): - return SuccessfulRecommendationRoundRatioMetric( - user_nlu_config_path="dummy_user_nlu.yaml", - agent_nlu_config_path="dummy_agent_nlu.yaml", - ) - +def test_success_rate_init() -> None: + """Test SuccessRateMetric default and custom name.""" + metric = SuccessRateMetric() + assert metric.name == "success_rate" -@pytest.fixture -def reward_per_dialogue_length_metric(): - return RewardPerDialogueLengthMetric( - user_nlu_config_path="dummy_user_nlu.yaml", - agent_nlu_config_path="dummy_agent_nlu.yaml", - ) + metric = SuccessRateMetric(name="custom_sr") + assert metric.name == "custom_sr" def test_success_rate_evaluate_dialogue( success_rate_metric: SuccessRateMetric, dialogues ) -> None: - """Test SuccessRateMetric.evaluate_dialogue.""" + """Test SuccessRateMetric returns 1.0 for accepted dialogue.""" dialogue = dialogues[0] + mock_nlu = MagicMock() with ( - patch( - _PREPARE_DIALOGUE_PATH, - return_value=(dialogue, [], [], [], _MOCK_NLU, _MOCK_NLU), + patch(_LOAD_NLU_PATH, return_value=mock_nlu), + patch(_ANNOTATE_PATH), + patch.object( + SuccessRateMetric, + "_assess_dialogue", + return_value=True, ), - patch.object(SuccessRateMetric, "_assess_dialogue", return_value=True), ): assert success_rate_metric.evaluate_dialogue(dialogue) == 1.0 -def test_success_rate_evaluate_dialogue_unsuccessful( - success_rate_metric: SuccessRateMetric, dialogues -) -> None: - """Test SuccessRateMetric.evaluate_dialogue for failed dialogue.""" +def test_success_rate_without_nlu_paths(dialogues) -> None: + """Test SuccessRateMetric works on pre-annotated dialogues.""" + metric = SuccessRateMetric() dialogue = dialogues[0] - with ( - patch( - _PREPARE_DIALOGUE_PATH, - return_value=(dialogue, [], [], [], _MOCK_NLU, _MOCK_NLU), - ), - patch.object(SuccessRateMetric, "_assess_dialogue", return_value=False), + with patch.object(SuccessRateMetric, "_assess_dialogue", return_value=True): + assert metric.evaluate_dialogue(dialogue) == 1.0 + + +@pytest.fixture +def srrr_metric(): + return SuccessfulRecommendationRoundRatioMetric() + + +def test_srrr_init() -> None: + """Test SRRR metric default and custom name.""" + metric = SuccessfulRecommendationRoundRatioMetric() + assert metric.name == "successful_recommendation_round_ratio" + + metric = SuccessfulRecommendationRoundRatioMetric(name="srrr_v2") + assert metric.name == "srrr_v2" + + +def test_srrr_evaluate_dialogue(srrr_metric, dialogues) -> None: + """Test SRRR returns correct ratio.""" + dialogue = dialogues[0] + with patch.object( + SuccessfulRecommendationRoundRatioMetric, + "_assess_dialogue", + return_value=(1, 2), ): - assert success_rate_metric.evaluate_dialogue(dialogue) == 0.0 + assert srrr_metric.evaluate_dialogue(dialogue) == 0.5 -def test_successful_recommendation_round_ratio_evaluate_dialogue( - successful_round_ratio_metric: SuccessfulRecommendationRoundRatioMetric, - dialogues, -) -> None: - """Test SuccessfulRecommendationRoundRatioMetric.evaluate_dialogue.""" +def test_srrr_all_rounds_successful(srrr_metric, dialogues) -> None: + """Test SRRR returns 1.0 when all rounds accepted.""" dialogue = dialogues[0] - with ( - patch( - _PREPARE_DIALOGUE_PATH, - return_value=(dialogue, [], [], [], _MOCK_NLU, _MOCK_NLU), - ), - patch.object( - SuccessfulRecommendationRoundRatioMetric, - "_assess_dialogue", - return_value=(1, 2), - ), + with patch.object( + SuccessfulRecommendationRoundRatioMetric, + "_assess_dialogue", + return_value=(3, 3), ): - assert successful_round_ratio_metric.evaluate_dialogue(dialogue) == 0.5 + assert srrr_metric.evaluate_dialogue(dialogue) == 1.0 -def test_reward_per_dialogue_length_evaluate_dialogue( - reward_per_dialogue_length_metric: RewardPerDialogueLengthMetric, dialogues -) -> None: - """Test RewardPerDialogueLengthMetric.evaluate_dialogue.""" +def test_srrr_no_successful_rounds(srrr_metric, dialogues) -> None: + """Test SRRR returns 0.0 when no rounds are accepted.""" dialogue = dialogues[0] - with ( - patch( - _PREPARE_DIALOGUE_PATH, - return_value=(dialogue, [], [], [], _MOCK_NLU, _MOCK_NLU), - ), - patch.object( - RewardPerDialogueLengthMetric, - "_assess_dialogue", - return_value=(1, 10), - ), + with patch.object( + SuccessfulRecommendationRoundRatioMetric, + "_assess_dialogue", + return_value=(0, 4), + ): + assert srrr_metric.evaluate_dialogue(dialogue) == 0.0 + + +@pytest.fixture +def rdl_metric(): + return RewardPerDialogueLengthMetric() + + +def test_rdl_no_accepted(rdl_metric, dialogues) -> None: + """Test RDL returns 0.0 when no recommendations accepted.""" + dialogue = dialogues[0] + with patch.object( + RewardPerDialogueLengthMetric, + "_assess_dialogue", + return_value=(0, 7), + ): + assert rdl_metric.evaluate_dialogue(dialogue) == 0.0 + + +def test_rdl_multiple_accepted(rdl_metric, dialogues) -> None: + """Test RDL with several accepted recommendations.""" + dialogue = dialogues[0] + with patch.object( + RewardPerDialogueLengthMetric, + "_assess_dialogue", + return_value=(3, 10), ): - assert ( - reward_per_dialogue_length_metric.evaluate_dialogue(dialogue) == 0.1 - ) + assert rdl_metric.evaluate_dialogue(dialogue) == pytest.approx(0.3) diff --git a/usersimcrs/evaluation/dialogue_annotation.py b/usersimcrs/evaluation/dialogue_annotation.py index 31c06018..4d4ec9c6 100644 --- a/usersimcrs/evaluation/dialogue_annotation.py +++ b/usersimcrs/evaluation/dialogue_annotation.py @@ -160,43 +160,6 @@ def get_recommendation_rounds( return rounds -def prepare_dialogue( - dialogue: Dialogue, - user_nlu_config_path: str, - agent_nlu_config_path: str, - cached_user_nlu: Optional[NLU] = None, - cached_agent_nlu: Optional[NLU] = None, - **kwargs: Any, -) -> Tuple[Dialogue, List[Intent], List[Intent], List[Intent], NLU, NLU]: - """Loads NLU modules, annotates a dialogue, and builds intent lists. - - Combines :func:`load_nlu`, :func:`annotate_dialogue`, and - :func:`get_intent_lists` into a single convenience call. - - Args: - dialogue: Dialogue to prepare. - user_nlu_config_path: Path to user NLU configuration file. - agent_nlu_config_path: Path to agent NLU configuration file. - cached_user_nlu: Previously loaded user NLU module (avoids reload). - cached_agent_nlu: Previously loaded agent NLU module (avoids reload). - **kwargs: Optional intent label overrides forwarded to - :func:`get_intent_lists`. - - Returns: - Tuple of (annotated dialogue, recommendation intents, - acceptance intents, rejection intents, user NLU, agent NLU). - """ - user_nlu = load_nlu( - user_nlu_config_path, "User NLU Configuration", cached_user_nlu - ) - agent_nlu = load_nlu( - agent_nlu_config_path, "Agent NLU Configuration", cached_agent_nlu - ) - annotate_dialogue(dialogue, user_nlu, agent_nlu) - rec, acc, rej = get_intent_lists(**kwargs) - return dialogue, rec, acc, rej, user_nlu, agent_nlu - - def is_recommendation_accepted( round_utterances: List[AnnotatedUtterance], acceptance_intents: List[Intent], diff --git a/usersimcrs/evaluation/reward_per_dialogue_length_metric.py b/usersimcrs/evaluation/reward_per_dialogue_length_metric.py index 1727cc32..aca86dc6 100644 --- a/usersimcrs/evaluation/reward_per_dialogue_length_metric.py +++ b/usersimcrs/evaluation/reward_per_dialogue_length_metric.py @@ -3,7 +3,7 @@ Evaluates the ratio of accepted recommendations to total dialogue length. """ -from typing import Any, List, Optional, Tuple +from typing import Any, List, Tuple from dialoguekit.core.dialogue import Dialogue from dialoguekit.core.intent import Intent @@ -15,8 +15,6 @@ class RewardPerDialogueLengthMetric(UtilityBaseMetric): def __init__( self, - user_nlu_config_path: Optional[str] = None, - agent_nlu_config_path: Optional[str] = None, name: str = "reward_per_dialogue_length", ) -> None: """Initializes the reward-per-dialogue-length metric. @@ -26,11 +24,7 @@ def __init__( agent_nlu_config_path: Path to agent NLU configuration. name: Metric name. """ - super().__init__( - name, - user_nlu_config_path=user_nlu_config_path, - agent_nlu_config_path=agent_nlu_config_path, - ) + super().__init__(name) def _assess_dialogue( self, dialogue: Dialogue, acceptance_intents: List[Intent] diff --git a/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py b/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py index 8f79a16b..09295572 100644 --- a/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py +++ b/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py @@ -4,7 +4,7 @@ rounds in a dialogue. """ -from typing import Any, List, Optional, Tuple +from typing import Any, List, Tuple from dialoguekit.core.dialogue import Dialogue from dialoguekit.core.intent import Intent @@ -19,8 +19,6 @@ class SuccessfulRecommendationRoundRatioMetric(UtilityBaseMetric): def __init__( self, - user_nlu_config_path: Optional[str] = None, - agent_nlu_config_path: Optional[str] = None, name: str = "successful_recommendation_round_ratio", ) -> None: """Initializes the successful recommendation round ratio metric. @@ -32,8 +30,6 @@ def __init__( """ super().__init__( name, - user_nlu_config_path=user_nlu_config_path, - agent_nlu_config_path=agent_nlu_config_path, ) def _assess_dialogue( @@ -71,11 +67,11 @@ def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float: dialogue: Dialogue to evaluate. **kwargs: Optional intent label overrides: - recommendation_intent_labels: Labels for recommendation - intents. Defaults to ``["REC-S", "REC-E"]``. + intents. Defaults to ["REC-S", "REC-E"]. - acceptance_intent_labels: Labels for acceptance intents. - Defaults to ``["ACC"]``. + Defaults to ["ACC"]. - rejection_intent_labels: Labels for rejection intents. - Defaults to ``["REJ"]``. + Defaults to ["REJ"]. Returns: Ratio of accepted recommendation rounds to total rounds, diff --git a/usersimcrs/evaluation/utility_base_metric.py b/usersimcrs/evaluation/utility_base_metric.py index 17618b65..c8a53b29 100644 --- a/usersimcrs/evaluation/utility_base_metric.py +++ b/usersimcrs/evaluation/utility_base_metric.py @@ -9,8 +9,9 @@ from usersimcrs.evaluation.base_metric import BaseMetric from usersimcrs.evaluation.dialogue_annotation import ( + annotate_dialogue, get_intent_lists, - prepare_dialogue, + load_nlu, ) @@ -55,20 +56,15 @@ def _resolve_intents( rejection_intents). """ if self._user_nlu_config_path and self._agent_nlu_config_path: - ( - _, - rec, - acc, - rej, - self._user_nlu, - self._agent_nlu, - ) = prepare_dialogue( - dialogue, + self._user_nlu = load_nlu( self._user_nlu_config_path, - self._agent_nlu_config_path, + "User NLU Configuration", self._user_nlu, + ) + self._agent_nlu = load_nlu( + self._agent_nlu_config_path, + "Agent NLU Configuration", self._agent_nlu, - **kwargs, ) - return rec, acc, rej + annotate_dialogue(dialogue, self._user_nlu, self._agent_nlu) return get_intent_lists(**kwargs) From 833500529752461d48f57246b0904c25ec82e40c Mon Sep 17 00:00:00 2001 From: Ksenia Blokhina Date: Tue, 17 Mar 2026 13:23:25 +0100 Subject: [PATCH 30/38] simplyfying --- tests/evaluation/__init__.py | 0 tests/evaluation/test_quality_metric.py | 115 -------------- tests/evaluation/test_satisfaction_metric.py | 80 ---------- tests/evaluation/test_utility_metric.py | 145 ------------------ usersimcrs/evaluation/dialogue_annotation.py | 67 ++++---- .../reward_per_dialogue_length_metric.py | 63 +++----- usersimcrs/evaluation/success_rate_metric.py | 68 ++++---- ...ssful_recommendation_round_ratio_metric.py | 80 ++++------ usersimcrs/evaluation/utility_base.py | 56 +++++++ usersimcrs/evaluation/utility_base_metric.py | 70 --------- 10 files changed, 161 insertions(+), 583 deletions(-) delete mode 100644 tests/evaluation/__init__.py delete mode 100644 tests/evaluation/test_quality_metric.py delete mode 100644 tests/evaluation/test_satisfaction_metric.py delete mode 100644 tests/evaluation/test_utility_metric.py create mode 100644 usersimcrs/evaluation/utility_base.py delete mode 100644 usersimcrs/evaluation/utility_base_metric.py diff --git a/tests/evaluation/__init__.py b/tests/evaluation/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/evaluation/test_quality_metric.py b/tests/evaluation/test_quality_metric.py deleted file mode 100644 index 3fdf40ec..00000000 --- a/tests/evaluation/test_quality_metric.py +++ /dev/null @@ -1,115 +0,0 @@ -"""Tests for QualityMetric.""" - -from unittest.mock import MagicMock - -import pytest - -from dialoguekit.utils.dialogue_reader import json_to_dialogues - -from usersimcrs.evaluation.quality_metric import QualityMetric -from usersimcrs.evaluation.quality_rubrics import QualityRubrics -from usersimcrs.llm_interfaces.llm_interface import LLMInterface - - -@pytest.fixture -def dialogues(): - """Load test dialogues.""" - return json_to_dialogues( - "tests/data/annotated_dialogues.json", - agent_ids=["Agent"], - user_ids=["User"], - ) - - -@pytest.fixture -def mock_llm_interface(): - """Mock LLM interface.""" - return MagicMock(spec=LLMInterface) - - -@pytest.fixture -def quality_metric(mock_llm_interface): - """QualityMetric instance with mocked LLM.""" - return QualityMetric(llm_interface=mock_llm_interface) - - -def test_quality_metric_init(mock_llm_interface) -> None: - """Test QualityMetric initializes with correct name and LLM.""" - metric = QualityMetric(llm_interface=mock_llm_interface) - assert metric.name == "quality" - assert metric.llm_interface is mock_llm_interface - - -def test_quality_metric_custom_name(mock_llm_interface) -> None: - """Test QualityMetric accepts custom name.""" - metric = QualityMetric(llm_interface=mock_llm_interface, name="custom") - assert metric.name == "custom" - - -def test_get_prompt(quality_metric, dialogues) -> None: - """Test _get_prompt builds prompt with dialogue and rubric.""" - dialogue = dialogues[0] - prompt = quality_metric._get_prompt(QualityRubrics.FLUENCY, dialogue) - - assert "CONVERSATION HISTORY" in prompt - assert "USER: Utterance 1" in prompt - assert "ASSISTANT: Utterance 2" in prompt - assert "GRADING RUBRIC" in prompt - assert "Fluency" in prompt - assert '{"score"' in prompt - - -def test_get_prompt_all_aspects(quality_metric, dialogues) -> None: - """Test _get_prompt works for every quality aspect.""" - dialogue = dialogues[0] - for aspect in QualityRubrics: - prompt = quality_metric._get_prompt(aspect, dialogue) - assert aspect.value in prompt - - -def test_evaluate_dialogue_valid_response(quality_metric, dialogues) -> None: - """Test evaluate_dialogue parses a valid LLM JSON response.""" - quality_metric.llm_interface.get_llm_api_response.return_value = ( - '{"score": 4, "score_explanation": "Good fluency."}' - ) - score = quality_metric.evaluate_dialogue(dialogues[0], aspect="FLUENCY") - assert score == 4.0 - - -def test_evaluate_dialogue_all_aspects(quality_metric, dialogues) -> None: - """Test evaluate_dialogue succeeds for each aspect name.""" - quality_metric.llm_interface.get_llm_api_response.return_value = ( - '{"score": 3, "score_explanation": "Average."}' - ) - for aspect in QualityRubrics: - score = quality_metric.evaluate_dialogue( - dialogues[0], aspect=aspect.name - ) - assert score == 3.0 - - -def test_evaluate_dialogue_missing_score_key(quality_metric, dialogues) -> None: - """Test evaluate_dialogue returns 0.0 when 'score' key is missing.""" - quality_metric.llm_interface.get_llm_api_response.return_value = ( - '{"explanation": "No score field."}' - ) - score = quality_metric.evaluate_dialogue(dialogues[0], aspect="FLUENCY") - assert score == 0.0 - - -def test_evaluate_dialogue_unknown_aspect(quality_metric, dialogues) -> None: - """Test evaluate_dialogue raises KeyError for unsupported aspect.""" - with pytest.raises(KeyError, match="Unknown aspect"): - quality_metric.evaluate_dialogue(dialogues[0], aspect="NONEXISTENT") - - -def test_evaluate_dialogues(quality_metric, dialogues) -> None: - """Test evaluate_dialogues returns scores keyed by conversation ID.""" - quality_metric.llm_interface.get_llm_api_response.return_value = ( - '{"score": 5, "score_explanation": "Excellent."}' - ) - results = quality_metric.evaluate_dialogues(dialogues, aspect="OVERALL_SAT") - assert len(results) == len(dialogues) - for dialogue in dialogues: - assert dialogue.conversation_id in results - assert results[dialogue.conversation_id] == 5.0 diff --git a/tests/evaluation/test_satisfaction_metric.py b/tests/evaluation/test_satisfaction_metric.py deleted file mode 100644 index bd857060..00000000 --- a/tests/evaluation/test_satisfaction_metric.py +++ /dev/null @@ -1,80 +0,0 @@ -"""Tests for SatisfactionMetric.""" - -from unittest.mock import MagicMock - -import pytest - -from dialoguekit.nlu.models.satisfaction_classifier import ( - SatisfactionClassifier, -) -from dialoguekit.utils.dialogue_reader import json_to_dialogues - -from usersimcrs.evaluation.satisfaction_metric import SatisfactionMetric - - -@pytest.fixture -def dialogues(): - """Load test dialogues.""" - return json_to_dialogues( - "tests/data/annotated_dialogues.json", - agent_ids=["Agent"], - user_ids=["User"], - ) - - -@pytest.fixture -def mock_classifier(): - """Mock satisfaction classifier.""" - return MagicMock(spec=SatisfactionClassifier) - - -@pytest.fixture -def satisfaction_metric(mock_classifier): - """SatisfactionMetric instance with mocked classifier.""" - return SatisfactionMetric(classifier=mock_classifier) - - -def test_satisfaction_metric_init(mock_classifier) -> None: - """Test SatisfactionMetric initializes with correct name.""" - metric = SatisfactionMetric(classifier=mock_classifier) - assert metric.name == "satisfaction" - assert metric.classifier is mock_classifier - - -def test_satisfaction_metric_custom_name(mock_classifier) -> None: - """Test SatisfactionMetric accepts custom name.""" - metric = SatisfactionMetric(classifier=mock_classifier, name="sat_v2") - assert metric.name == "sat_v2" - - -def test_evaluate_dialogue(satisfaction_metric, dialogues) -> None: - """Test evaluate_dialogue returns classifier score as float.""" - satisfaction_metric.classifier.classify_last_n_dialogue.return_value = 4 - score = satisfaction_metric.evaluate_dialogue(dialogues[0]) - assert score == 4.0 - classify = satisfaction_metric.classifier.classify_last_n_dialogue - classify.assert_called_once_with(dialogues[0], last_n=None) - - -def test_evaluate_dialogue_low_score(satisfaction_metric, dialogues) -> None: - """Test evaluate_dialogue with a low satisfaction score.""" - satisfaction_metric.classifier.classify_last_n_dialogue.return_value = 1 - score = satisfaction_metric.evaluate_dialogue(dialogues[0]) - assert score == 1.0 - - -def test_evaluate_dialogue_float_score(satisfaction_metric, dialogues) -> None: - """Test evaluate_dialogue handles fractional classifier output.""" - satisfaction_metric.classifier.classify_last_n_dialogue.return_value = 3.7 - score = satisfaction_metric.evaluate_dialogue(dialogues[0]) - assert score == pytest.approx(3.7) - - -def test_evaluate_dialogues(satisfaction_metric, dialogues) -> None: - """Test evaluate_dialogues returns scores keyed by conversation ID.""" - satisfaction_metric.classifier.classify_last_n_dialogue.return_value = 3 - results = satisfaction_metric.evaluate_dialogues(dialogues) - assert len(results) == len(dialogues) - for dialogue in dialogues: - assert dialogue.conversation_id in results - assert results[dialogue.conversation_id] == 3.0 diff --git a/tests/evaluation/test_utility_metric.py b/tests/evaluation/test_utility_metric.py deleted file mode 100644 index aff046bc..00000000 --- a/tests/evaluation/test_utility_metric.py +++ /dev/null @@ -1,145 +0,0 @@ -"""Tests for utility metric classes.""" - -from unittest.mock import MagicMock, patch - -import pytest - -from dialoguekit.utils.dialogue_reader import json_to_dialogues - -from usersimcrs.evaluation.reward_per_dialogue_length_metric import ( - RewardPerDialogueLengthMetric, -) -from usersimcrs.evaluation.success_rate_metric import SuccessRateMetric -from usersimcrs.evaluation.successful_recommendation_round_ratio_metric import ( - SuccessfulRecommendationRoundRatioMetric, -) - -_LOAD_NLU_PATH = "usersimcrs.evaluation.utility_base_metric.load_nlu" -_ANNOTATE_PATH = "usersimcrs.evaluation.utility_base_metric.annotate_dialogue" - - -@pytest.fixture -def dialogues(): - """Load test dialogues.""" - return json_to_dialogues( - "tests/data/annotated_dialogues.json", - agent_ids=["Agent"], - user_ids=["User"], - ) - - -@pytest.fixture -def success_rate_metric(): - return SuccessRateMetric( - user_nlu_config_path="dummy_user_nlu.yaml", - agent_nlu_config_path="dummy_agent_nlu.yaml", - ) - - -def test_success_rate_init() -> None: - """Test SuccessRateMetric default and custom name.""" - metric = SuccessRateMetric() - assert metric.name == "success_rate" - - metric = SuccessRateMetric(name="custom_sr") - assert metric.name == "custom_sr" - - -def test_success_rate_evaluate_dialogue( - success_rate_metric: SuccessRateMetric, dialogues -) -> None: - """Test SuccessRateMetric returns 1.0 for accepted dialogue.""" - dialogue = dialogues[0] - mock_nlu = MagicMock() - with ( - patch(_LOAD_NLU_PATH, return_value=mock_nlu), - patch(_ANNOTATE_PATH), - patch.object( - SuccessRateMetric, - "_assess_dialogue", - return_value=True, - ), - ): - assert success_rate_metric.evaluate_dialogue(dialogue) == 1.0 - - -def test_success_rate_without_nlu_paths(dialogues) -> None: - """Test SuccessRateMetric works on pre-annotated dialogues.""" - metric = SuccessRateMetric() - dialogue = dialogues[0] - with patch.object(SuccessRateMetric, "_assess_dialogue", return_value=True): - assert metric.evaluate_dialogue(dialogue) == 1.0 - - -@pytest.fixture -def srrr_metric(): - return SuccessfulRecommendationRoundRatioMetric() - - -def test_srrr_init() -> None: - """Test SRRR metric default and custom name.""" - metric = SuccessfulRecommendationRoundRatioMetric() - assert metric.name == "successful_recommendation_round_ratio" - - metric = SuccessfulRecommendationRoundRatioMetric(name="srrr_v2") - assert metric.name == "srrr_v2" - - -def test_srrr_evaluate_dialogue(srrr_metric, dialogues) -> None: - """Test SRRR returns correct ratio.""" - dialogue = dialogues[0] - with patch.object( - SuccessfulRecommendationRoundRatioMetric, - "_assess_dialogue", - return_value=(1, 2), - ): - assert srrr_metric.evaluate_dialogue(dialogue) == 0.5 - - -def test_srrr_all_rounds_successful(srrr_metric, dialogues) -> None: - """Test SRRR returns 1.0 when all rounds accepted.""" - dialogue = dialogues[0] - with patch.object( - SuccessfulRecommendationRoundRatioMetric, - "_assess_dialogue", - return_value=(3, 3), - ): - assert srrr_metric.evaluate_dialogue(dialogue) == 1.0 - - -def test_srrr_no_successful_rounds(srrr_metric, dialogues) -> None: - """Test SRRR returns 0.0 when no rounds are accepted.""" - dialogue = dialogues[0] - with patch.object( - SuccessfulRecommendationRoundRatioMetric, - "_assess_dialogue", - return_value=(0, 4), - ): - assert srrr_metric.evaluate_dialogue(dialogue) == 0.0 - - -@pytest.fixture -def rdl_metric(): - return RewardPerDialogueLengthMetric() - - -def test_rdl_no_accepted(rdl_metric, dialogues) -> None: - """Test RDL returns 0.0 when no recommendations accepted.""" - dialogue = dialogues[0] - with patch.object( - RewardPerDialogueLengthMetric, - "_assess_dialogue", - return_value=(0, 7), - ): - assert rdl_metric.evaluate_dialogue(dialogue) == 0.0 - - -def test_rdl_multiple_accepted(rdl_metric, dialogues) -> None: - """Test RDL with several accepted recommendations.""" - dialogue = dialogues[0] - with patch.object( - RewardPerDialogueLengthMetric, - "_assess_dialogue", - return_value=(3, 10), - ): - assert rdl_metric.evaluate_dialogue(dialogue) == pytest.approx(0.3) diff --git a/usersimcrs/evaluation/dialogue_annotation.py b/usersimcrs/evaluation/dialogue_annotation.py index 4d4ec9c6..fbefe441 100644 --- a/usersimcrs/evaluation/dialogue_annotation.py +++ b/usersimcrs/evaluation/dialogue_annotation.py @@ -1,11 +1,11 @@ """Dialogue annotation and recommendation round utilities. Provides functions for annotating dialogues with dialogue acts using NLU -modules, parsing intent labels, and extracting recommendation rounds from -annotated dialogues. +modules, extracting recommendation rounds from annotated dialogues, and +assessing recommendation acceptance. """ -from typing import Any, List, Optional, Tuple +from typing import Dict, List, Optional, Sequence, Tuple from confuse import Configuration @@ -18,6 +18,27 @@ from usersimcrs.utils.simulation_utils import get_NLU +_intent_cache: Dict[Tuple[str, ...], List[Intent]] = {} + + +def resolve_intents( + labels: Optional[Sequence[str]], defaults: List[str] +) -> List[Intent]: + """Resolves optional label overrides to a cached list of Intents. + + Args: + labels: Custom labels or None to use defaults. + defaults: Default label strings. + + Returns: + Cached list of Intent objects. + """ + key = tuple(labels if labels is not None else defaults) + if key not in _intent_cache: + _intent_cache[key] = [Intent(label) for label in key] + return _intent_cache[key] + + def annotate_dialogue( dialogue: Dialogue, user_nlu: NLU, agent_nlu: NLU ) -> Dialogue: @@ -62,56 +83,22 @@ def annotate_dialogue( def load_nlu( nlu_config_path: str, config_name: str = "NLU Configuration", - cached_nlu: Optional[NLU] = None, ) -> NLU: - """Loads a single NLU module. - - Returns the cached instance when provided, otherwise creates a new one - from the given configuration file. + """Loads a single NLU module from the given configuration file. Args: nlu_config_path: Path to the NLU configuration file. - config_name: Name for the Configuration instance. - cached_nlu: Previously loaded NLU module. + config_name: Name for the Configuration instance. Defaults to + ``"NLU Configuration"``. Returns: NLU module. """ - if cached_nlu is not None: - return cached_nlu nlu_config = Configuration(config_name) nlu_config.set_file(nlu_config_path) return get_NLU(nlu_config) -def get_intent_lists( - **kwargs: Any, -) -> Tuple[List[Intent], List[Intent], List[Intent]]: - """Builds recommendation, acceptance, and rejection intent lists. - - Args: - **kwargs: Optional intent label overrides: - - recommendation_intent_labels: Labels for recommendation intents. - Defaults to ``["REC-S", "REC-E"]``. - - acceptance_intent_labels: Labels for acceptance intents. - Defaults to ``["ACC"]``. - - rejection_intent_labels: Labels for rejection intents. - Defaults to ``["REJ"]``. - - Returns: - Tuple of (recommendation_intents, acceptance_intents, - rejection_intents). - """ - rec_labels = kwargs.get("recommendation_intent_labels", ["REC-S", "REC-E"]) - acc_labels = kwargs.get("acceptance_intent_labels", ["ACC"]) - rej_labels = kwargs.get("rejection_intent_labels", ["REJ"]) - return ( - [Intent(label) for label in rec_labels], - [Intent(label) for label in acc_labels], - [Intent(label) for label in rej_labels], - ) - - def annotate_dialogues( dialogues: List[Dialogue], user_nlu_config_path: str, diff --git a/usersimcrs/evaluation/reward_per_dialogue_length_metric.py b/usersimcrs/evaluation/reward_per_dialogue_length_metric.py index aca86dc6..e28de982 100644 --- a/usersimcrs/evaluation/reward_per_dialogue_length_metric.py +++ b/usersimcrs/evaluation/reward_per_dialogue_length_metric.py @@ -3,16 +3,18 @@ Evaluates the ratio of accepted recommendations to total dialogue length. """ -from typing import Any, List, Tuple +from typing import Any, List, Optional from dialoguekit.core.dialogue import Dialogue -from dialoguekit.core.intent import Intent from dialoguekit.participant.participant import DialogueParticipant +from usersimcrs.evaluation.dialogue_annotation import ( + resolve_intents, +) -from usersimcrs.evaluation.utility_base_metric import UtilityBaseMetric +from usersimcrs.evaluation.utility_base import DEFAULT_ACC_LABELS, UtilityBase -class RewardPerDialogueLengthMetric(UtilityBaseMetric): +class RewardPerDialogueLengthMetric(UtilityBase): def __init__( self, name: str = "reward_per_dialogue_length", @@ -20,53 +22,32 @@ def __init__( """Initializes the reward-per-dialogue-length metric. Args: - user_nlu_config_path: Path to user NLU configuration. - agent_nlu_config_path: Path to agent NLU configuration. name: Metric name. """ super().__init__(name) - def _assess_dialogue( - self, dialogue: Dialogue, acceptance_intents: List[Intent] - ) -> Tuple[int, int]: - """Returns accepted recommendations and dialogue length. - - Args: - dialogue: Annotated dialogue. - acceptance_intents: Intents that signal acceptance. - - Returns: - Tuple of (accepted_recommendations, dialogue_length). - """ - nb_accepted_recommendations = sum( - 1 - for utterance in dialogue.utterances - if utterance.participant == DialogueParticipant.USER - and any( - intent in acceptance_intents - for intent in utterance.get_intents() - ) - ) - return nb_accepted_recommendations, len(dialogue.utterances) - - def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float: + def evaluate_dialogue( + self, + dialogue: Dialogue, + acceptance_intent_labels: Optional[List[str]] = None, + **kwargs: Any, + ) -> float: """Computes the reward-per-dialogue-length score. Args: dialogue: Dialogue to evaluate. - **kwargs: Optional intent label overrides: - - recommendation_intent_labels: Labels for recommendation - intents. Defaults to ["REC-S", "REC-E"]. - - acceptance_intent_labels: Labels for acceptance intents. - Defaults to ["ACC"]. - - rejection_intent_labels: Labels for rejection intents. - Defaults to ["REJ"]. + acceptance_intent_labels: Labels for acceptance intents. + Defaults to ``["ACC"]``. Returns: Ratio of accepted recommendations to total utterances. """ - _, acc, _ = self._resolve_intents(dialogue=dialogue, **kwargs) - nb_accepted, dialogue_length = self._assess_dialogue( - dialogue=dialogue, acceptance_intents=acc + self._annotate_if_needed(dialogue) + acc = resolve_intents(acceptance_intent_labels, DEFAULT_ACC_LABELS) + nb_accepted = sum( + 1 + for utterance in dialogue.utterances + if utterance.participant == DialogueParticipant.USER + and any(intent in acc for intent in utterance.get_intents()) ) - return nb_accepted / dialogue_length + return nb_accepted / len(dialogue.utterances) diff --git a/usersimcrs/evaluation/success_rate_metric.py b/usersimcrs/evaluation/success_rate_metric.py index e0bb9566..f9926206 100644 --- a/usersimcrs/evaluation/success_rate_metric.py +++ b/usersimcrs/evaluation/success_rate_metric.py @@ -6,16 +6,21 @@ from typing import Any, List, Optional from dialoguekit.core.dialogue import Dialogue -from dialoguekit.core.intent import Intent from usersimcrs.evaluation.dialogue_annotation import ( get_recommendation_rounds, is_recommendation_accepted, + resolve_intents, +) +from usersimcrs.evaluation.utility_base import ( + DEFAULT_ACC_LABELS, + DEFAULT_REC_LABELS, + DEFAULT_REJ_LABELS, + UtilityBase, ) -from usersimcrs.evaluation.utility_base_metric import UtilityBaseMetric -class SuccessRateMetric(UtilityBaseMetric): +class SuccessRateMetric(UtilityBase): def __init__( self, user_nlu_config_path: Optional[str] = None, @@ -35,54 +40,33 @@ def __init__( agent_nlu_config_path=agent_nlu_config_path, ) - def _assess_dialogue( + def evaluate_dialogue( self, dialogue: Dialogue, - recommendation_intents: List[Intent], - acceptance_intents: List[Intent], - rejection_intents: List[Intent], - ) -> bool: - """Checks whether at least one recommendation round was accepted. - - Args: - dialogue: Annotated dialogue. - recommendation_intents: Intents that signal a recommendation. - acceptance_intents: Intents that signal acceptance. - rejection_intents: Intents that signal rejection. - - Returns: - True if at least one round was accepted, False otherwise. - """ - rounds = get_recommendation_rounds(dialogue, recommendation_intents) - return any( - is_recommendation_accepted( - round_utterances, acceptance_intents, rejection_intents - ) - for round_utterances in rounds - ) - - def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float: + recommendation_intent_labels: Optional[List[str]] = None, + acceptance_intent_labels: Optional[List[str]] = None, + rejection_intent_labels: Optional[List[str]] = None, + **kwargs: Any, + ) -> float: """Computes the success rate for a single dialogue. Args: dialogue: Dialogue to evaluate. - **kwargs: Optional intent label overrides: - - recommendation_intent_labels: Labels for recommendation - intents. Defaults to ["REC-S", "REC-E"]. - - acceptance_intent_labels: Labels for acceptance intents. - Defaults to ["ACC"]. - - rejection_intent_labels: Labels for rejection intents. - Defaults to ["REJ"]. + recommendation_intent_labels: Labels for recommendation intents. + Defaults to ``["REC-S", "REC-E"]``. + acceptance_intent_labels: Labels for acceptance intents. + Defaults to ``["ACC"]``. + rejection_intent_labels: Labels for rejection intents. + Defaults to ``["REJ"]``. Returns: 1.0 if at least one recommendation was accepted, 0.0 otherwise. """ - rec, acc, rej = self._resolve_intents(dialogue=dialogue, **kwargs) + self._annotate_if_needed(dialogue) + rec = resolve_intents(recommendation_intent_labels, DEFAULT_REC_LABELS) + acc = resolve_intents(acceptance_intent_labels, DEFAULT_ACC_LABELS) + rej = resolve_intents(rejection_intent_labels, DEFAULT_REJ_LABELS) + rounds = get_recommendation_rounds(dialogue, rec) return float( - self._assess_dialogue( - dialogue=dialogue, - recommendation_intents=rec, - acceptance_intents=acc, - rejection_intents=rej, - ) + any(is_recommendation_accepted(r, acc, rej) for r in rounds) ) diff --git a/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py b/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py index 09295572..471527fe 100644 --- a/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py +++ b/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py @@ -4,19 +4,24 @@ rounds in a dialogue. """ -from typing import Any, List, Tuple +from typing import Any, List, Optional from dialoguekit.core.dialogue import Dialogue -from dialoguekit.core.intent import Intent from usersimcrs.evaluation.dialogue_annotation import ( get_recommendation_rounds, is_recommendation_accepted, + resolve_intents, +) +from usersimcrs.evaluation.utility_base import ( + DEFAULT_ACC_LABELS, + DEFAULT_REC_LABELS, + DEFAULT_REJ_LABELS, + UtilityBase, ) -from usersimcrs.evaluation.utility_base_metric import UtilityBaseMetric -class SuccessfulRecommendationRoundRatioMetric(UtilityBaseMetric): +class SuccessfulRecommendationRoundRatioMetric(UtilityBase): def __init__( self, name: str = "successful_recommendation_round_ratio", @@ -24,64 +29,39 @@ def __init__( """Initializes the successful recommendation round ratio metric. Args: - user_nlu_config_path: Path to user NLU configuration. - agent_nlu_config_path: Path to agent NLU configuration. name: Metric name. """ - super().__init__( - name, - ) + super().__init__(name) - def _assess_dialogue( + def evaluate_dialogue( self, dialogue: Dialogue, - recommendation_intents: List[Intent], - acceptance_intents: List[Intent], - rejection_intents: List[Intent], - ) -> Tuple[int, int]: - """Returns successful and total recommendation rounds. - - Args: - dialogue: Annotated dialogue. - recommendation_intents: Intents that signal a recommendation. - acceptance_intents: Intents that signal acceptance. - rejection_intents: Intents that signal rejection. - - Returns: - Tuple of (successful_rounds, total_rounds). - """ - rounds = get_recommendation_rounds(dialogue, recommendation_intents) - successful_rounds = sum( - 1 - for round_utterances in rounds - if is_recommendation_accepted( - round_utterances, acceptance_intents, rejection_intents - ) - ) - return successful_rounds, len(rounds) - - def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float: + recommendation_intent_labels: Optional[List[str]] = None, + acceptance_intent_labels: Optional[List[str]] = None, + rejection_intent_labels: Optional[List[str]] = None, + **kwargs: Any, + ) -> float: """Computes the successful recommendation round ratio. Args: dialogue: Dialogue to evaluate. - **kwargs: Optional intent label overrides: - - recommendation_intent_labels: Labels for recommendation - intents. Defaults to ["REC-S", "REC-E"]. - - acceptance_intent_labels: Labels for acceptance intents. - Defaults to ["ACC"]. - - rejection_intent_labels: Labels for rejection intents. - Defaults to ["REJ"]. + recommendation_intent_labels: Labels for recommendation intents. + Defaults to ``["REC-S", "REC-E"]``. + acceptance_intent_labels: Labels for acceptance intents. + Defaults to ``["ACC"]``. + rejection_intent_labels: Labels for rejection intents. + Defaults to ``["REJ"]``. Returns: Ratio of accepted recommendation rounds to total rounds, or 0.0 if there are no recommendation rounds. """ - rec, acc, rej = self._resolve_intents(dialogue=dialogue, **kwargs) - successful_rounds, total_rounds = self._assess_dialogue( - dialogue=dialogue, - recommendation_intents=rec, - acceptance_intents=acc, - rejection_intents=rej, + self._annotate_if_needed(dialogue) + rec = resolve_intents(recommendation_intent_labels, DEFAULT_REC_LABELS) + acc = resolve_intents(acceptance_intent_labels, DEFAULT_ACC_LABELS) + rej = resolve_intents(rejection_intent_labels, DEFAULT_REJ_LABELS) + rounds = get_recommendation_rounds(dialogue, rec) + successful = sum( + 1 for r in rounds if is_recommendation_accepted(r, acc, rej) ) - return successful_rounds / total_rounds if total_rounds > 0 else 0.0 + return successful / len(rounds) if rounds else 0.0 diff --git a/usersimcrs/evaluation/utility_base.py b/usersimcrs/evaluation/utility_base.py new file mode 100644 index 00000000..7f87a754 --- /dev/null +++ b/usersimcrs/evaluation/utility_base.py @@ -0,0 +1,56 @@ +"""Base class for utility-centric dialogue evaluation metrics. + +Provides shared NLU loading, and dialogue annotation. +""" + +from abc import ABC +from typing import Optional + +from dialoguekit.core.dialogue import Dialogue +from dialoguekit.nlu.nlu import NLU + +from usersimcrs.evaluation.base_metric import BaseMetric +from usersimcrs.evaluation.dialogue_annotation import ( + annotate_dialogue, + load_nlu, +) + +DEFAULT_REC_LABELS = ["REC-S", "REC-E"] +DEFAULT_ACC_LABELS = ["ACC"] +DEFAULT_REJ_LABELS = ["REJ"] + + +class UtilityBase(BaseMetric, ABC): + def __init__( + self, + name: str, + user_nlu_config_path: Optional[str] = None, + agent_nlu_config_path: Optional[str] = None, + ) -> None: + """Initializes the utility metric. + + Args: + name: Metric name. + user_nlu_config_path: Path to user NLU configuration. + agent_nlu_config_path: Path to agent NLU configuration. + """ + super().__init__(name) + self._user_nlu_config_path = user_nlu_config_path + self._agent_nlu_config_path = agent_nlu_config_path + self._user_nlu: Optional[NLU] = None + self._agent_nlu: Optional[NLU] = None + + def _annotate_if_needed(self, dialogue: Dialogue) -> None: + """Annotates the dialogue with NLU if config paths are set.""" + if self._user_nlu_config_path and self._agent_nlu_config_path: + if self._user_nlu is None: + self._user_nlu = load_nlu( + self._user_nlu_config_path, + "User NLU Configuration", + ) + if self._agent_nlu is None: + self._agent_nlu = load_nlu( + self._agent_nlu_config_path, + "Agent NLU Configuration", + ) + annotate_dialogue(dialogue, self._user_nlu, self._agent_nlu) diff --git a/usersimcrs/evaluation/utility_base_metric.py b/usersimcrs/evaluation/utility_base_metric.py deleted file mode 100644 index c8a53b29..00000000 --- a/usersimcrs/evaluation/utility_base_metric.py +++ /dev/null @@ -1,70 +0,0 @@ -"""Base class for dialogue annotation support.""" - -from abc import ABC -from typing import Any, List, Optional, Tuple - -from dialoguekit.core.dialogue import Dialogue -from dialoguekit.core.intent import Intent -from dialoguekit.nlu.nlu import NLU - -from usersimcrs.evaluation.base_metric import BaseMetric -from usersimcrs.evaluation.dialogue_annotation import ( - annotate_dialogue, - get_intent_lists, - load_nlu, -) - - -class UtilityBaseMetric(BaseMetric, ABC): - """Shared base for metrics that optionally annotate dialogues via NLU. - - When NLU config paths are provided, dialogues are annotated automatically. - When omitted, dialogues must be pre-annotated. - """ - - def __init__( - self, - name: str, - user_nlu_config_path: Optional[str] = None, - agent_nlu_config_path: Optional[str] = None, - ) -> None: - """Initializes the utility metric. - - Args: - name: Metric name. - user_nlu_config_path: Path to user NLU configuration. - agent_nlu_config_path: Path to agent NLU configuration. - """ - super().__init__(name) - self._user_nlu_config_path = user_nlu_config_path - self._agent_nlu_config_path = agent_nlu_config_path - self._user_nlu: Optional[NLU] = None - self._agent_nlu: Optional[NLU] = None - - def _resolve_intents( - self, dialogue: Dialogue, **kwargs: Any - ) -> Tuple[List[Intent], List[Intent], List[Intent]]: - """Annotates the dialogue (if NLU paths are set) and returns intents. - - Args: - dialogue: Dialogue to prepare. - **kwargs: Optional intent label overrides forwarded to - :func:`get_intent_lists`. - - Returns: - Tuple of (recommendation_intents, acceptance_intents, - rejection_intents). - """ - if self._user_nlu_config_path and self._agent_nlu_config_path: - self._user_nlu = load_nlu( - self._user_nlu_config_path, - "User NLU Configuration", - self._user_nlu, - ) - self._agent_nlu = load_nlu( - self._agent_nlu_config_path, - "Agent NLU Configuration", - self._agent_nlu, - ) - annotate_dialogue(dialogue, self._user_nlu, self._agent_nlu) - return get_intent_lists(**kwargs) From f006930c957aa8e33f4e21702fb3b48bdd5b7726 Mon Sep 17 00:00:00 2001 From: Ksenia Blokhina Date: Tue, 17 Mar 2026 13:32:37 +0100 Subject: [PATCH 31/38] remove utility class --- usersimcrs/evaluation/dialogue_annotation.py | 36 ++++++++++++ .../reward_per_dialogue_length_metric.py | 8 +-- usersimcrs/evaluation/success_rate_metric.py | 27 ++++----- ...ssful_recommendation_round_ratio_metric.py | 13 ++--- usersimcrs/evaluation/utility_base.py | 56 ------------------- 5 files changed, 59 insertions(+), 81 deletions(-) delete mode 100644 usersimcrs/evaluation/utility_base.py diff --git a/usersimcrs/evaluation/dialogue_annotation.py b/usersimcrs/evaluation/dialogue_annotation.py index fbefe441..12e4a573 100644 --- a/usersimcrs/evaluation/dialogue_annotation.py +++ b/usersimcrs/evaluation/dialogue_annotation.py @@ -18,7 +18,12 @@ from usersimcrs.utils.simulation_utils import get_NLU +DEFAULT_REC_LABELS = ["REC-S", "REC-E"] +DEFAULT_ACC_LABELS = ["ACC"] +DEFAULT_REJ_LABELS = ["REJ"] + _intent_cache: Dict[Tuple[str, ...], List[Intent]] = {} +_nlu_cache: Dict[str, NLU] = {} def resolve_intents( @@ -39,6 +44,37 @@ def resolve_intents( return _intent_cache[key] +def annotate_if_needed( + dialogue: Dialogue, + user_nlu_config_path: Optional[str] = None, + agent_nlu_config_path: Optional[str] = None, +) -> None: + """Annotates the dialogue with NLU if config paths are provided. + + NLU modules are loaded lazily and cached by config path. + + Args: + dialogue: Dialogue to annotate. + user_nlu_config_path: Path to user NLU configuration. + agent_nlu_config_path: Path to agent NLU configuration. + """ + if not user_nlu_config_path or not agent_nlu_config_path: + return + if user_nlu_config_path not in _nlu_cache: + _nlu_cache[user_nlu_config_path] = load_nlu( + user_nlu_config_path, "User NLU Configuration" + ) + if agent_nlu_config_path not in _nlu_cache: + _nlu_cache[agent_nlu_config_path] = load_nlu( + agent_nlu_config_path, "Agent NLU Configuration" + ) + annotate_dialogue( + dialogue, + _nlu_cache[user_nlu_config_path], + _nlu_cache[agent_nlu_config_path], + ) + + def annotate_dialogue( dialogue: Dialogue, user_nlu: NLU, agent_nlu: NLU ) -> Dialogue: diff --git a/usersimcrs/evaluation/reward_per_dialogue_length_metric.py b/usersimcrs/evaluation/reward_per_dialogue_length_metric.py index e28de982..1fcb30b1 100644 --- a/usersimcrs/evaluation/reward_per_dialogue_length_metric.py +++ b/usersimcrs/evaluation/reward_per_dialogue_length_metric.py @@ -7,14 +7,15 @@ from dialoguekit.core.dialogue import Dialogue from dialoguekit.participant.participant import DialogueParticipant + +from usersimcrs.evaluation.base_metric import BaseMetric from usersimcrs.evaluation.dialogue_annotation import ( + DEFAULT_ACC_LABELS, resolve_intents, ) -from usersimcrs.evaluation.utility_base import DEFAULT_ACC_LABELS, UtilityBase - -class RewardPerDialogueLengthMetric(UtilityBase): +class RewardPerDialogueLengthMetric(BaseMetric): def __init__( self, name: str = "reward_per_dialogue_length", @@ -42,7 +43,6 @@ def evaluate_dialogue( Returns: Ratio of accepted recommendations to total utterances. """ - self._annotate_if_needed(dialogue) acc = resolve_intents(acceptance_intent_labels, DEFAULT_ACC_LABELS) nb_accepted = sum( 1 diff --git a/usersimcrs/evaluation/success_rate_metric.py b/usersimcrs/evaluation/success_rate_metric.py index f9926206..4d6e9d86 100644 --- a/usersimcrs/evaluation/success_rate_metric.py +++ b/usersimcrs/evaluation/success_rate_metric.py @@ -7,20 +7,19 @@ from dialoguekit.core.dialogue import Dialogue +from usersimcrs.evaluation.base_metric import BaseMetric from usersimcrs.evaluation.dialogue_annotation import ( - get_recommendation_rounds, - is_recommendation_accepted, - resolve_intents, -) -from usersimcrs.evaluation.utility_base import ( DEFAULT_ACC_LABELS, DEFAULT_REC_LABELS, DEFAULT_REJ_LABELS, - UtilityBase, + annotate_if_needed, + get_recommendation_rounds, + is_recommendation_accepted, + resolve_intents, ) -class SuccessRateMetric(UtilityBase): +class SuccessRateMetric(BaseMetric): def __init__( self, user_nlu_config_path: Optional[str] = None, @@ -34,11 +33,9 @@ def __init__( agent_nlu_config_path: Path to agent NLU configuration. name: Metric name. """ - super().__init__( - name, - user_nlu_config_path=user_nlu_config_path, - agent_nlu_config_path=agent_nlu_config_path, - ) + super().__init__(name) + self._user_nlu_config_path = user_nlu_config_path + self._agent_nlu_config_path = agent_nlu_config_path def evaluate_dialogue( self, @@ -62,7 +59,11 @@ def evaluate_dialogue( Returns: 1.0 if at least one recommendation was accepted, 0.0 otherwise. """ - self._annotate_if_needed(dialogue) + annotate_if_needed( + dialogue, + self._user_nlu_config_path, + self._agent_nlu_config_path, + ) rec = resolve_intents(recommendation_intent_labels, DEFAULT_REC_LABELS) acc = resolve_intents(acceptance_intent_labels, DEFAULT_ACC_LABELS) rej = resolve_intents(rejection_intent_labels, DEFAULT_REJ_LABELS) diff --git a/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py b/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py index 471527fe..c5fe6fe6 100644 --- a/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py +++ b/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py @@ -8,20 +8,18 @@ from dialoguekit.core.dialogue import Dialogue +from usersimcrs.evaluation.base_metric import BaseMetric from usersimcrs.evaluation.dialogue_annotation import ( - get_recommendation_rounds, - is_recommendation_accepted, - resolve_intents, -) -from usersimcrs.evaluation.utility_base import ( DEFAULT_ACC_LABELS, DEFAULT_REC_LABELS, DEFAULT_REJ_LABELS, - UtilityBase, + get_recommendation_rounds, + is_recommendation_accepted, + resolve_intents, ) -class SuccessfulRecommendationRoundRatioMetric(UtilityBase): +class SuccessfulRecommendationRoundRatioMetric(BaseMetric): def __init__( self, name: str = "successful_recommendation_round_ratio", @@ -56,7 +54,6 @@ def evaluate_dialogue( Ratio of accepted recommendation rounds to total rounds, or 0.0 if there are no recommendation rounds. """ - self._annotate_if_needed(dialogue) rec = resolve_intents(recommendation_intent_labels, DEFAULT_REC_LABELS) acc = resolve_intents(acceptance_intent_labels, DEFAULT_ACC_LABELS) rej = resolve_intents(rejection_intent_labels, DEFAULT_REJ_LABELS) diff --git a/usersimcrs/evaluation/utility_base.py b/usersimcrs/evaluation/utility_base.py deleted file mode 100644 index 7f87a754..00000000 --- a/usersimcrs/evaluation/utility_base.py +++ /dev/null @@ -1,56 +0,0 @@ -"""Base class for utility-centric dialogue evaluation metrics. - -Provides shared NLU loading, and dialogue annotation. -""" - -from abc import ABC -from typing import Optional - -from dialoguekit.core.dialogue import Dialogue -from dialoguekit.nlu.nlu import NLU - -from usersimcrs.evaluation.base_metric import BaseMetric -from usersimcrs.evaluation.dialogue_annotation import ( - annotate_dialogue, - load_nlu, -) - -DEFAULT_REC_LABELS = ["REC-S", "REC-E"] -DEFAULT_ACC_LABELS = ["ACC"] -DEFAULT_REJ_LABELS = ["REJ"] - - -class UtilityBase(BaseMetric, ABC): - def __init__( - self, - name: str, - user_nlu_config_path: Optional[str] = None, - agent_nlu_config_path: Optional[str] = None, - ) -> None: - """Initializes the utility metric. - - Args: - name: Metric name. - user_nlu_config_path: Path to user NLU configuration. - agent_nlu_config_path: Path to agent NLU configuration. - """ - super().__init__(name) - self._user_nlu_config_path = user_nlu_config_path - self._agent_nlu_config_path = agent_nlu_config_path - self._user_nlu: Optional[NLU] = None - self._agent_nlu: Optional[NLU] = None - - def _annotate_if_needed(self, dialogue: Dialogue) -> None: - """Annotates the dialogue with NLU if config paths are set.""" - if self._user_nlu_config_path and self._agent_nlu_config_path: - if self._user_nlu is None: - self._user_nlu = load_nlu( - self._user_nlu_config_path, - "User NLU Configuration", - ) - if self._agent_nlu is None: - self._agent_nlu = load_nlu( - self._agent_nlu_config_path, - "Agent NLU Configuration", - ) - annotate_dialogue(dialogue, self._user_nlu, self._agent_nlu) From e0f3c56e8fa9bc93c7b6ea9c99c5e3996aae844d Mon Sep 17 00:00:00 2001 From: Ksenia Blokhina Date: Tue, 17 Mar 2026 14:48:06 +0100 Subject: [PATCH 32/38] fix get_recommendation_rounds --- usersimcrs/evaluation/dialogue_annotation.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/usersimcrs/evaluation/dialogue_annotation.py b/usersimcrs/evaluation/dialogue_annotation.py index 12e4a573..c6b4ec3c 100644 --- a/usersimcrs/evaluation/dialogue_annotation.py +++ b/usersimcrs/evaluation/dialogue_annotation.py @@ -170,16 +170,21 @@ def get_recommendation_rounds( """ rounds: List[List[AnnotatedUtterance]] = [] current_round: List[AnnotatedUtterance] = [] + in_round = False for utterance in dialogue.utterances: if any( intent in utterance.get_intents() for intent in recommendation_intents ): - if current_round: + if in_round and current_round: rounds.append(current_round) current_round = [utterance] + in_round = True else: - current_round.append(utterance) + if in_round: + current_round.append(utterance) + if in_round and current_round: + rounds.append(current_round) return rounds From 4157fdc56a2b7d0d5d585ec4e5c08cca44f84308 Mon Sep 17 00:00:00 2001 From: Ksenia Blokhina Date: Tue, 17 Mar 2026 17:14:08 +0100 Subject: [PATCH 33/38] fixes --- usersimcrs/evaluation/dialogue_annotation.py | 24 +++++++--- .../reward_per_dialogue_length_metric.py | 20 ++++---- usersimcrs/evaluation/success_rate_metric.py | 48 +++++++------------ ...ssful_recommendation_round_ratio_metric.py | 38 +++++++-------- 4 files changed, 64 insertions(+), 66 deletions(-) diff --git a/usersimcrs/evaluation/dialogue_annotation.py b/usersimcrs/evaluation/dialogue_annotation.py index c6b4ec3c..0039594f 100644 --- a/usersimcrs/evaluation/dialogue_annotation.py +++ b/usersimcrs/evaluation/dialogue_annotation.py @@ -44,6 +44,21 @@ def resolve_intents( return _intent_cache[key] +DEFAULT_REC_INTENTS: List[Intent] = resolve_intents(None, DEFAULT_REC_LABELS) +DEFAULT_ACC_INTENTS: List[Intent] = resolve_intents(None, DEFAULT_ACC_LABELS) +DEFAULT_REJ_INTENTS: List[Intent] = resolve_intents(None, DEFAULT_REJ_LABELS) + + +def ensure_dialogue_is_annotated(dialogue: Dialogue) -> None: + """Raises if a dialogue is not annotated with annotated utterances.""" + for utterance in dialogue.utterances: + if not isinstance(utterance, AnnotatedUtterance): + raise ValueError( + "Dialogue must be annotated (utterances must be " + "`AnnotatedUtterance`)." + ) + + def annotate_if_needed( dialogue: Dialogue, user_nlu_config_path: Optional[str] = None, @@ -170,20 +185,17 @@ def get_recommendation_rounds( """ rounds: List[List[AnnotatedUtterance]] = [] current_round: List[AnnotatedUtterance] = [] - in_round = False for utterance in dialogue.utterances: if any( intent in utterance.get_intents() for intent in recommendation_intents ): - if in_round and current_round: + if current_round: rounds.append(current_round) current_round = [utterance] - in_round = True else: - if in_round: - current_round.append(utterance) - if in_round and current_round: + current_round.append(utterance) + if current_round: rounds.append(current_round) return rounds diff --git a/usersimcrs/evaluation/reward_per_dialogue_length_metric.py b/usersimcrs/evaluation/reward_per_dialogue_length_metric.py index 1fcb30b1..c08881b1 100644 --- a/usersimcrs/evaluation/reward_per_dialogue_length_metric.py +++ b/usersimcrs/evaluation/reward_per_dialogue_length_metric.py @@ -3,15 +3,15 @@ Evaluates the ratio of accepted recommendations to total dialogue length. """ -from typing import Any, List, Optional +from typing import Any, List from dialoguekit.core.dialogue import Dialogue +from dialoguekit.core.intent import Intent from dialoguekit.participant.participant import DialogueParticipant from usersimcrs.evaluation.base_metric import BaseMetric from usersimcrs.evaluation.dialogue_annotation import ( - DEFAULT_ACC_LABELS, - resolve_intents, + ensure_dialogue_is_annotated, ) @@ -23,31 +23,33 @@ def __init__( """Initializes the reward-per-dialogue-length metric. Args: - name: Metric name. + name: Metric name. Defaults to "reward_per_dialogue_length". """ super().__init__(name) def evaluate_dialogue( self, dialogue: Dialogue, - acceptance_intent_labels: Optional[List[str]] = None, + acceptance_intents: List[Intent], **kwargs: Any, ) -> float: """Computes the reward-per-dialogue-length score. Args: dialogue: Dialogue to evaluate. - acceptance_intent_labels: Labels for acceptance intents. - Defaults to ``["ACC"]``. + acceptance_intents: Acceptance intents (e.g., ``[Intent("ACC")]``). Returns: Ratio of accepted recommendations to total utterances. """ - acc = resolve_intents(acceptance_intent_labels, DEFAULT_ACC_LABELS) + ensure_dialogue_is_annotated(dialogue) nb_accepted = sum( 1 for utterance in dialogue.utterances if utterance.participant == DialogueParticipant.USER - and any(intent in acc for intent in utterance.get_intents()) + and any( + intent in acceptance_intents + for intent in utterance.get_intents() + ) ) return nb_accepted / len(dialogue.utterances) diff --git a/usersimcrs/evaluation/success_rate_metric.py b/usersimcrs/evaluation/success_rate_metric.py index 4d6e9d86..9d3dd3bd 100644 --- a/usersimcrs/evaluation/success_rate_metric.py +++ b/usersimcrs/evaluation/success_rate_metric.py @@ -3,71 +3,57 @@ Evaluates whether at least one recommendation was accepted during a dialogue. """ -from typing import Any, List, Optional +from typing import Any, List from dialoguekit.core.dialogue import Dialogue +from dialoguekit.core.intent import Intent from usersimcrs.evaluation.base_metric import BaseMetric from usersimcrs.evaluation.dialogue_annotation import ( - DEFAULT_ACC_LABELS, - DEFAULT_REC_LABELS, - DEFAULT_REJ_LABELS, - annotate_if_needed, + ensure_dialogue_is_annotated, get_recommendation_rounds, is_recommendation_accepted, - resolve_intents, ) class SuccessRateMetric(BaseMetric): def __init__( self, - user_nlu_config_path: Optional[str] = None, - agent_nlu_config_path: Optional[str] = None, name: str = "success_rate", ) -> None: """Initializes the success rate metric. Args: - user_nlu_config_path: Path to user NLU configuration. - agent_nlu_config_path: Path to agent NLU configuration. name: Metric name. """ super().__init__(name) - self._user_nlu_config_path = user_nlu_config_path - self._agent_nlu_config_path = agent_nlu_config_path def evaluate_dialogue( self, dialogue: Dialogue, - recommendation_intent_labels: Optional[List[str]] = None, - acceptance_intent_labels: Optional[List[str]] = None, - rejection_intent_labels: Optional[List[str]] = None, + recommendation_intents: List[Intent], + acceptance_intents: List[Intent], + rejection_intents: List[Intent], **kwargs: Any, ) -> float: """Computes the success rate for a single dialogue. Args: dialogue: Dialogue to evaluate. - recommendation_intent_labels: Labels for recommendation intents. - Defaults to ``["REC-S", "REC-E"]``. - acceptance_intent_labels: Labels for acceptance intents. - Defaults to ``["ACC"]``. - rejection_intent_labels: Labels for rejection intents. - Defaults to ``["REJ"]``. + recommendation_intents: Intents that indicate recommendation. + acceptance_intents: Intents that indicate acceptance. + rejection_intents: Intents that indicate rejection. Returns: 1.0 if at least one recommendation was accepted, 0.0 otherwise. """ - annotate_if_needed( - dialogue, - self._user_nlu_config_path, - self._agent_nlu_config_path, - ) - rec = resolve_intents(recommendation_intent_labels, DEFAULT_REC_LABELS) - acc = resolve_intents(acceptance_intent_labels, DEFAULT_ACC_LABELS) - rej = resolve_intents(rejection_intent_labels, DEFAULT_REJ_LABELS) - rounds = get_recommendation_rounds(dialogue, rec) + ensure_dialogue_is_annotated(dialogue) + rounds = get_recommendation_rounds(dialogue, recommendation_intents) return float( - any(is_recommendation_accepted(r, acc, rej) for r in rounds) + any( + is_recommendation_accepted( + r, acceptance_intents, rejection_intents + ) + for r in rounds + ) ) diff --git a/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py b/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py index c5fe6fe6..ce2e9b1f 100644 --- a/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py +++ b/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py @@ -4,18 +4,16 @@ rounds in a dialogue. """ -from typing import Any, List, Optional +from typing import Any, List from dialoguekit.core.dialogue import Dialogue +from dialoguekit.core.intent import Intent from usersimcrs.evaluation.base_metric import BaseMetric from usersimcrs.evaluation.dialogue_annotation import ( - DEFAULT_ACC_LABELS, - DEFAULT_REC_LABELS, - DEFAULT_REJ_LABELS, + ensure_dialogue_is_annotated, get_recommendation_rounds, is_recommendation_accepted, - resolve_intents, ) @@ -27,38 +25,38 @@ def __init__( """Initializes the successful recommendation round ratio metric. Args: - name: Metric name. + name: Metric name. Defaults to + "successful_recommendation_round_ratio". """ super().__init__(name) def evaluate_dialogue( self, dialogue: Dialogue, - recommendation_intent_labels: Optional[List[str]] = None, - acceptance_intent_labels: Optional[List[str]] = None, - rejection_intent_labels: Optional[List[str]] = None, + recommendation_intents: List[Intent], + acceptance_intents: List[Intent], + rejection_intents: List[Intent], **kwargs: Any, ) -> float: """Computes the successful recommendation round ratio. Args: dialogue: Dialogue to evaluate. - recommendation_intent_labels: Labels for recommendation intents. - Defaults to ``["REC-S", "REC-E"]``. - acceptance_intent_labels: Labels for acceptance intents. - Defaults to ``["ACC"]``. - rejection_intent_labels: Labels for rejection intents. - Defaults to ``["REJ"]``. + recommendation_intents: Intents that indicate recommendation. + acceptance_intents: Intents that indicate acceptance. + rejection_intents: Intents that indicate rejection. Returns: Ratio of accepted recommendation rounds to total rounds, or 0.0 if there are no recommendation rounds. """ - rec = resolve_intents(recommendation_intent_labels, DEFAULT_REC_LABELS) - acc = resolve_intents(acceptance_intent_labels, DEFAULT_ACC_LABELS) - rej = resolve_intents(rejection_intent_labels, DEFAULT_REJ_LABELS) - rounds = get_recommendation_rounds(dialogue, rec) + ensure_dialogue_is_annotated(dialogue) + rounds = get_recommendation_rounds(dialogue, recommendation_intents) successful = sum( - 1 for r in rounds if is_recommendation_accepted(r, acc, rej) + 1 + for r in rounds + if is_recommendation_accepted( + r, acceptance_intents, rejection_intents + ) ) return successful / len(rounds) if rounds else 0.0 From 3b068b4a779302bb6f2668311fb76b8b19ab9a68 Mon Sep 17 00:00:00 2001 From: Ksenia Blokhina Date: Tue, 24 Mar 2026 11:55:03 +0300 Subject: [PATCH 34/38] resolve issues --- usersimcrs/evaluation/base_metric.py | 48 ++++++++ usersimcrs/evaluation/dialogue_annotation.py | 107 +++--------------- .../reward_per_dialogue_length_metric.py | 2 +- ...ssful_recommendation_round_ratio_metric.py | 4 +- 4 files changed, 65 insertions(+), 96 deletions(-) create mode 100644 usersimcrs/evaluation/base_metric.py diff --git a/usersimcrs/evaluation/base_metric.py b/usersimcrs/evaluation/base_metric.py new file mode 100644 index 00000000..c99399a2 --- /dev/null +++ b/usersimcrs/evaluation/base_metric.py @@ -0,0 +1,48 @@ +"""Abstract base class for dialogue evaluation metrics.""" + +from abc import ABC, abstractmethod +from typing import Any, Dict, List +from dialoguekit.core.dialogue import Dialogue + + +class BaseMetric(ABC): + def __init__(self, name: str) -> None: + """Initializes the metric. + + Args: + name: Metric name. + """ + self.name = name + + @abstractmethod + def evaluate_dialogue(self, dialogue: Dialogue, **kwargs: Any) -> float: + """Computes the metric for a single dialogue. + + Args: + dialogue: Single dialogue to score. + **kwargs: Additional arguments specific to the metric. + + Raises: + NotImplementedError: When not implemented by a subclass. + + Returns: + Score for the dialogue. + """ + raise NotImplementedError() + + def evaluate_dialogues( + self, dialogues: List[Dialogue], **kwargs: Any + ) -> Dict[str, float]: + """Computes the metric for every dialogue in a given list. + + Args: + dialogues: Dialogues. + **kwargs: Additional arguments specific to the metric. + + Returns: + Dictionary with result per dialogue. Keys are conversation IDs. + """ + return { + dialogue.conversation_id: self.evaluate_dialogue(dialogue, **kwargs) + for dialogue in dialogues + } diff --git a/usersimcrs/evaluation/dialogue_annotation.py b/usersimcrs/evaluation/dialogue_annotation.py index 0039594f..2cd879d5 100644 --- a/usersimcrs/evaluation/dialogue_annotation.py +++ b/usersimcrs/evaluation/dialogue_annotation.py @@ -5,7 +5,7 @@ assessing recommendation acceptance. """ -from typing import Dict, List, Optional, Sequence, Tuple +from typing import List from confuse import Configuration @@ -14,80 +14,14 @@ from dialoguekit.core.intent import Intent from dialoguekit.nlu.nlu import NLU from dialoguekit.participant.participant import DialogueParticipant - from usersimcrs.utils.simulation_utils import get_NLU -DEFAULT_REC_LABELS = ["REC-S", "REC-E"] -DEFAULT_ACC_LABELS = ["ACC"] -DEFAULT_REJ_LABELS = ["REJ"] - -_intent_cache: Dict[Tuple[str, ...], List[Intent]] = {} -_nlu_cache: Dict[str, NLU] = {} - - -def resolve_intents( - labels: Optional[Sequence[str]], defaults: List[str] -) -> List[Intent]: - """Resolves optional label overrides to a cached list of Intents. - - Args: - labels: Custom labels or None to use defaults. - defaults: Default label strings. - - Returns: - Cached list of Intent objects. - """ - key = tuple(labels if labels is not None else defaults) - if key not in _intent_cache: - _intent_cache[key] = [Intent(label) for label in key] - return _intent_cache[key] - - -DEFAULT_REC_INTENTS: List[Intent] = resolve_intents(None, DEFAULT_REC_LABELS) -DEFAULT_ACC_INTENTS: List[Intent] = resolve_intents(None, DEFAULT_ACC_LABELS) -DEFAULT_REJ_INTENTS: List[Intent] = resolve_intents(None, DEFAULT_REJ_LABELS) - - def ensure_dialogue_is_annotated(dialogue: Dialogue) -> None: - """Raises if a dialogue is not annotated with annotated utterances.""" + """Raises error if dialogue utterances are not annotated.""" for utterance in dialogue.utterances: if not isinstance(utterance, AnnotatedUtterance): - raise ValueError( - "Dialogue must be annotated (utterances must be " - "`AnnotatedUtterance`)." - ) - - -def annotate_if_needed( - dialogue: Dialogue, - user_nlu_config_path: Optional[str] = None, - agent_nlu_config_path: Optional[str] = None, -) -> None: - """Annotates the dialogue with NLU if config paths are provided. - - NLU modules are loaded lazily and cached by config path. - - Args: - dialogue: Dialogue to annotate. - user_nlu_config_path: Path to user NLU configuration. - agent_nlu_config_path: Path to agent NLU configuration. - """ - if not user_nlu_config_path or not agent_nlu_config_path: - return - if user_nlu_config_path not in _nlu_cache: - _nlu_cache[user_nlu_config_path] = load_nlu( - user_nlu_config_path, "User NLU Configuration" - ) - if agent_nlu_config_path not in _nlu_cache: - _nlu_cache[agent_nlu_config_path] = load_nlu( - agent_nlu_config_path, "Agent NLU Configuration" - ) - annotate_dialogue( - dialogue, - _nlu_cache[user_nlu_config_path], - _nlu_cache[agent_nlu_config_path], - ) + raise RuntimeError("Dialogue must be annotated.") def annotate_dialogue( @@ -96,7 +30,7 @@ def annotate_dialogue( """Annotates utterances with dialogue acts. Each utterance that is not already an AnnotatedUtterance is converted to - one. Utterances that already carry dialogue acts are left untouched. + one. Utterances that already carry dialogue acts are left untouched. Args: dialogue: Dialogue to be annotated. @@ -131,39 +65,26 @@ def annotate_dialogue( return dialogue -def load_nlu( - nlu_config_path: str, - config_name: str = "NLU Configuration", -) -> NLU: - """Loads a single NLU module from the given configuration file. - - Args: - nlu_config_path: Path to the NLU configuration file. - config_name: Name for the Configuration instance. Defaults to - ``"NLU Configuration"``. - - Returns: - NLU module. - """ - nlu_config = Configuration(config_name) - nlu_config.set_file(nlu_config_path) - return get_NLU(nlu_config) - - def annotate_dialogues( dialogues: List[Dialogue], user_nlu_config_path: str, agent_nlu_config_path: str, ) -> None: - """Annotates a batch of dialogues in place, loading NLU modules once. + """Annotates dialogues in place using NLU modules loaded once. Args: dialogues: Dialogues to annotate (modified in place). user_nlu_config_path: Path to user NLU configuration file. agent_nlu_config_path: Path to agent NLU configuration file. """ - user_nlu = load_nlu(user_nlu_config_path, "User NLU Configuration") - agent_nlu = load_nlu(agent_nlu_config_path, "Agent NLU Configuration") + user_nlu_config = Configuration("User NLU Configuration") + user_nlu_config.set_file(user_nlu_config_path) + user_nlu = get_NLU(user_nlu_config) + + agent_nlu_config = Configuration("Agent NLU Configuration") + agent_nlu_config.set_file(agent_nlu_config_path) + agent_nlu = get_NLU(agent_nlu_config) + for dialogue in dialogues: annotate_dialogue(dialogue, user_nlu, agent_nlu) @@ -174,7 +95,7 @@ def get_recommendation_rounds( """Splits a dialogue into recommendation rounds. A new round begins each time an utterance contains a recommendation - intent. + intent. Args: dialogue: Annotated dialogue. diff --git a/usersimcrs/evaluation/reward_per_dialogue_length_metric.py b/usersimcrs/evaluation/reward_per_dialogue_length_metric.py index c08881b1..ecabb410 100644 --- a/usersimcrs/evaluation/reward_per_dialogue_length_metric.py +++ b/usersimcrs/evaluation/reward_per_dialogue_length_metric.py @@ -37,7 +37,7 @@ def evaluate_dialogue( Args: dialogue: Dialogue to evaluate. - acceptance_intents: Acceptance intents (e.g., ``[Intent("ACC")]``). + acceptance_intents: Acceptance intents. Returns: Ratio of accepted recommendations to total utterances. diff --git a/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py b/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py index ce2e9b1f..a544696d 100644 --- a/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py +++ b/usersimcrs/evaluation/successful_recommendation_round_ratio_metric.py @@ -26,7 +26,7 @@ def __init__( Args: name: Metric name. Defaults to - "successful_recommendation_round_ratio". + "successful_recommendation_round_ratio". """ super().__init__(name) @@ -48,7 +48,7 @@ def evaluate_dialogue( Returns: Ratio of accepted recommendation rounds to total rounds, - or 0.0 if there are no recommendation rounds. + or 0.0 if there are no recommendation rounds. """ ensure_dialogue_is_annotated(dialogue) rounds = get_recommendation_rounds(dialogue, recommendation_intents) From 89446219dfa74d31a214e2e221a9edc143dbd39b Mon Sep 17 00:00:00 2001 From: Ksenia Blokhina Date: Tue, 24 Mar 2026 12:37:39 +0300 Subject: [PATCH 35/38] fixes --- usersimcrs/evaluation/dialogue_annotation.py | 23 +++++--------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/usersimcrs/evaluation/dialogue_annotation.py b/usersimcrs/evaluation/dialogue_annotation.py index 2cd879d5..58ffb2c9 100644 --- a/usersimcrs/evaluation/dialogue_annotation.py +++ b/usersimcrs/evaluation/dialogue_annotation.py @@ -7,14 +7,11 @@ from typing import List -from confuse import Configuration - from dialoguekit.core.annotated_utterance import AnnotatedUtterance from dialoguekit.core.dialogue import Dialogue from dialoguekit.core.intent import Intent from dialoguekit.nlu.nlu import NLU from dialoguekit.participant.participant import DialogueParticipant -from usersimcrs.utils.simulation_utils import get_NLU def ensure_dialogue_is_annotated(dialogue: Dialogue) -> None: @@ -30,7 +27,7 @@ def annotate_dialogue( """Annotates utterances with dialogue acts. Each utterance that is not already an AnnotatedUtterance is converted to - one. Utterances that already carry dialogue acts are left untouched. + one. Utterances that already carry dialogue acts are left untouched. Args: dialogue: Dialogue to be annotated. @@ -67,24 +64,16 @@ def annotate_dialogue( def annotate_dialogues( dialogues: List[Dialogue], - user_nlu_config_path: str, - agent_nlu_config_path: str, + user_nlu: NLU, + agent_nlu: NLU, ) -> None: - """Annotates dialogues in place using NLU modules loaded once. + """Annotates dialogues in place using provided NLU modules. Args: dialogues: Dialogues to annotate (modified in place). - user_nlu_config_path: Path to user NLU configuration file. - agent_nlu_config_path: Path to agent NLU configuration file. + user_nlu: NLU module for user utterances. + agent_nlu: NLU module for agent utterances. """ - user_nlu_config = Configuration("User NLU Configuration") - user_nlu_config.set_file(user_nlu_config_path) - user_nlu = get_NLU(user_nlu_config) - - agent_nlu_config = Configuration("Agent NLU Configuration") - agent_nlu_config.set_file(agent_nlu_config_path) - agent_nlu = get_NLU(agent_nlu_config) - for dialogue in dialogues: annotate_dialogue(dialogue, user_nlu, agent_nlu) From 8be81e42bf65d8f03e836f5bf9b8cf6ce571339a Mon Sep 17 00:00:00 2001 From: Ksenia Blokhina Date: Tue, 24 Mar 2026 17:59:40 +0300 Subject: [PATCH 36/38] 234-create-main-evaluation-script add eval script --- config/default/config_evaluation.yaml | 32 +++ usersimcrs/evaluation/main.py | 299 ---------------------- usersimcrs/run_evaluation.py | 351 ++++++++++++++++++++++++++ 3 files changed, 383 insertions(+), 299 deletions(-) create mode 100644 config/default/config_evaluation.yaml delete mode 100644 usersimcrs/evaluation/main.py create mode 100644 usersimcrs/run_evaluation.py diff --git a/config/default/config_evaluation.yaml b/config/default/config_evaluation.yaml new file mode 100644 index 00000000..455fc3c0 --- /dev/null +++ b/config/default/config_evaluation.yaml @@ -0,0 +1,32 @@ +dialogues: data/datasets/moviebot/annotated_dialogues.json +metrics: + - satisfaction + - success_rate + - successful_recommendation_round_ratio + - reward_per_dialogue_length +output: data/evaluation/moviebot_non_quality_results.json + +quality_llm_interface: + llm_interface_class_path: "usersimcrs.llm_interfaces.ollama_interface.OllamaLLMInterface" + llm_interface_args: + configuration_path: config/llm_interface/config_ollama_default.yaml + default_response: "" +quality_aspects: + - REC_RELEVANCE + - COM_STYLE + - FLUENCY + - CONV_FLOW + - OVERALL_SAT + +user_nlu_config: config/default/config_default.yaml +agent_nlu_config: config/default/config_default.yaml + +recommendation_intent_labels: + - REVEAL + - REVEAL.SIMILAR + - REVEAL.NONE + - REVEAL.REVISE +accept_intent_labels: + - NOTE.ACCEPT +reject_intent_labels: + - NOTE.DISLIKE \ No newline at end of file diff --git a/usersimcrs/evaluation/main.py b/usersimcrs/evaluation/main.py deleted file mode 100644 index fa1cf048..00000000 --- a/usersimcrs/evaluation/main.py +++ /dev/null @@ -1,299 +0,0 @@ -"""Unified script for evaluating dialogues with selected metrics.""" - -import argparse -import json -import os -from collections import defaultdict -from statistics import mean, stdev -from typing import Any, Dict, List, Mapping, Sequence - -from dialoguekit.core.dialogue import Dialogue -from dialoguekit.nlu.models.satisfaction_classifier import ( - SatisfactionClassifierSVM, -) -from dialoguekit.utils.dialogue_reader import json_to_dialogues - -from usersimcrs.evaluation.base_metric import BaseMetric -from usersimcrs.evaluation.dialogue_annotation import annotate_dialogues -from usersimcrs.evaluation.quality_metric import QualityMetric -from usersimcrs.evaluation.quality_rubrics import QualityRubrics -from usersimcrs.evaluation.satisfaction_metric import SatisfactionMetric -from usersimcrs.evaluation.reward_per_dialogue_length_metric import ( - RewardPerDialogueLengthMetric, -) -from usersimcrs.evaluation.success_rate_metric import SuccessRateMetric -from usersimcrs.evaluation.successful_recommendation_round_ratio_metric import ( - SuccessfulRecommendationRoundRatioMetric, -) -from usersimcrs.llm_interfaces.ollama_interface import OllamaLLMInterface - -UTILITY_METRICS = { - "success_rate", - "successful_recommendation_round_ratio", - "reward_per_dialogue_length", -} - -SUPPORTED_METRICS = [ - "quality", - "satisfaction", - "success_rate", - "successful_recommendation_round_ratio", - "reward_per_dialogue_length", -] - - -def parse_args() -> argparse.Namespace: - """Parses command-line arguments.""" - parser = argparse.ArgumentParser(prog="usersimcrs.evaluation.main") - parser.add_argument( - "--dialogues", - type=str, - required=True, - help="Path to the dialogues JSON file.", - ) - parser.add_argument( - "--metrics", - nargs="+", - required=True, - choices=SUPPORTED_METRICS, - help="List of metrics to compute.", - ) - parser.add_argument( - "--output", - type=str, - required=True, - help="Path to save evaluation results as JSON.", - ) - parser.add_argument( - "--ollama_config", - type=str, - help="Path to Ollama config file (required when quality is selected).", - ) - parser.add_argument( - "--quality_aspects", - nargs="+", - default=[aspect.name for aspect in QualityRubrics], - help=( - "Quality aspects to evaluate. " - "Defaults to all aspects in QualityRubrics." - ), - ) - parser.add_argument( - "--user_nlu_config", - type=str, - help=( - "Path to user NLU config (required for utility metrics: " - "success_rate, successful_recommendation_round_ratio, " - "reward_per_dialogue_length)." - ), - ) - parser.add_argument( - "--agent_nlu_config", - type=str, - help=( - "Path to agent NLU config (required for utility metrics: " - "success_rate, successful_recommendation_round_ratio, " - "reward_per_dialogue_length)." - ), - ) - parser.add_argument( - "--reject_intent_labels", - nargs="+", - default=["REJ"], - help="Intent labels corresponding to rejection.", - ) - parser.add_argument( - "--accept_intent_labels", - nargs="+", - default=["ACC"], - help="Intent labels corresponding to acceptance.", - ) - parser.add_argument( - "--recommendation_intent_labels", - nargs="+", - default=["REC-S", "REC-E"], - help="Intent labels corresponding to recommendation.", - ) - return parser.parse_args() - - -def _validate_args(args: argparse.Namespace) -> None: - """Validates metric-specific CLI requirements.""" - if "quality" in args.metrics and not args.ollama_config: - raise ValueError( - "The --ollama_config argument is required when using quality." - ) - - if UTILITY_METRICS.intersection(set(args.metrics)): - if not args.user_nlu_config or not args.agent_nlu_config: - raise ValueError( - "Both --user_nlu_config and --agent_nlu_config are required " - "for utility metrics." - ) - - supported_aspect_names = [aspect.name for aspect in QualityRubrics] - invalid_aspects = [ - aspect - for aspect in args.quality_aspects - if aspect not in supported_aspect_names - ] - if invalid_aspects: - raise ValueError( - f"Unknown quality aspect(s): {invalid_aspects}. " - f"Supported aspects: {supported_aspect_names}" - ) - - -def _build_metric_registry(args: argparse.Namespace) -> Dict[str, BaseMetric]: - """Builds metric instances keyed by metric name.""" - registry: Dict[str, BaseMetric] = {} - if "quality" in args.metrics: - llm_interface = OllamaLLMInterface( - configuration_path=args.ollama_config, - default_response="", - ) - registry["quality"] = QualityMetric(llm_interface=llm_interface) - if "satisfaction" in args.metrics: - registry["satisfaction"] = SatisfactionMetric( - classifier=SatisfactionClassifierSVM() - ) - if "success_rate" in args.metrics: - registry["success_rate"] = SuccessRateMetric() - if "successful_recommendation_round_ratio" in args.metrics: - registry[ - "successful_recommendation_round_ratio" - ] = SuccessfulRecommendationRoundRatioMetric() - if "reward_per_dialogue_length" in args.metrics: - registry["reward_per_dialogue_length"] = RewardPerDialogueLengthMetric() - return registry - - -def _summarize_by_agent( - dialogues: Sequence[Dialogue], scores: Mapping[str, float] -) -> Dict[str, Dict[str, float]]: - """Returns aggregate statistics by agent.""" - conversation_to_agent = { - dialogue.conversation_id: dialogue.agent_id for dialogue in dialogues - } - grouped_scores: Dict[str, List[float]] = defaultdict(list) - for conversation_id, score in scores.items(): - agent_id = conversation_to_agent.get(conversation_id, "unknown") - grouped_scores[agent_id].append(score) - - summary: Dict[str, Dict[str, float]] = {} - for agent_id, agent_scores in grouped_scores.items(): - summary[agent_id] = { - "count": float(len(agent_scores)), - "min": min(agent_scores), - "max": max(agent_scores), - "mean": mean(agent_scores), - "stdev": stdev(agent_scores) if len(agent_scores) > 1 else 0.0, - } - return summary - - -def _evaluate_metric( - metric_name: str, - metric: BaseMetric, - dialogues: Sequence[Dialogue], - args: argparse.Namespace, -) -> Dict[str, object]: - """Runs one metric and returns per-dialogue scores and summary.""" - if metric_name == "quality": - per_aspect: Dict[str, Dict[str, Any]] = {} - for aspect in args.quality_aspects: - per_dialogue = metric.evaluate_dialogues( - list(dialogues), - aspect=aspect, - ) - per_aspect[aspect] = { - "per_dialogue": per_dialogue, - "summary_by_agent": _summarize_by_agent( - dialogues, per_dialogue - ), - } - return {"aspects": per_aspect} - - eval_kwargs = {} - if metric_name in UTILITY_METRICS: - eval_kwargs = { - "recommendation_intent_labels": args.recommendation_intent_labels, - "acceptance_intent_labels": args.accept_intent_labels, - "rejection_intent_labels": args.reject_intent_labels, - } - - per_dialogue_scores = metric.evaluate_dialogues( - list(dialogues), **eval_kwargs - ) - return { - "per_dialogue": per_dialogue_scores, - "summary_by_agent": _summarize_by_agent(dialogues, per_dialogue_scores), - } - - -def _print_brief_summary(results: Mapping[str, object]) -> None: - """Prints a concise summary in the terminal.""" - metric_results = results.get("metrics", {}) - if not isinstance(metric_results, dict): - return - for metric_name, metric_result in metric_results.items(): - print(f"Metric: {metric_name}") - if metric_name == "quality": - aspects = metric_result.get("aspects", {}) - for aspect_name, aspect_result in aspects.items(): - print(f" Aspect: {aspect_name}") - for agent_id, stats in aspect_result[ - "summary_by_agent" - ].items(): - print( - f" Agent: {agent_id} | mean={stats['mean']:.3f} " - f"stdev={stats['stdev']:.3f}" - ) - continue - - for agent_id, stats in metric_result["summary_by_agent"].items(): - print( - f" Agent: {agent_id} | mean={stats['mean']:.3f} " - f"stdev={stats['stdev']:.3f}" - ) - - -def main() -> None: - args = parse_args() - _validate_args(args) - - dialogues = json_to_dialogues(args.dialogues) - - if UTILITY_METRICS.intersection(set(args.metrics)): - annotate_dialogues( - dialogues, args.user_nlu_config, args.agent_nlu_config - ) - - metric_registry = _build_metric_registry(args) - - results: Dict[str, Any] = { - "dialogues_path": args.dialogues, - "metrics_requested": args.metrics, - "metrics": {}, - } - - for metric_name in args.metrics: - metric = metric_registry[metric_name] - results["metrics"][metric_name] = _evaluate_metric( - metric_name, - metric, - dialogues, - args, - ) - - output_dir = os.path.dirname(args.output) - if output_dir: - os.makedirs(output_dir, exist_ok=True) - with open(args.output, "w") as f: - json.dump(results, f, indent=2) - - _print_brief_summary(results) - - -if __name__ == "__main__": - main() diff --git a/usersimcrs/run_evaluation.py b/usersimcrs/run_evaluation.py new file mode 100644 index 00000000..706de52a --- /dev/null +++ b/usersimcrs/run_evaluation.py @@ -0,0 +1,351 @@ +"""Console application for running evaluation.""" + +import argparse +import json +import os +from collections import defaultdict +from statistics import mean, stdev +from typing import Any, Dict, List, Mapping, Sequence + +import confuse +from dialoguekit.core.intent import Intent +from dialoguekit.nlu.models.satisfaction_classifier import ( + SatisfactionClassifierSVM, +) +from dialoguekit.utils.dialogue_reader import json_to_dialogues + +from usersimcrs.evaluation.dialogue_annotation import annotate_dialogues +from usersimcrs.evaluation.quality_metric import QualityMetric +from usersimcrs.evaluation.quality_rubrics import QualityRubrics +from usersimcrs.evaluation.reward_per_dialogue_length_metric import ( + RewardPerDialogueLengthMetric, +) +from usersimcrs.evaluation.satisfaction_metric import SatisfactionMetric +from usersimcrs.evaluation.success_rate_metric import SuccessRateMetric +from usersimcrs.evaluation.successful_recommendation_round_ratio_metric import ( + SuccessfulRecommendationRoundRatioMetric, +) +from usersimcrs.utils.simulation_utils import get_NLU, get_llm_interface + +DEFAULT_CONFIG_PATH = "config/default/config_evaluation.yaml" +UTILITY_METRICS = { + "success_rate", + "successful_recommendation_round_ratio", + "reward_per_dialogue_length", +} +SUPPORTED_METRICS = [ + "quality", + "satisfaction", + "success_rate", + "successful_recommendation_round_ratio", + "reward_per_dialogue_length", +] + + +def parse_args() -> argparse.Namespace: + """Defines accepted arguments and returns the parsed values.""" + parser = argparse.ArgumentParser(prog="run_evaluation.py") + parser.add_argument( + "-c", + "--config-file", + help=( + "Path to configuration file to overwrite default values. " + "Defaults to None." + ), + ) + parser.add_argument("--dialogues", type=str, help="Dialogues JSON file.") + parser.add_argument( + "--metrics", + nargs="+", + choices=SUPPORTED_METRICS, + help="Metrics to compute.", + ) + parser.add_argument( + "--output", + type=str, + help="Path to save evaluation results as JSON.", + ) + parser.add_argument( + "--quality_aspects", + nargs="+", + help="Quality aspects to evaluate.", + ) + parser.add_argument( + "--user_nlu_config", + type=str, + help="User NLU configuration file.", + ) + parser.add_argument( + "--agent_nlu_config", + type=str, + help="Agent NLU configuration file.", + ) + parser.add_argument( + "--reject_intent_labels", + nargs="+", + help="Intent labels corresponding to rejection.", + ) + parser.add_argument( + "--accept_intent_labels", + nargs="+", + help="Intent labels corresponding to acceptance.", + ) + parser.add_argument( + "--recommendation_intent_labels", + nargs="+", + help="Intent labels corresponding to recommendation.", + ) + parser.add_argument( + "-d", + "--debug", + action="store_const", + const=True, + help="Debug mode.", + ) + return parser.parse_args() + + +def load_config(args: argparse.Namespace) -> confuse.Configuration: + """Loads config from default file, custom file, and CLI overrides.""" + config = confuse.Configuration("usersimcrs") + config.set_file(DEFAULT_CONFIG_PATH) + if args.config_file: + config.set_file(args.config_file) + config.set_args(args, dots=True) + return config + + +def validate_config(config: confuse.Configuration) -> List[str]: + """Validates evaluation config and returns quality aspects.""" + metrics = config["metrics"].get() + if "quality" in metrics and "quality_llm_interface" not in config: + raise ValueError("Quality evaluation requires `quality_llm_interface`.") + + quality_aspects = config["quality_aspects"].get() + supported_aspects = [aspect.name for aspect in QualityRubrics] + invalid_aspects = [ + aspect for aspect in quality_aspects if aspect not in supported_aspects + ] + if invalid_aspects: + raise ValueError( + f"Unknown quality aspect(s): {invalid_aspects}. " + f"Supported aspects: {supported_aspects}" + ) + + if UTILITY_METRICS.intersection(set(metrics)): + if not config["user_nlu_config"].get(None): + raise ValueError( + "`user_nlu_config` is required for utility metrics." + ) + if not config["agent_nlu_config"].get(None): + raise ValueError( + "`agent_nlu_config` is required for utility metrics." + ) + + return quality_aspects + + +def load_nlu(config_path: str, name: str) -> Any: + """Loads one NLU component from a config path.""" + nlu_config = confuse.Configuration(name) + nlu_config.set_file(config_path) + return get_NLU(nlu_config) + + +def annotate_for_utility( + dialogues: List[Any], config: confuse.Configuration, metrics: Sequence[str] +) -> None: + """Annotates dialogues when utility metrics are requested.""" + if not UTILITY_METRICS.intersection(set(metrics)): + return + + user_nlu = load_nlu( + config["user_nlu_config"].get(), "User NLU Configuration" + ) + agent_nlu = load_nlu( + config["agent_nlu_config"].get(), "Agent NLU Configuration" + ) + annotate_dialogues(dialogues, user_nlu, agent_nlu) + + +def get_summary_by_agent( + dialogues: Sequence[Any], scores: Mapping[str, float] +) -> Dict[str, Dict[str, float]]: + """Aggregates metric scores by agent.""" + grouped_scores: Dict[str, List[float]] = defaultdict(list) + for dialogue in dialogues: + grouped_scores[dialogue.agent_id].append( + scores[dialogue.conversation_id] + ) + + return { + agent_id: { + "count": len(agent_scores), + "min": min(agent_scores), + "max": max(agent_scores), + "mean": mean(agent_scores), + "stdev": stdev(agent_scores) if len(agent_scores) > 1 else 0.0, + } + for agent_id, agent_scores in grouped_scores.items() + } + + +def get_utility_intents( + config: confuse.Configuration, +) -> Dict[str, List[Intent]]: + """Builds intent lists used by utility metrics.""" + return { + "recommendation_intents": [ + Intent(label) + for label in config["recommendation_intent_labels"].get() + ], + "acceptance_intents": [ + Intent(label) for label in config["accept_intent_labels"].get() + ], + "rejection_intents": [ + Intent(label) for label in config["reject_intent_labels"].get() + ], + } + + +def build_metric_registry( + config: confuse.Configuration, metrics: Sequence[str] +) -> Dict[str, Any]: + """Builds metric instances.""" + registry: Dict[str, Any] = {} + if "quality" in metrics: + registry["quality"] = QualityMetric( + llm_interface=get_llm_interface( + config["quality_llm_interface"].get() + ) + ) + if "satisfaction" in metrics: + registry["satisfaction"] = SatisfactionMetric( + classifier=SatisfactionClassifierSVM() + ) + if "success_rate" in metrics: + registry["success_rate"] = SuccessRateMetric() + if "successful_recommendation_round_ratio" in metrics: + registry[ + "successful_recommendation_round_ratio" + ] = SuccessfulRecommendationRoundRatioMetric() + if "reward_per_dialogue_length" in metrics: + registry["reward_per_dialogue_length"] = RewardPerDialogueLengthMetric() + return registry + + +def evaluate_metric( + metric_name: str, + metric: Any, + dialogues: List[Any], + quality_aspects: Sequence[str], + utility_intents: Dict[str, List[Intent]], +) -> Dict[str, Any]: + """Evaluates one metric and returns serialized results.""" + if metric_name == "quality": + return { + "aspects": { + aspect: { + "per_dialogue": scores, + "summary_by_agent": get_summary_by_agent(dialogues, scores), + } + for aspect in quality_aspects + for scores in [ + metric.evaluate_dialogues(dialogues, aspect=aspect) + ] + } + } + + if metric_name in { + "success_rate", + "successful_recommendation_round_ratio", + }: + scores = metric.evaluate_dialogues(dialogues, **utility_intents) + elif metric_name == "reward_per_dialogue_length": + scores = metric.evaluate_dialogues( + dialogues, + acceptance_intents=utility_intents["acceptance_intents"], + ) + else: + scores = metric.evaluate_dialogues(dialogues) + + return { + "per_dialogue": scores, + "summary_by_agent": get_summary_by_agent(dialogues, scores), + } + + +def save_results( + config: confuse.Configuration, results: Dict[str, Any] +) -> None: + """Writes config dump and evaluation results to disk.""" + output_path = config["output"].get() + output_dir = os.path.dirname(output_path) + if output_dir: + os.makedirs(output_dir, exist_ok=True) + + output_stem, _ = os.path.splitext(output_path) + with open(f"{output_stem}.meta.yaml", "w") as f: + f.write(config.dump()) + + with open(output_path, "w") as f: + json.dump(results, f, indent=2) + + +def print_summary(results: Mapping[str, Any]) -> None: + """Prints a concise terminal summary.""" + for metric_name, metric_result in results["metrics"].items(): + print(f"Metric: {metric_name}") + if metric_name == "quality": + for aspect_name, aspect_result in metric_result["aspects"].items(): + print(f" Aspect: {aspect_name}") + for agent_id, stats in aspect_result[ + "summary_by_agent" + ].items(): + print( + f" Agent: {agent_id} | mean={stats['mean']:.3f} " + f"stdev={stats['stdev']:.3f}" + ) + continue + + for agent_id, stats in metric_result["summary_by_agent"].items(): + print( + f" Agent: {agent_id} | mean={stats['mean']:.3f} " + f"stdev={stats['stdev']:.3f}" + ) + + +def main() -> None: + """Runs evaluation based on the resolved configuration.""" + args = parse_args() + config = load_config(args) + + metrics = config["metrics"].get() + quality_aspects = validate_config(config) + dialogues = json_to_dialogues(config["dialogues"].get()) + annotate_for_utility(dialogues, config, metrics) + + utility_intents = get_utility_intents(config) + metric_registry = build_metric_registry(config, metrics) + + results: Dict[str, Any] = { + "dialogues_path": config["dialogues"].get(), + "metrics_requested": metrics, + "metrics": {}, + } + + for metric_name in metrics: + results["metrics"][metric_name] = evaluate_metric( + metric_name, + metric_registry[metric_name], + dialogues, + quality_aspects, + utility_intents, + ) + + save_results(config, results) + print_summary(results) + + +if __name__ == "__main__": + main() From b3cd18fa34457b6574c7c774951eb7b82c42f232 Mon Sep 17 00:00:00 2001 From: Ksenia Blokhina Date: Tue, 14 Apr 2026 17:04:14 +0200 Subject: [PATCH 37/38] fixes --- config/default/config_evaluation.yaml | 11 +- usersimcrs/run_evaluation.py | 225 ++++++++++++++++---------- 2 files changed, 151 insertions(+), 85 deletions(-) diff --git a/config/default/config_evaluation.yaml b/config/default/config_evaluation.yaml index 455fc3c0..74a60df1 100644 --- a/config/default/config_evaluation.yaml +++ b/config/default/config_evaluation.yaml @@ -4,7 +4,7 @@ metrics: - success_rate - successful_recommendation_round_ratio - reward_per_dialogue_length -output: data/evaluation/moviebot_non_quality_results.json +output: data/evaluation/moviebot_non_quality_results quality_llm_interface: llm_interface_class_path: "usersimcrs.llm_interfaces.ollama_interface.OllamaLLMInterface" @@ -18,8 +18,11 @@ quality_aspects: - CONV_FLOW - OVERALL_SAT -user_nlu_config: config/default/config_default.yaml -agent_nlu_config: config/default/config_default.yaml +annotate_dialogues: False +user_nlu: + type: "cosine" +agent_nlu: + type: "cosine" recommendation_intent_labels: - REVEAL @@ -29,4 +32,4 @@ recommendation_intent_labels: accept_intent_labels: - NOTE.ACCEPT reject_intent_labels: - - NOTE.DISLIKE \ No newline at end of file + - NOTE.DISLIKE diff --git a/usersimcrs/run_evaluation.py b/usersimcrs/run_evaluation.py index 706de52a..1e1316b3 100644 --- a/usersimcrs/run_evaluation.py +++ b/usersimcrs/run_evaluation.py @@ -5,15 +5,17 @@ import os from collections import defaultdict from statistics import mean, stdev -from typing import Any, Dict, List, Mapping, Sequence +from typing import Any, Dict, List import confuse +from dialoguekit.core.dialogue import Dialogue from dialoguekit.core.intent import Intent from dialoguekit.nlu.models.satisfaction_classifier import ( SatisfactionClassifierSVM, ) from dialoguekit.utils.dialogue_reader import json_to_dialogues +from usersimcrs.evaluation.base_metric import BaseMetric from usersimcrs.evaluation.dialogue_annotation import annotate_dialogues from usersimcrs.evaluation.quality_metric import QualityMetric from usersimcrs.evaluation.quality_rubrics import QualityRubrics @@ -28,11 +30,6 @@ from usersimcrs.utils.simulation_utils import get_NLU, get_llm_interface DEFAULT_CONFIG_PATH = "config/default/config_evaluation.yaml" -UTILITY_METRICS = { - "success_rate", - "successful_recommendation_round_ratio", - "reward_per_dialogue_length", -} SUPPORTED_METRICS = [ "quality", "satisfaction", @@ -43,7 +40,11 @@ def parse_args() -> argparse.Namespace: - """Defines accepted arguments and returns the parsed values.""" + """Defines accepted arguments and returns the parsed values. + + Returns: + Parsed command-line arguments. + """ parser = argparse.ArgumentParser(prog="run_evaluation.py") parser.add_argument( "-c", @@ -63,7 +64,7 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "--output", type=str, - help="Path to save evaluation results as JSON.", + help="Directory to save evaluation results and metadata.", ) parser.add_argument( "--quality_aspects", @@ -71,14 +72,10 @@ def parse_args() -> argparse.Namespace: help="Quality aspects to evaluate.", ) parser.add_argument( - "--user_nlu_config", - type=str, - help="User NLU configuration file.", - ) - parser.add_argument( - "--agent_nlu_config", - type=str, - help="Agent NLU configuration file.", + "--annotate_dialogues", + action="store_const", + const=True, + help="Annotate dialogues before computing metrics.", ) parser.add_argument( "--reject_intent_labels", @@ -106,17 +103,43 @@ def parse_args() -> argparse.Namespace: def load_config(args: argparse.Namespace) -> confuse.Configuration: - """Loads config from default file, custom file, and CLI overrides.""" + """Loads config from default file, custom file, and CLI overrides. + + Args: + args: Arguments parsed with argparse. + + Returns: + Resolved evaluation configuration. + """ config = confuse.Configuration("usersimcrs") config.set_file(DEFAULT_CONFIG_PATH) if args.config_file: config.set_file(args.config_file) config.set_args(args, dots=True) + + output_dir = config["output"].get() + output_stem, output_extension = os.path.splitext(output_dir) + if output_extension: + output_dir = output_stem + os.makedirs(output_dir, exist_ok=True) + with open(os.path.join(output_dir, "config.meta.yaml"), "w") as f: + f.write(config.dump()) + return config -def validate_config(config: confuse.Configuration) -> List[str]: - """Validates evaluation config and returns quality aspects.""" +def validate_config(config: confuse.Configuration) -> None: + """Validates evaluation config. + + Args: + config: Configuration generated from YAML configuration file. + + Raises: + ValueError: If quality evaluation is requested without an LLM + interface, if an unknown quality aspect is configured, or if + dialogue annotation is requested without user and agent NLU + sections. + """ metrics = config["metrics"].get() if "quality" in metrics and "quality_llm_interface" not in config: raise ValueError("Quality evaluation requires `quality_llm_interface`.") @@ -132,46 +155,69 @@ def validate_config(config: confuse.Configuration) -> List[str]: f"Supported aspects: {supported_aspects}" ) - if UTILITY_METRICS.intersection(set(metrics)): - if not config["user_nlu_config"].get(None): + if config["annotate_dialogues"].get(): + if not config["user_nlu"].get(None): raise ValueError( - "`user_nlu_config` is required for utility metrics." + "`user_nlu` is required when `annotate_dialogues` is True." ) - if not config["agent_nlu_config"].get(None): + if not config["agent_nlu"].get(None): raise ValueError( - "`agent_nlu_config` is required for utility metrics." + "`agent_nlu` is required when `annotate_dialogues` is True." ) - return quality_aspects +def load_nlu( + config: confuse.Configuration, nlu_config_key: str, name: str +) -> Any: + """Loads one NLU component from an evaluation config section. -def load_nlu(config_path: str, name: str) -> Any: - """Loads one NLU component from a config path.""" + Args: + config: Evaluation configuration. + nlu_config_key: Name of the NLU section to load. + name: Name for the temporary NLU configuration. + + Returns: + NLU component. + """ nlu_config = confuse.Configuration(name) - nlu_config.set_file(config_path) + nlu_config.set( + { + "dialogues": config["dialogues"].get(), + "nlu": config[nlu_config_key].get(), + } + ) return get_NLU(nlu_config) -def annotate_for_utility( - dialogues: List[Any], config: confuse.Configuration, metrics: Sequence[str] +def annotate_for_metrics( + dialogues: List[Dialogue], config: confuse.Configuration ) -> None: - """Annotates dialogues when utility metrics are requested.""" - if not UTILITY_METRICS.intersection(set(metrics)): + """Annotates dialogues when requested by configuration. + + Args: + dialogues: Dialogues to annotate in place. + config: Evaluation configuration. + """ + if not config["annotate_dialogues"].get(): return - user_nlu = load_nlu( - config["user_nlu_config"].get(), "User NLU Configuration" - ) - agent_nlu = load_nlu( - config["agent_nlu_config"].get(), "Agent NLU Configuration" - ) + user_nlu = load_nlu(config, "user_nlu", "User NLU Configuration") + agent_nlu = load_nlu(config, "agent_nlu", "Agent NLU Configuration") annotate_dialogues(dialogues, user_nlu, agent_nlu) def get_summary_by_agent( - dialogues: Sequence[Any], scores: Mapping[str, float] + dialogues: List[Dialogue], scores: Dict[str, float] ) -> Dict[str, Dict[str, float]]: - """Aggregates metric scores by agent.""" + """Aggregates metric scores by agent. + + Args: + dialogues: Evaluated dialogues. + scores: Per-dialogue scores keyed by conversation ID. + + Returns: + Descriptive score statistics keyed by agent ID. + """ grouped_scores: Dict[str, List[float]] = defaultdict(list) for dialogue in dialogues: grouped_scores[dialogue.agent_id].append( @@ -193,7 +239,14 @@ def get_summary_by_agent( def get_utility_intents( config: confuse.Configuration, ) -> Dict[str, List[Intent]]: - """Builds intent lists used by utility metrics.""" + """Builds intent lists used by utility metrics. + + Args: + config: Evaluation configuration. + + Returns: + Utility intent lists keyed by metric argument name. + """ return { "recommendation_intents": [ Intent(label) @@ -209,10 +262,18 @@ def get_utility_intents( def build_metric_registry( - config: confuse.Configuration, metrics: Sequence[str] -) -> Dict[str, Any]: - """Builds metric instances.""" - registry: Dict[str, Any] = {} + config: confuse.Configuration, metrics: List[str] +) -> Dict[str, BaseMetric]: + """Builds metric instances. + + Args: + config: Evaluation configuration. + metrics: Names of metrics to evaluate. + + Returns: + Metric instances keyed by metric name. + """ + registry: Dict[str, BaseMetric] = {} if "quality" in metrics: registry["quality"] = QualityMetric( llm_interface=get_llm_interface( @@ -236,25 +297,32 @@ def build_metric_registry( def evaluate_metric( metric_name: str, - metric: Any, - dialogues: List[Any], - quality_aspects: Sequence[str], + metric: BaseMetric, + dialogues: List[Dialogue], + quality_aspects: List[str], utility_intents: Dict[str, List[Intent]], ) -> Dict[str, Any]: - """Evaluates one metric and returns serialized results.""" + """Evaluates one metric and returns serialized results. + + Args: + metric_name: Name of the metric to evaluate. + metric: Metric instance. + dialogues: Dialogues to evaluate. + quality_aspects: Quality aspects to evaluate for quality metrics. + utility_intents: Utility intent arguments. + + Returns: + Serialized metric result. + """ if metric_name == "quality": - return { - "aspects": { - aspect: { - "per_dialogue": scores, - "summary_by_agent": get_summary_by_agent(dialogues, scores), - } - for aspect in quality_aspects - for scores in [ - metric.evaluate_dialogues(dialogues, aspect=aspect) - ] + aspect_results = {} + for aspect in quality_aspects: + scores = metric.evaluate_dialogues(dialogues, aspect=aspect) + aspect_results[aspect] = { + "per_dialogue": scores, + "summary_by_agent": get_summary_by_agent(dialogues, scores), } - } + return {"aspects": aspect_results} if metric_name in { "success_rate", @@ -275,25 +343,12 @@ def evaluate_metric( } -def save_results( - config: confuse.Configuration, results: Dict[str, Any] -) -> None: - """Writes config dump and evaluation results to disk.""" - output_path = config["output"].get() - output_dir = os.path.dirname(output_path) - if output_dir: - os.makedirs(output_dir, exist_ok=True) - - output_stem, _ = os.path.splitext(output_path) - with open(f"{output_stem}.meta.yaml", "w") as f: - f.write(config.dump()) - - with open(output_path, "w") as f: - json.dump(results, f, indent=2) - +def print_summary(results: Dict[str, Any]) -> None: + """Prints a concise terminal summary. -def print_summary(results: Mapping[str, Any]) -> None: - """Prints a concise terminal summary.""" + Args: + results: Serialized evaluation results. + """ for metric_name, metric_result in results["metrics"].items(): print(f"Metric: {metric_name}") if metric_name == "quality": @@ -321,9 +376,10 @@ def main() -> None: config = load_config(args) metrics = config["metrics"].get() - quality_aspects = validate_config(config) + validate_config(config) + quality_aspects = config["quality_aspects"].get() dialogues = json_to_dialogues(config["dialogues"].get()) - annotate_for_utility(dialogues, config, metrics) + annotate_for_metrics(dialogues, config) utility_intents = get_utility_intents(config) metric_registry = build_metric_registry(config, metrics) @@ -343,7 +399,14 @@ def main() -> None: utility_intents, ) - save_results(config, results) + output_dir = config["output"].get() + output_stem, output_extension = os.path.splitext(output_dir) + if output_extension: + output_dir = output_stem + + with open(os.path.join(output_dir, "results.json"), "w") as f: + json.dump(results, f, indent=2) + print_summary(results) From 938ccac8208ca61598b9eb7571821a813f806aae Mon Sep 17 00:00:00 2001 From: Ksenia Blokhina Date: Tue, 21 Apr 2026 14:24:21 +0200 Subject: [PATCH 38/38] fix evaluation --- config/default/config_evaluation.yaml | 3 +- usersimcrs/run_evaluation.py | 119 +++++++++----------------- usersimcrs/utils/simulation_utils.py | 5 +- 3 files changed, 46 insertions(+), 81 deletions(-) diff --git a/config/default/config_evaluation.yaml b/config/default/config_evaluation.yaml index 74a60df1..6b63cd3e 100644 --- a/config/default/config_evaluation.yaml +++ b/config/default/config_evaluation.yaml @@ -1,10 +1,11 @@ dialogues: data/datasets/moviebot/annotated_dialogues.json +debug: False metrics: - satisfaction - success_rate - successful_recommendation_round_ratio - reward_per_dialogue_length -output: data/evaluation/moviebot_non_quality_results +output_dir: data/evaluation/moviebot_non_quality_results quality_llm_interface: llm_interface_class_path: "usersimcrs.llm_interfaces.ollama_interface.OllamaLLMInterface" diff --git a/usersimcrs/run_evaluation.py b/usersimcrs/run_evaluation.py index 1e1316b3..8f6eaf22 100644 --- a/usersimcrs/run_evaluation.py +++ b/usersimcrs/run_evaluation.py @@ -5,7 +5,7 @@ import os from collections import defaultdict from statistics import mean, stdev -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional import confuse from dialoguekit.core.dialogue import Dialogue @@ -62,7 +62,8 @@ def parse_args() -> argparse.Namespace: help="Metrics to compute.", ) parser.add_argument( - "--output", + "--output-dir", + dest="output_dir", type=str, help="Directory to save evaluation results and metadata.", ) @@ -117,12 +118,16 @@ def load_config(args: argparse.Namespace) -> confuse.Configuration: config.set_file(args.config_file) config.set_args(args, dots=True) - output_dir = config["output"].get() + validate_config(config) + + output_dir = config["output_dir"].get() output_stem, output_extension = os.path.splitext(output_dir) if output_extension: output_dir = output_stem os.makedirs(output_dir, exist_ok=True) - with open(os.path.join(output_dir, "config.meta.yaml"), "w") as f: + with open( + os.path.join(output_dir, "config_evaluation.meta.yaml"), "w" + ) as f: f.write(config.dump()) return config @@ -135,10 +140,10 @@ def validate_config(config: confuse.Configuration) -> None: config: Configuration generated from YAML configuration file. Raises: - ValueError: If quality evaluation is requested without an LLM - interface, if an unknown quality aspect is configured, or if - dialogue annotation is requested without user and agent NLU - sections. + ValueError: If quality evaluation is requested without an LLM + interface, if an unknown quality aspect is configured, or if + dialogue annotation is requested without user and agent NLU + sections. """ metrics = config["metrics"].get() if "quality" in metrics and "quality_llm_interface" not in config: @@ -166,43 +171,17 @@ def validate_config(config: confuse.Configuration) -> None: ) -def load_nlu( - config: confuse.Configuration, nlu_config_key: str, name: str -) -> Any: - """Loads one NLU component from an evaluation config section. - - Args: - config: Evaluation configuration. - nlu_config_key: Name of the NLU section to load. - name: Name for the temporary NLU configuration. - - Returns: - NLU component. - """ - nlu_config = confuse.Configuration(name) - nlu_config.set( - { - "dialogues": config["dialogues"].get(), - "nlu": config[nlu_config_key].get(), - } - ) - return get_NLU(nlu_config) - - def annotate_for_metrics( dialogues: List[Dialogue], config: confuse.Configuration ) -> None: - """Annotates dialogues when requested by configuration. + """Annotates dialogues for metrics that require dialogue acts. Args: dialogues: Dialogues to annotate in place. config: Evaluation configuration. """ - if not config["annotate_dialogues"].get(): - return - - user_nlu = load_nlu(config, "user_nlu", "User NLU Configuration") - agent_nlu = load_nlu(config, "agent_nlu", "Agent NLU Configuration") + user_nlu = get_NLU(config, nlu_config_key="user_nlu") + agent_nlu = get_NLU(config, nlu_config_key="agent_nlu") annotate_dialogues(dialogues, user_nlu, agent_nlu) @@ -236,31 +215,6 @@ def get_summary_by_agent( } -def get_utility_intents( - config: confuse.Configuration, -) -> Dict[str, List[Intent]]: - """Builds intent lists used by utility metrics. - - Args: - config: Evaluation configuration. - - Returns: - Utility intent lists keyed by metric argument name. - """ - return { - "recommendation_intents": [ - Intent(label) - for label in config["recommendation_intent_labels"].get() - ], - "acceptance_intents": [ - Intent(label) for label in config["accept_intent_labels"].get() - ], - "rejection_intents": [ - Intent(label) for label in config["reject_intent_labels"].get() - ], - } - - def build_metric_registry( config: confuse.Configuration, metrics: List[str] ) -> Dict[str, BaseMetric]: @@ -296,27 +250,25 @@ def build_metric_registry( def evaluate_metric( - metric_name: str, metric: BaseMetric, dialogues: List[Dialogue], - quality_aspects: List[str], - utility_intents: Dict[str, List[Intent]], + quality_aspects: Optional[List[str]] = None, + utility_intents: Optional[Dict[str, List[Intent]]] = None, ) -> Dict[str, Any]: """Evaluates one metric and returns serialized results. Args: - metric_name: Name of the metric to evaluate. metric: Metric instance. dialogues: Dialogues to evaluate. quality_aspects: Quality aspects to evaluate for quality metrics. - utility_intents: Utility intent arguments. + utility_intents: Utility intent arguments for utility metrics. Returns: Serialized metric result. """ - if metric_name == "quality": + if metric.name == "quality": aspect_results = {} - for aspect in quality_aspects: + for aspect in quality_aspects or []: scores = metric.evaluate_dialogues(dialogues, aspect=aspect) aspect_results[aspect] = { "per_dialogue": scores, @@ -324,12 +276,13 @@ def evaluate_metric( } return {"aspects": aspect_results} - if metric_name in { + utility_intents = utility_intents or {} + if metric.name in { "success_rate", "successful_recommendation_round_ratio", }: scores = metric.evaluate_dialogues(dialogues, **utility_intents) - elif metric_name == "reward_per_dialogue_length": + elif metric.name == "reward_per_dialogue_length": scores = metric.evaluate_dialogues( dialogues, acceptance_intents=utility_intents["acceptance_intents"], @@ -376,12 +329,23 @@ def main() -> None: config = load_config(args) metrics = config["metrics"].get() - validate_config(config) quality_aspects = config["quality_aspects"].get() dialogues = json_to_dialogues(config["dialogues"].get()) - annotate_for_metrics(dialogues, config) + if config["annotate_dialogues"].get(): + annotate_for_metrics(dialogues, config) - utility_intents = get_utility_intents(config) + utility_intents = { + "recommendation_intents": [ + Intent(label) + for label in config["recommendation_intent_labels"].get() + ], + "acceptance_intents": [ + Intent(label) for label in config["accept_intent_labels"].get() + ], + "rejection_intents": [ + Intent(label) for label in config["reject_intent_labels"].get() + ], + } metric_registry = build_metric_registry(config, metrics) results: Dict[str, Any] = { @@ -392,14 +356,13 @@ def main() -> None: for metric_name in metrics: results["metrics"][metric_name] = evaluate_metric( - metric_name, metric_registry[metric_name], dialogues, - quality_aspects, - utility_intents, + quality_aspects=quality_aspects, + utility_intents=utility_intents, ) - output_dir = config["output"].get() + output_dir = config["output_dir"].get() output_stem, output_extension = os.path.splitext(output_dir) if output_extension: output_dir = output_stem diff --git a/usersimcrs/utils/simulation_utils.py b/usersimcrs/utils/simulation_utils.py index 003ffcae..ce388c61 100644 --- a/usersimcrs/utils/simulation_utils.py +++ b/usersimcrs/utils/simulation_utils.py @@ -188,11 +188,12 @@ def _get_agenda_based_simulator_config( } -def get_NLU(config: confuse.Configuration) -> NLU: +def get_NLU(config: confuse.Configuration, nlu_config_key: str = "nlu") -> NLU: """Returns an NLU component. Args: config: Configuration for the simulation. + nlu_config_key: Configuration key containing the NLU settings. Raises: ValueError: Unsupported intent classifier. @@ -200,7 +201,7 @@ def get_NLU(config: confuse.Configuration) -> NLU: Returns: An NLU component. """ - nlu_config = config["nlu"].get() + nlu_config = config[nlu_config_key].get() intent_classifier = nlu_config.get("type") if intent_classifier == "cosine": # NLU without slot annotators