From 4f73fdc978ff086faa85cd04bd3bb633e652e3b2 Mon Sep 17 00:00:00 2001 From: johnz Date: Tue, 16 Jun 2026 22:48:55 +0800 Subject: [PATCH 1/6] eval: add HaluMem-medium smoke-subset helper --- .gitignore | 4 +++- eval/scripts/make_smoke_subset.py | 17 +++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) create mode 100644 eval/scripts/make_smoke_subset.py diff --git a/.gitignore b/.gitignore index 029a928..461311f 100644 --- a/.gitignore +++ b/.gitignore @@ -141,4 +141,6 @@ data/native_math_gpt-oss-120b-.jsonl data/eli5.parquet # eval results -eval/results/* \ No newline at end of file +eval/results/* +eval/data/* +.venv-halumem-download/ diff --git a/eval/scripts/make_smoke_subset.py b/eval/scripts/make_smoke_subset.py new file mode 100644 index 0000000..9026fde --- /dev/null +++ b/eval/scripts/make_smoke_subset.py @@ -0,0 +1,17 @@ +import json +import sys + + +def main(src: str, dst: str, n: int) -> None: + with open(src, encoding="utf-8") as f_in, open(dst, "w", encoding="utf-8") as f_out: + written = 0 + for i, line in enumerate(f_in): + if i >= n: + break + f_out.write(line if line.endswith("\n") else line + "\n") + written += 1 + print(f"wrote {written} users -> {dst}") + + +if __name__ == "__main__": + main(sys.argv[1], sys.argv[2], int(sys.argv[3])) From 79fff29b36af596d2ba0bec9606de021cb6bf538 Mon Sep 17 00:00:00 2001 From: johnz Date: Tue, 16 Jun 2026 23:25:12 +0800 Subject: [PATCH 2/6] eval: add arika artifact->results assembler --- eval/eval_arika.py | 62 +++++++++++++++++++ .../fixtures/arika_artifact_one_user.jsonl | 1 + eval/tests/test_eval_arika.py | 40 ++++++++++++ 3 files changed, 103 insertions(+) create mode 100644 eval/eval_arika.py create mode 100644 eval/tests/fixtures/arika_artifact_one_user.jsonl create mode 100644 eval/tests/test_eval_arika.py diff --git a/eval/eval_arika.py b/eval/eval_arika.py new file mode 100644 index 0000000..acd3ae6 --- /dev/null +++ b/eval/eval_arika.py @@ -0,0 +1,62 @@ +import argparse +import json +import os + +from prompts import PROMPT_MEMZERO + + +def _iter_jsonl(path: str): + with open(path, encoding="utf-8") as f: + for line in f: + line = line.strip() + if line: + yield json.loads(line) + + +def _default_llm(prompt: str) -> str: + from llms import llm_request + + return llm_request(prompt) + + +def assemble_results(artifact_path: str, mode: str, llm=_default_llm): + users = [] + for user in _iter_jsonl(artifact_path): + for session in user["sessions"]: + arika_qs = session.get("arika_questions", []) or [] + for index, qa in enumerate(session.get("questions", []) or []): + arika = arika_qs[index] if index < len(arika_qs) else {} + qa["context"] = arika.get("context", "") + if "search_duration_ms" in arika: + qa["search_duration_ms"] = arika["search_duration_ms"] + if "answer_duration_ms" in arika: + qa["response_duration_ms"] = arika["answer_duration_ms"] + if mode == "e2e": + qa["system_response"] = arika.get("arika_answer", "") + else: + prompt = PROMPT_MEMZERO.format(context=qa["context"], question=qa["question"]) + qa["system_response"] = llm(prompt) + session.pop("arika_questions", None) + users.append(user) + return users + + +def main(artifact_path: str, frame: str, version: str, mode: str): + save_dir = f"results/{frame}-{version}/" + os.makedirs(save_dir, exist_ok=True) + out = os.path.join(save_dir, f"{frame}_eval_results.jsonl") + users = assemble_results(artifact_path, mode=mode) + with open(out, "w", encoding="utf-8") as f: + for user in users: + f.write(json.dumps(user, ensure_ascii=False) + "\n") + print(f"wrote {out}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--artifact", required=True) + parser.add_argument("--frame", default="arika", choices=["arika", "arika-e2e", "arika-points"]) + parser.add_argument("--version", default="smoke") + parser.add_argument("--mode", default="retrieval", choices=["retrieval", "e2e"]) + args = parser.parse_args() + main(args.artifact, args.frame, args.version, args.mode) diff --git a/eval/tests/fixtures/arika_artifact_one_user.jsonl b/eval/tests/fixtures/arika_artifact_one_user.jsonl new file mode 100644 index 0000000..58ed775 --- /dev/null +++ b/eval/tests/fixtures/arika_artifact_one_user.jsonl @@ -0,0 +1 @@ +{"uuid":"u1","user_name":"Martin Mark","backend":"cloud","sessions":[{"start_time":"Sep 04, 2025, 18:42:18","dialogue":[{"role":"user","content":"I live in Columbus.","timestamp":"Sep 04, 2025, 18:42:18"}],"memory_points":[{"memory_content":"User lives in Columbus","is_update":"False","memory_source":"system"}],"questions":[{"question":"Where does Martin live?","answer":"Columbus","evidence":[{"memory_content":"User lives in Columbus"}],"category":"Basic Fact Recall"}],"extracted_memories":["2025-09-04 · Martin Mark: Lives in Columbus"],"add_dialogue_duration_ms":10.0,"arika_questions":[{"context":"Memories for user Martin Mark:\n[\"Lives in Columbus\"]","arika_answer":"Columbus","search_duration_ms":1.0,"answer_duration_ms":2.0}]}]} diff --git a/eval/tests/test_eval_arika.py b/eval/tests/test_eval_arika.py new file mode 100644 index 0000000..0dd6697 --- /dev/null +++ b/eval/tests/test_eval_arika.py @@ -0,0 +1,40 @@ +import os +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +from eval_arika import assemble_results + +FIXTURE = os.path.join(os.path.dirname(__file__), "fixtures", "arika_artifact_one_user.jsonl") + + +def _fake_llm(prompt: str) -> str: + return "Columbus" + + +def test_retrieval_only_frame_injects_system_response(): + users = assemble_results(FIXTURE, mode="retrieval", llm=_fake_llm) + qa = users[0]["sessions"][0]["questions"][0] + assert qa["system_response"] == "Columbus" + assert "context" in qa + assert users[0]["sessions"][0]["extracted_memories"] + + +def test_e2e_frame_uses_arika_answer(): + users = assemble_results(FIXTURE, mode="e2e", llm=None) + qa = users[0]["sessions"][0]["questions"][0] + assert qa["system_response"] == "Columbus" + + +def test_no_memories_from_system_so_no_update_records(): + users = assemble_results(FIXTURE, mode="retrieval", llm=_fake_llm) + mp = users[0]["sessions"][0]["memory_points"] + assert all("memories_from_system" not in m for m in mp) + + +def test_preserves_scorer_required_qa_fields_and_copies_timing(): + users = assemble_results(FIXTURE, mode="retrieval", llm=_fake_llm) + qa = users[0]["sessions"][0]["questions"][0] + assert qa["answer"] == "Columbus" + assert qa["evidence"][0]["memory_content"] == "User lives in Columbus" + assert qa["search_duration_ms"] == 1.0 From cc4b0a3b35f213e2a63a440c9fb7b479a6281a95 Mon Sep 17 00:00:00 2001 From: johnz Date: Tue, 16 Jun 2026 23:31:53 +0800 Subject: [PATCH 3/6] eval: register arika frames + guard zero-count divisions --- eval/evaluation.py | 63 ++++++++++++++++++++++++++-------------------- 1 file changed, 36 insertions(+), 27 deletions(-) diff --git a/eval/evaluation.py b/eval/evaluation.py index 696e838..1e6cda8 100644 --- a/eval/evaluation.py +++ b/eval/evaluation.py @@ -25,11 +25,17 @@ def compute_f1(precision: float, recall: float) -> float: Returns: float: F1-score """ + if precision is None or recall is None: + return None if precision + recall == 0: return 0.0 return 2 * (precision * recall) / (precision + recall) +def _safe_ratio(numerator, denominator): + return (numerator / denominator) if denominator else None + + def process_user(idx: int, user_data: dict, max_workers: int = 10): uuid = user_data["uuid"] user_name = user_data["user_name"] @@ -236,16 +242,16 @@ def aggregate_eval_results(eval_results): interference_memory_scores += 1 interference_memory_valid_num += 1 - eval_results["overall_score"]["memory_integrity"]["recall(all)"] = memory_integrity_scores / memory_integrity_num - eval_results["overall_score"]["memory_integrity"]["recall(valid)"] = memory_integrity_scores / memory_integrity_valid_num - eval_results["overall_score"]["memory_integrity"]["weighted_recall(all)"] = memory_integrity_weighted_scores / memory_integrity_weighted_num - eval_results["overall_score"]["memory_integrity"]["weighted_recall(valid)"] = memory_integrity_weighted_scores / memory_integrity_weighted_valid_num + eval_results["overall_score"]["memory_integrity"]["recall(all)"] = _safe_ratio(memory_integrity_scores, memory_integrity_num) + eval_results["overall_score"]["memory_integrity"]["recall(valid)"] = _safe_ratio(memory_integrity_scores, memory_integrity_valid_num) + eval_results["overall_score"]["memory_integrity"]["weighted_recall(all)"] = _safe_ratio(memory_integrity_weighted_scores, memory_integrity_weighted_num) + eval_results["overall_score"]["memory_integrity"]["weighted_recall(valid)"] = _safe_ratio(memory_integrity_weighted_scores, memory_integrity_weighted_valid_num) eval_results["overall_score"]["memory_integrity"]["memory_valid_importance_sum"] = memory_integrity_weighted_valid_num eval_results["overall_score"]["memory_integrity"]["memory_importance_sum"] = memory_integrity_weighted_num eval_results["overall_score"]["memory_integrity"]["memory_valid_num"] = memory_integrity_valid_num eval_results["overall_score"]["memory_integrity"]["memory_num"] = memory_integrity_num - eval_results["overall_score"]["memory_accuracy"]["interference_accuracy(all)"] = interference_memory_scores / interference_memory_num - eval_results["overall_score"]["memory_accuracy"]["interference_accuracy(valid)"] = interference_memory_scores / interference_memory_valid_num + eval_results["overall_score"]["memory_accuracy"]["interference_accuracy(all)"] = _safe_ratio(interference_memory_scores, interference_memory_num) + eval_results["overall_score"]["memory_accuracy"]["interference_accuracy(valid)"] = _safe_ratio(interference_memory_scores, interference_memory_valid_num) eval_results["overall_score"]["memory_accuracy"]["interference_memory_valid_num"] = interference_memory_valid_num eval_results["overall_score"]["memory_accuracy"]["interference_memory_num"] = interference_memory_num @@ -276,12 +282,12 @@ def aggregate_eval_results(eval_results): memory_accuracy_weighted_scores += 0.5 * item["memory_accuracy_score"] memory_accuracy_valid_num += 1 - eval_results["overall_score"]["memory_accuracy"]["target_accuracy(all)"] = target_memory_accuracy_scores / target_memory_accuracy_num - eval_results["overall_score"]["memory_accuracy"]["target_accuracy(valid)"] = target_memory_accuracy_scores / target_memory_accuracy_valid_num + eval_results["overall_score"]["memory_accuracy"]["target_accuracy(all)"] = _safe_ratio(target_memory_accuracy_scores, target_memory_accuracy_num) + eval_results["overall_score"]["memory_accuracy"]["target_accuracy(valid)"] = _safe_ratio(target_memory_accuracy_scores, target_memory_accuracy_valid_num) eval_results["overall_score"]["memory_accuracy"]["target_memory_valid_num"] = target_memory_accuracy_valid_num eval_results["overall_score"]["memory_accuracy"]["target_memory_num"] = target_memory_accuracy_num - eval_results["overall_score"]["memory_accuracy"]["weighted_accuracy(all)"] = memory_accuracy_weighted_scores / memory_accuracy_num - eval_results["overall_score"]["memory_accuracy"]["weighted_accuracy(valid)"] = memory_accuracy_weighted_scores / memory_accuracy_valid_num + eval_results["overall_score"]["memory_accuracy"]["weighted_accuracy(all)"] = _safe_ratio(memory_accuracy_weighted_scores, memory_accuracy_num) + eval_results["overall_score"]["memory_accuracy"]["weighted_accuracy(valid)"] = _safe_ratio(memory_accuracy_weighted_scores, memory_accuracy_valid_num) eval_results["overall_score"]["memory_accuracy"]["memory_valid_num"] = memory_accuracy_valid_num eval_results["overall_score"]["memory_accuracy"]["memory_num"] = memory_accuracy_num @@ -318,14 +324,14 @@ def aggregate_eval_results(eval_results): update_memory_valid_num += 1 - eval_results["overall_score"]["memory_update"]["correct_update_memory_ratio(all)"] = correct_update_memory_num / update_memory_num - eval_results["overall_score"]["memory_update"]["correct_update_memory_ratio(valid)"] = correct_update_memory_num / update_memory_valid_num - eval_results["overall_score"]["memory_update"]["hallucination_update_memory_ratio(all)"] = hallucination_update_memory_num / update_memory_num - eval_results["overall_score"]["memory_update"]["hallucination_update_memory_ratio(valid)"] = hallucination_update_memory_num / update_memory_valid_num - eval_results["overall_score"]["memory_update"]["omission_update_memory_ratio(all)"] = omission_update_memory_num / update_memory_num - eval_results["overall_score"]["memory_update"]["omission_update_memory_ratio(valid)"] = omission_update_memory_num / update_memory_valid_num - eval_results["overall_score"]["memory_update"]["other_update_memory_ratio(all)"] = other_update_memory_num / update_memory_num - eval_results["overall_score"]["memory_update"]["other_update_memory_ratio(valid)"] = other_update_memory_num / update_memory_valid_num + eval_results["overall_score"]["memory_update"]["correct_update_memory_ratio(all)"] = _safe_ratio(correct_update_memory_num, update_memory_num) + eval_results["overall_score"]["memory_update"]["correct_update_memory_ratio(valid)"] = _safe_ratio(correct_update_memory_num, update_memory_valid_num) + eval_results["overall_score"]["memory_update"]["hallucination_update_memory_ratio(all)"] = _safe_ratio(hallucination_update_memory_num, update_memory_num) + eval_results["overall_score"]["memory_update"]["hallucination_update_memory_ratio(valid)"] = _safe_ratio(hallucination_update_memory_num, update_memory_valid_num) + eval_results["overall_score"]["memory_update"]["omission_update_memory_ratio(all)"] = _safe_ratio(omission_update_memory_num, update_memory_num) + eval_results["overall_score"]["memory_update"]["omission_update_memory_ratio(valid)"] = _safe_ratio(omission_update_memory_num, update_memory_valid_num) + eval_results["overall_score"]["memory_update"]["other_update_memory_ratio(all)"] = _safe_ratio(other_update_memory_num, update_memory_num) + eval_results["overall_score"]["memory_update"]["other_update_memory_ratio(valid)"] = _safe_ratio(other_update_memory_num, update_memory_valid_num) eval_results["overall_score"]["memory_update"]["update_memory_valid_num"] = update_memory_valid_num eval_results["overall_score"]["memory_update"]["update_memory_num"] = update_memory_num @@ -352,12 +358,12 @@ def aggregate_eval_results(eval_results): qa_valid_num += 1 - eval_results["overall_score"]["question_answering"]["correct_qa_ratio(all)"] = correct_qa_num / qa_num - eval_results["overall_score"]["question_answering"]["correct_qa_ratio(valid)"] = correct_qa_num / qa_valid_num - eval_results["overall_score"]["question_answering"]["hallucination_qa_ratio(all)"] = hallucination_qa_num / qa_num - eval_results["overall_score"]["question_answering"]["hallucination_qa_ratio(valid)"] = hallucination_qa_num / qa_valid_num - eval_results["overall_score"]["question_answering"]["omission_qa_ratio(all)"] = omission_qa_num / qa_num - eval_results["overall_score"]["question_answering"]["omission_qa_ratio(valid)"] = omission_qa_num / qa_valid_num + eval_results["overall_score"]["question_answering"]["correct_qa_ratio(all)"] = _safe_ratio(correct_qa_num, qa_num) + eval_results["overall_score"]["question_answering"]["correct_qa_ratio(valid)"] = _safe_ratio(correct_qa_num, qa_valid_num) + eval_results["overall_score"]["question_answering"]["hallucination_qa_ratio(all)"] = _safe_ratio(hallucination_qa_num, qa_num) + eval_results["overall_score"]["question_answering"]["hallucination_qa_ratio(valid)"] = _safe_ratio(hallucination_qa_num, qa_valid_num) + eval_results["overall_score"]["question_answering"]["omission_qa_ratio(all)"] = _safe_ratio(omission_qa_num, qa_num) + eval_results["overall_score"]["question_answering"]["omission_qa_ratio(valid)"] = _safe_ratio(omission_qa_num, qa_valid_num) eval_results["overall_score"]["question_answering"]["qa_valid_num"] = qa_valid_num eval_results["overall_score"]["question_answering"]["qa_num"] = qa_num @@ -378,8 +384,8 @@ def aggregate_eval_results(eval_results): for key in eval_results["overall_score"]["memory_type_accuracy"]: - eval_results["overall_score"]["memory_type_accuracy"][key]['memory_integrity_acc'] = eval_results["overall_score"]["memory_type_accuracy"][key]['memory_integrity_acc'] / eval_results["overall_score"]["memory_type_accuracy"][key]['total_num'] - eval_results["overall_score"]["memory_type_accuracy"][key]['memory_update_acc'] = eval_results["overall_score"]["memory_type_accuracy"][key]['memory_update_acc'] / eval_results["overall_score"]["memory_type_accuracy"][key]['total_num'] + eval_results["overall_score"]["memory_type_accuracy"][key]['memory_integrity_acc'] = _safe_ratio(eval_results["overall_score"]["memory_type_accuracy"][key]['memory_integrity_acc'], eval_results["overall_score"]["memory_type_accuracy"][key]['total_num']) or 0 + eval_results["overall_score"]["memory_type_accuracy"][key]['memory_update_acc'] = _safe_ratio(eval_results["overall_score"]["memory_type_accuracy"][key]['memory_update_acc'], eval_results["overall_score"]["memory_type_accuracy"][key]['total_num']) or 0 eval_results["overall_score"]["memory_type_accuracy"][key]['memory_acc'] = eval_results["overall_score"]["memory_type_accuracy"][key]['memory_integrity_acc'] + eval_results["overall_score"]["memory_type_accuracy"][key]['memory_update_acc'] return eval_results @@ -510,7 +516,10 @@ def main( parser.add_argument( "--frame", type=str, - choices=["memzero", "memzero-graph", "zep", "memos", "memobase", "supermemory"], + choices=[ + "memzero", "memzero-graph", "zep", "memos", "memobase", "supermemory", + "arika", "arika-e2e", "arika-points", + ], ) parser.add_argument( "--version", From 07899c695f472f861e3da171cfdc8411cfbd21a8 Mon Sep 17 00:00:00 2001 From: johnz Date: Wed, 17 Jun 2026 00:17:20 +0800 Subject: [PATCH 4/6] eval: accept bare JSON from judge model in llm_request_for_json gpt-5.5 (and others) emit unfenced JSON; the strict ```json``` regex discarded valid judgments, marking every integrity/QA score invalid (recall=0, qa=0 artifacts). Fall back to whole-content then first {...}. Co-Authored-By: Claude Opus 4.8 --- eval/llms.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/eval/llms.py b/eval/llms.py index 56de13d..6b4d962 100644 --- a/eval/llms.py +++ b/eval/llms.py @@ -86,13 +86,24 @@ def llm_request_for_json(prompt): content = response_obj.choices[0].message.content or "" + # Preferred: a fenced ```json ... ``` block. match = re.search(r"```json\s*(\{.*?\})\s*```", content, re.DOTALL) - if not match: - raise ValueError(f"No JSON block found in model output: {content}") - - json_str = match.group(1).strip() - - return json.loads(json_str) + if match: + return json.loads(match.group(1).strip()) + + # Fallback: some models (e.g. gpt-5.5 via the proxy) emit bare JSON + # with no code fence. Try the whole content, then the first {...} span. + stripped = content.strip() + try: + return json.loads(stripped) + except json.JSONDecodeError: + pass + + span = re.search(r"\{.*\}", stripped, re.DOTALL) + if span: + return json.loads(span.group(0)) + + raise ValueError(f"No JSON block found in model output: {content}") if __name__ == '__main__': From 0cdb42c379bd8243bcfc77fca14935a6bfdc97f7 Mon Sep 17 00:00:00 2001 From: johnz Date: Fri, 19 Jun 2026 13:29:46 +0800 Subject: [PATCH 5/6] eval: support Arika-native dataset shape + configurable judge workers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - QA evidence can be a list of plain strings (Arika-native set) as well as the HaluMem-Medium list of {memory_content} dicts — handle both. - memory_type accuracy seeds unseen types on demand (Arika uses "Event Memory" / "Topic Memory"), instead of KeyError on a fixed type set. - max_workers reads MAX_WORKERS env so the LLM judge can be throttled to share a rate-limited proxy with a concurrent replay. Co-Authored-By: Claude Opus 4.8 --- eval/evaluation.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/eval/evaluation.py b/eval/evaluation.py index 1e6cda8..243b48e 100644 --- a/eval/evaluation.py +++ b/eval/evaluation.py @@ -185,9 +185,12 @@ def process_user(idx: int, user_data: dict, max_workers: int = 10): for qa in question_answering_inputs: future = executor.submit( evaluation_for_question, - qa["question"], - qa["answer"], - "\n".join([i["memory_content"] for i in qa["evidence"]]), + qa["question"], + qa["answer"], + # evidence is a list of {memory_content: ...} dicts in + # HaluMem-Medium, but a list of plain strings in the + # Arika-native set — handle both. + "\n".join([i["memory_content"] if isinstance(i, dict) else str(i) for i in qa["evidence"]]), qa["system_response"] ) futures[future] = qa @@ -367,20 +370,25 @@ def aggregate_eval_results(eval_results): eval_results["overall_score"]["question_answering"]["qa_valid_num"] = qa_valid_num eval_results["overall_score"]["question_answering"]["qa_num"] = qa_num - # Memory Type Accuracy + # Memory Type Accuracy. Memory-type names vary by dataset (HaluMem-Medium + # vs the Arika-native "Event Memory"/"Topic Memory"), so seed any unseen + # type on demand rather than assuming a fixed set. + mta = eval_results["overall_score"]["memory_type_accuracy"] + def _mta(t): + return mta.setdefault(t, {"memory_integrity_acc": 0, "memory_update_acc": 0, "total_num": 0}) for item in eval_results['memory_integrity_records']: if 'memory_integrity_score' not in item or 'importance' not in item: continue score = 1 if item['memory_integrity_score'] == 2 else 0 - eval_results["overall_score"]["memory_type_accuracy"][item['memory_type']]['memory_integrity_acc'] += score - eval_results["overall_score"]["memory_type_accuracy"][item['memory_type']]['total_num'] += 1 + _mta(item['memory_type'])['memory_integrity_acc'] += score + _mta(item['memory_type'])['total_num'] += 1 for item in eval_results["memory_update_records"]: if 'memory_update_type' not in item or 'importance' not in item: continue score = 1 if item['memory_update_type'] == "Correct" else 0 - eval_results["overall_score"]["memory_type_accuracy"][item['memory_type']]['memory_update_acc'] += score - eval_results["overall_score"]["memory_type_accuracy"][item['memory_type']]['total_num'] += 1 + _mta(item['memory_type'])['memory_update_acc'] += score + _mta(item['memory_type'])['total_num'] += 1 for key in eval_results["overall_score"]["memory_type_accuracy"]: @@ -403,7 +411,7 @@ def main( frame: str, version: str = "default", user_num: int = 20, - max_workers: int = 10 + max_workers: int = int(os.environ.get("MAX_WORKERS", "10")) ): dir_path = f"results/{frame}-{version}/" data_path = f"{dir_path}/{frame}_eval_results.jsonl" From 82d1321ae346a0db62ebce20652f354b9821150c Mon Sep 17 00:00:00 2001 From: johnz Date: Fri, 19 Jun 2026 13:29:46 +0800 Subject: [PATCH 6/6] eval: add lenient (core-fact) QA judge variant via JUDGE_MODE Diagnostic-only judge that keeps the official strict rubric by default but, under JUDGE_MODE=lenient, scores an answer Correct when its core facts match the reference even if it volunteers extra non-contradictory detail (only contradictions, or asserting a definite fact when the reference is unknown, count as Hallucination). Used to quantify how much strict "hallucination" is pure over-answering vs genuine error. Does not change the default scores. Co-Authored-By: Claude Opus 4.8 --- eval/eval_tools.py | 69 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 68 insertions(+), 1 deletion(-) diff --git a/eval/eval_tools.py b/eval/eval_tools.py index fd46292..0b6df0e 100644 --- a/eval/eval_tools.py +++ b/eval/eval_tools.py @@ -283,6 +283,66 @@ """ +# Lenient diagnostic variant (NOT the official rubric). Identical to the strict +# judge EXCEPT it does not penalize extra, non-contradictory detail: an answer +# whose core facts match the Reference Answer is Correct even if it volunteers +# additional related information, as long as nothing contradicts the Key Memory +# Points / Reference Answer. Used only to quantify how much "Hallucination" +# under the strict judge is pure over-answering vs genuine contradiction. +EVALUATION_PROMPT_FOR_QUESTION_LENIENT = """You are an **evaluation expert for AI memory system question answering**. +Based **only** on the provided **“Question”**, **“Reference Answer”**, and **“Key Memory Points”**, evaluate whether the **“Memory System Response”** gets the **core answer** right. Classify it as one of **“Correct”**, **“Hallucination”**, or **“Omission.”** Do **not** use any external knowledge. Output **strictly** in the specified JSON format. + +# Evaluation Criteria (LENIENT — reward correct core facts, do not punish verbosity) + +## 1. Correct +* The response conveys the **core/required facts** of the “Reference Answer” (semantically equivalent on the essential points). Synonyms, paraphrasing, summarization are fine. +* **Additional related details beyond the Key Memory Points are ALLOWED and must NOT be penalized**, as long as they do **not contradict** the “Reference Answer” or “Key Memory Points.” Extra context, elaboration, or volunteered facts that are merely unverified (but not contradicted) do **NOT** make the answer wrong. + +## 2. Hallucination +* The response **directly contradicts** the “Reference Answer” or “Key Memory Points” on a factual point (wrong value, wrong entity, wrong time, opposite conclusion). +* OR the “Reference Answer” is *unknown / cannot be determined*, yet the response asserts a **specific definite** answer. +* Extra non-contradictory information is **NOT** a hallucination by itself. + +## 3. Omission +* The response **misses a required core element** of the “Reference Answer,” OR explicitly says “don’t know / can’t remember / no related memory” even though the info exists in the Key Memory Points. +* For multi-element questions, judge only on the **core required elements**; extra elements never hurt. + +## Priority Rules +* Any direct **contradiction** (or asserting a definite fact when the reference is unknown) → **Hallucination**. +* Else, if a **core required element is missing** → **Omission**. +* Else (core facts correct, only extra non-contradictory detail added) → **Correct**. + +## Tolerance +* Numbers/times/units: equivalent expressions OK, but the **values must not differ**. +* Rely **only** on Reference Answer, Key Memory Points, and System Response. + +# Information for Evaluation + +* **Question:** + {question} + +* **Reference Answer:** + {reference_answer} + +* **Key Memory Points:** + {key_memory_points} + +* **Memory System Response:** + {response} + +# Output Requirements + +Output **strictly** the JSON below, no extra text. + +```json +{{ + "reasoning": "State the core required facts, whether the response got them right, and whether anything directly contradicts the Key Memory Points / Reference Answer (ignore merely-extra non-contradictory detail). Then give the classification basis.", + "evaluation_result": "Correct | Hallucination | Omission" +}} +``` +""" + + def evaluation_for_memory_integrity( extract_memories: str, target_memory: str @@ -362,7 +422,14 @@ def evaluation_for_question( key_memory_points: The memory points used to derive the reference answer. response: The answer produced by the memory system. """ - prompt = EVALUATION_PROMPT_FOR_QUESTION.format( + # JUDGE_MODE=lenient swaps in the diagnostic lenient rubric (does not + # penalize extra non-contradictory detail). Default keeps the official + # strict rubric so leaderboard scores stay comparable. + import os + template = (EVALUATION_PROMPT_FOR_QUESTION_LENIENT + if os.environ.get("JUDGE_MODE", "strict").lower() == "lenient" + else EVALUATION_PROMPT_FOR_QUESTION) + prompt = template.format( question=question, reference_answer=reference_answer, key_memory_points=key_memory_points,