lay2dev · johnz1019 · Jun 16, 2026 · Jun 16, 2026 · Jun 16, 2026 · Jun 16, 2026
diff --git a/.gitignore b/.gitignore
@@ -141,4 +141,6 @@ data/native_math_gpt-oss-120b-.jsonl
 data/eli5.parquet
 
 # eval results
-eval/results/*
+eval/results/*
+eval/data/*
+.venv-halumem-download/
diff --git a/eval/eval_arika.py b/eval/eval_arika.py
@@ -0,0 +1,62 @@
+import argparse
+import json
+import os
+
+from prompts import PROMPT_MEMZERO
+
+
+def _iter_jsonl(path: str):
+    """Lazily yield the parsed JSON value of each non-blank line in the UTF-8 file *path*."""
+    with open(path, encoding="utf-8") as handle:
+        # Strip before testing so whitespace-only lines are skipped, not handed to the parser.
+        stripped = (raw.strip() for raw in handle)
+        yield from (json.loads(text) for text in stripped if text)
+
+
+def _default_llm(prompt: str) -> str:
+    """Default ``llm`` callable: lazily import ``llms.llm_request`` and call it with *prompt*."""
+    from llms import llm_request as send_prompt
+    return send_prompt(prompt)
+
+
+def assemble_results(artifact_path: str, mode: str, llm=_default_llm):
+    """Set ``context``, timings and ``system_response`` on every QA of every user in the JSONL
+    artifact and return the users; ``null`` or missing artifact fields are treated as empty."""
+    users = list(_iter_jsonl(artifact_path))  # parse everything first: fail before any LLM call
+    for user in users:
+        for session in user.get("sessions") or []:
+            arika_qs = session.pop("arika_questions", None) or []  # stripped from the output
+            for index, qa in enumerate(session.get("questions") or []):
+                arika = (arika_qs[index] if index < len(arika_qs) else None) or {}
+                qa["context"] = arika.get("context") or ""
+                if "search_duration_ms" in arika:
+                    qa["search_duration_ms"] = arika["search_duration_ms"]
+                if "answer_duration_ms" in arika:
+                    qa["response_duration_ms"] = arika["answer_duration_ms"]
+                if mode == "e2e":  # reuse the answer already stored in the artifact
+                    qa["system_response"] = arika.get("arika_answer") or ""
+                else:  # any other mode: answer from the retrieved context via ``llm``
+                    prompt = PROMPT_MEMZERO.format(context=qa["context"], question=qa["question"])
+                    qa["system_response"] = llm(prompt)
+    return users
+
+
+def main(artifact_path: str, frame: str, version: str, mode: str):
+    """Assemble results for *artifact_path* and write them as JSONL under ``results/``."""
+    save_dir = f"results/{frame}-{version}/"
+    os.makedirs(save_dir, exist_ok=True)
+    out = os.path.join(save_dir, f"{frame}_eval_results.jsonl")
+    records = assemble_results(artifact_path, mode=mode)
+    with open(out, "w", encoding="utf-8") as sink:
+        sink.writelines(json.dumps(rec, ensure_ascii=False) + "\n" for rec in records)
+    print(f"wrote {out}")
+
+
+if __name__ == "__main__":
+    cli = argparse.ArgumentParser()
+    cli.add_argument("--artifact", required=True)  # path of the JSONL artifact to read
+    cli.add_argument("--frame", default="arika", choices=["arika", "arika-e2e", "arika-points"])
+    cli.add_argument("--version", default="smoke")  # suffix of the results/<frame>-<version>/ dir
+    cli.add_argument("--mode", default="retrieval", choices=["retrieval", "e2e"])
+    opts = cli.parse_args()
+    main(artifact_path=opts.artifact, frame=opts.frame, version=opts.version, mode=opts.mode)
diff --git a/eval/eval_tools.py b/eval/eval_tools.py
@@ -283,6 +283,66 @@
 """
 
 
+# Lenient diagnostic variant (NOT the official rubric). Identical to the strict
+# judge EXCEPT it does not penalize extra, non-contradictory detail: core facts that
+# match the Reference Answer are Correct even with volunteered related information.
+# Used only to quantify how much strict-judge "Hallucination" is pure over-answering
+# vs genuine contradiction; evaluation_for_question() selects it when JUDGE_MODE=lenient.
+# Filled via str.format(): the doubled braces in the JSON example are literal-brace escapes.
+EVALUATION_PROMPT_FOR_QUESTION_LENIENT = """You are an **evaluation expert for AI memory system question answering**.
+Based **only** on the provided **“Question”**, **“Reference Answer”**, and **“Key Memory Points”**, evaluate whether the **“Memory System Response”** gets the **core answer** right. Classify it as one of **“Correct”**, **“Hallucination”**, or **“Omission.”** Do **not** use any external knowledge. Output **strictly** in the specified JSON format.
+
+# Evaluation Criteria (LENIENT — reward correct core facts, do not punish verbosity)
+
+## 1. Correct
+* The response conveys the **core/required facts** of the “Reference Answer” (semantically equivalent on the essential points). Synonyms, paraphrasing, summarization are fine.
+* **Additional related details beyond the Key Memory Points are ALLOWED and must NOT be penalized**, as long as they do **not contradict** the “Reference Answer” or “Key Memory Points.” Extra context, elaboration, or volunteered facts that are merely unverified (but not contradicted) do **NOT** make the answer wrong.
+
+## 2. Hallucination
+* The response **directly contradicts** the “Reference Answer” or “Key Memory Points” on a factual point (wrong value, wrong entity, wrong time, opposite conclusion).
+* OR the “Reference Answer” is *unknown / cannot be determined*, yet the response asserts a **specific definite** answer.
+* Extra non-contradictory information is **NOT** a hallucination by itself.
+
+## 3. Omission
+* The response **misses a required core element** of the “Reference Answer,” OR explicitly says “don’t know / can’t remember / no related memory” even though the info exists in the Key Memory Points.
+* For multi-element questions, judge only on the **core required elements**; extra elements never hurt.
+
+## Priority Rules
+* Any direct **contradiction** (or asserting a definite fact when the reference is unknown) → **Hallucination**.
+* Else, if a **core required element is missing** → **Omission**.
+* Else (core facts correct, only extra non-contradictory detail added) → **Correct**.
+
+## Tolerance
+* Numbers/times/units: equivalent expressions OK, but the **values must not differ**.
+* Rely **only** on Reference Answer, Key Memory Points, and System Response.
+
+# Information for Evaluation
+
+* **Question:**
+  {question}
+
+* **Reference Answer:**
+  {reference_answer}
+
+* **Key Memory Points:**
+  {key_memory_points}
+
+* **Memory System Response:**
+  {response}
+
+# Output Requirements
+
+Output **strictly** the JSON below, no extra text.
+
+```json
+{{
+  "reasoning": "State the core required facts, whether the response got them right, and whether anything directly contradicts the Key Memory Points / Reference Answer (ignore merely-extra non-contradictory detail). Then give the classification basis.",
+  "evaluation_result": "Correct | Hallucination | Omission"
+}}
+```
+"""
+
+
 def evaluation_for_memory_integrity(
     extract_memories: str,
     target_memory: str
@@ -362,7 +422,14 @@ def evaluation_for_question(
     key_memory_points: The memory points used to derive the reference answer.
     response: The answer produced by the memory system.
     """
-    prompt = EVALUATION_PROMPT_FOR_QUESTION.format(
+    # JUDGE_MODE=lenient swaps in the diagnostic lenient rubric (does not
+    # penalize extra non-contradictory detail). Default keeps the official
+    # strict rubric so leaderboard scores stay comparable.
+    import os
+    template = (EVALUATION_PROMPT_FOR_QUESTION_LENIENT
+                if os.environ.get("JUDGE_MODE", "strict").lower() == "lenient"
+                else EVALUATION_PROMPT_FOR_QUESTION)
+    prompt = template.format(
         question=question,
         reference_answer=reference_answer,
         key_memory_points=key_memory_points,