Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -141,4 +141,6 @@ data/native_math_gpt-oss-120b-.jsonl
data/eli5.parquet

# eval results
eval/results/*
eval/results/*
eval/data/*
.venv-halumem-download/
62 changes: 62 additions & 0 deletions eval/eval_arika.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import argparse
import json
import os

from prompts import PROMPT_MEMZERO


def _iter_jsonl(path: str):
    """Lazily yield one decoded JSON value per non-blank line of *path*.

    The file is read as UTF-8. Each line is stripped of surrounding
    whitespace first; lines that are empty after stripping are skipped.
    """
    with open(path, encoding="utf-8") as handle:
        # filter(None, ...) drops the empty strings left by blank lines.
        for record in filter(None, map(str.strip, handle)):
            yield json.loads(record)


def _default_llm(prompt: str) -> str:
    """Forward *prompt* to ``llms.llm_request`` and return its result.

    The import happens at call time, so ``llms`` is only loaded when this
    function actually runs.
    """
    from llms import llm_request as send_prompt

    completion = send_prompt(prompt)
    return completion


def assemble_results(artifact_path: str, mode: str, llm=_default_llm):
    """Merge Arika outputs into each question record of a JSONL artifact.

    Every line of *artifact_path* is one user record holding a ``sessions``
    list. Within a session, ``arika_questions[i]`` is paired by position with
    ``questions[i]``: its fields are copied onto the question dict, and the
    ``arika_questions`` list is then removed from the session. The decoded
    records are modified in place.

    Args:
        artifact_path: Path of the JSONL artifact to read.
        mode: ``"e2e"`` copies the stored ``arika_answer`` into
            ``system_response``. Any other value formats ``PROMPT_MEMZERO``
            with the copied context and the question text and stores
            ``llm(prompt)`` instead.
        llm: Callable mapping a prompt string to a response; only invoked
            when *mode* is not ``"e2e"``.

    Returns:
        The list of user records, in file order.

    Raises:
        KeyError: If a user record has no ``sessions`` key, or (outside
            ``"e2e"`` mode) a question has no ``question`` key.
    """
    users = []
    for user in _iter_jsonl(artifact_path):
        for session in user["sessions"]:
            # ``or []`` also covers keys that are present but null.
            arika_qs = session.get("arika_questions") or []
            questions = session.get("questions") or []
            for index, qa in enumerate(questions):
                # A missing *or null* Arika entry means "no data". JSON null
                # decodes to None, which would otherwise crash on ``.get``.
                arika = (arika_qs[index] if index < len(arika_qs) else None) or {}
                qa["context"] = arika.get("context", "")
                # Timings are optional; copy them only when recorded. Note the
                # rename: ``answer_duration_ms`` -> ``response_duration_ms``.
                if "search_duration_ms" in arika:
                    qa["search_duration_ms"] = arika["search_duration_ms"]
                if "answer_duration_ms" in arika:
                    qa["response_duration_ms"] = arika["answer_duration_ms"]
                if mode == "e2e":
                    qa["system_response"] = arika.get("arika_answer", "")
                else:
                    prompt = PROMPT_MEMZERO.format(
                        context=qa["context"], question=qa["question"]
                    )
                    qa["system_response"] = llm(prompt)
            # The merged data now lives on the question dicts.
            session.pop("arika_questions", None)
        users.append(user)
    return users


def main(artifact_path: str, frame: str, version: str, mode: str):
    """Assemble results for *artifact_path* and write them out as JSONL.

    Output goes to ``results/<frame>-<version>/<frame>_eval_results.jsonl``
    (relative to the current working directory), one user record per line.
    The directory is created if needed and the written path is printed.
    """
    target_dir = f"results/{frame}-{version}/"
    os.makedirs(target_dir, exist_ok=True)
    out_path = os.path.join(target_dir, f"{frame}_eval_results.jsonl")
    # Assemble everything before opening the file, so a failure during
    # assembly does not leave a truncated output behind.
    records = assemble_results(artifact_path, mode=mode)
    with open(out_path, "w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in records
        )
    print(f"wrote {out_path}")


if __name__ == "__main__":
    # Command-line entry point: parse the options and hand them to main().
    cli = argparse.ArgumentParser()
    cli.add_argument("--artifact", required=True)
    cli.add_argument(
        "--frame",
        default="arika",
        choices=["arika", "arika-e2e", "arika-points"],
    )
    cli.add_argument("--version", default="smoke")
    cli.add_argument("--mode", default="retrieval", choices=["retrieval", "e2e"])
    opts = cli.parse_args()
    main(
        artifact_path=opts.artifact,
        frame=opts.frame,
        version=opts.version,
        mode=opts.mode,
    )
69 changes: 68 additions & 1 deletion eval/eval_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,66 @@
"""


# Lenient diagnostic variant (NOT the official rubric). Identical to the strict
# judge EXCEPT it does not penalize extra, non-contradictory detail: an answer
# whose core facts match the Reference Answer is Correct even if it volunteers
# additional related information, as long as nothing contradicts the Key Memory
# Points / Reference Answer. Used only to quantify how much "Hallucination"
# under the strict judge is pure over-answering vs genuine contradiction.
#
# This is a str.format template. It expects the named fields {question},
# {reference_answer}, {key_memory_points} and {response}. The doubled braces
# around the JSON example are format escapes that render as single literal
# braces, so any new literal brace added to the prompt must be doubled too.
EVALUATION_PROMPT_FOR_QUESTION_LENIENT = """You are an **evaluation expert for AI memory system question answering**.
Based **only** on the provided **“Question”**, **“Reference Answer”**, and **“Key Memory Points”**, evaluate whether the **“Memory System Response”** gets the **core answer** right. Classify it as one of **“Correct”**, **“Hallucination”**, or **“Omission.”** Do **not** use any external knowledge. Output **strictly** in the specified JSON format.

# Evaluation Criteria (LENIENT — reward correct core facts, do not punish verbosity)

## 1. Correct
* The response conveys the **core/required facts** of the “Reference Answer” (semantically equivalent on the essential points). Synonyms, paraphrasing, summarization are fine.
* **Additional related details beyond the Key Memory Points are ALLOWED and must NOT be penalized**, as long as they do **not contradict** the “Reference Answer” or “Key Memory Points.” Extra context, elaboration, or volunteered facts that are merely unverified (but not contradicted) do **NOT** make the answer wrong.

## 2. Hallucination
* The response **directly contradicts** the “Reference Answer” or “Key Memory Points” on a factual point (wrong value, wrong entity, wrong time, opposite conclusion).
* OR the “Reference Answer” is *unknown / cannot be determined*, yet the response asserts a **specific definite** answer.
* Extra non-contradictory information is **NOT** a hallucination by itself.

## 3. Omission
* The response **misses a required core element** of the “Reference Answer,” OR explicitly says “don’t know / can’t remember / no related memory” even though the info exists in the Key Memory Points.
* For multi-element questions, judge only on the **core required elements**; extra elements never hurt.

## Priority Rules
* Any direct **contradiction** (or asserting a definite fact when the reference is unknown) → **Hallucination**.
* Else, if a **core required element is missing** → **Omission**.
* Else (core facts correct, only extra non-contradictory detail added) → **Correct**.

## Tolerance
* Numbers/times/units: equivalent expressions OK, but the **values must not differ**.
* Rely **only** on Reference Answer, Key Memory Points, and System Response.

# Information for Evaluation

* **Question:**
{question}

* **Reference Answer:**
{reference_answer}

* **Key Memory Points:**
{key_memory_points}

* **Memory System Response:**
{response}

# Output Requirements

Output **strictly** the JSON below, no extra text.

```json
{{
"reasoning": "State the core required facts, whether the response got them right, and whether anything directly contradicts the Key Memory Points / Reference Answer (ignore merely-extra non-contradictory detail). Then give the classification basis.",
"evaluation_result": "Correct | Hallucination | Omission"
}}
```
"""


def evaluation_for_memory_integrity(
extract_memories: str,
target_memory: str
Expand Down Expand Up @@ -362,7 +422,14 @@ def evaluation_for_question(
key_memory_points: The memory points used to derive the reference answer.
response: The answer produced by the memory system.
"""
prompt = EVALUATION_PROMPT_FOR_QUESTION.format(
# JUDGE_MODE=lenient swaps in the diagnostic lenient rubric (does not
# penalize extra non-contradictory detail). Default keeps the official
# strict rubric so leaderboard scores stay comparable.
import os
template = (EVALUATION_PROMPT_FOR_QUESTION_LENIENT
if os.environ.get("JUDGE_MODE", "strict").lower() == "lenient"
else EVALUATION_PROMPT_FOR_QUESTION)
prompt = template.format(
question=question,
reference_answer=reference_answer,
key_memory_points=key_memory_points,
Expand Down
Loading