Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 13 additions & 3 deletions scripts/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,19 @@
MAX_MODEL_LENGTH = 4096
MAX_NEW_TOKENS = None
MAX_SAMPLES = None
SKIP_SPECIAL_TOKENS = True # vllm default
SPACES_BETWEEN_SPECIAL_TOKENS = True # vllm default

# OVERRIDE_CHAT_TEMPLATE = True # False for base, don't forget to change for instruction-tuned models!
# SEED = 1234
# TEMPERATURE = 0.6
# TOP_P = 0.95
# MAX_MODEL_LENGTH = 32768
# MAX_NEW_TOKENS = 1024
# MAX_MODEL_LENGTH = 65536 # 65536 // 32768
# MAX_NEW_TOKENS = 32768 # 32768 // 16384
# MAX_SAMPLES = None
# SKIP_SPECIAL_TOKENS = False
# SPACES_BETWEEN_SPECIAL_TOKENS = False


######## MODEL NAMES ########

Expand Down Expand Up @@ -144,6 +151,8 @@ def eval_one(model_name: str, tasks: str):
temperature=TEMPERATURE,
top_p=TOP_P,
max_new_tokens=MAX_NEW_TOKENS,
skip_special_tokens=SKIP_SPECIAL_TOKENS,
spaces_between_special_tokens=SPACES_BETWEEN_SPECIAL_TOKENS,
),
)

Expand Down Expand Up @@ -179,7 +188,8 @@ def eval_one(model_name: str, tasks: str):
print(f"World size: {world}")
print(f"{'='*100}\n")

out_dir = Path("results") / _safe_name(model_name) / task / f'{SUBDIR_PREFIX}{_get_git_commit_short()}'
task_dir = RESULTS_DIR_NAME if RESULTS_DIR_NAME else _safe_name(task)
out_dir = Path("results") / _safe_name(model_name) / task_dir / f'{SUBDIR_PREFIX}{_get_git_commit_short()}'
out_dir.mkdir(parents=True, exist_ok=True)
eval_tracker = EvaluationTracker(
output_dir=str(out_dir),
Expand Down
2 changes: 2 additions & 0 deletions src/lighteval/models/model_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ class GenerationParameters(BaseModel, extra="forbid"):
min_p: NonNegativeFloat | None = None # vllm, transformers, sglang
top_p: NonNegativeFloat | None = None # vllm, transformers, tgi, litellm, sglang
truncate_prompt: bool | None = None # vllm, tgi
skip_special_tokens: bool | None = None # vllm
spaces_between_special_tokens: bool | None = None # vllm

cache_implementation: str | None = None # transformers

Expand Down
2 changes: 2 additions & 0 deletions src/lighteval/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,10 +351,12 @@ def _post_process_outputs(self, sampling_method_responses: dict[str, list[ModelR
if self.pipeline_parameters.remove_reasoning_tags:
for _, responses in sampling_method_responses.items():
for response in responses:
context = response.input if isinstance(response.input, str) else None
response.text_post_processed = [
remove_reasoning_tags(
text=text,
tag_pairs=self.pipeline_parameters.reasoning_tags,
context=context,
)
for text in response.text
]
Expand Down
167 changes: 167 additions & 0 deletions src/lighteval/tasks/multilingual/tasks/fusion_aya_math.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
"""
name:
Fusion Aya Math Bench

dataset:
tiny-aya-math-edition/fusion-aya-math-bench

abstract:
Fusion Aya Math Bench is a multilingual, olympiad-level mathematical reasoning
dataset. Each problem is paired with a high-quality chain-of-thought solution
fused from the reasoning traces of different frontier models, derived from the
open-ended English subset of OlympiadBench.

Uses CoT-style prompts matching the mgsm.py setup ("Think step by step /
Problem: / Solution:"), with language-specific translations.

Reports expr_gold_metric (math expression / LaTeX \\boxed{} parser).

languages:
english, german, french, ukrainian

tags:
math, multilingual, reasoning, olympiad, chain-of-thought

paper:
https://arxiv.org/abs/2510.00931
"""

from langcodes import standardize_tag

from lighteval.metrics.metrics import Metrics
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc
from lighteval.utils.language import Language


# Languages this module builds a Fusion Aya Math task for.
_LANGUAGES = [Language.ENGLISH, Language.GERMAN, Language.FRENCH, Language.UKRAINIAN]


# Localized prompt building blocks, keyed by language.
#
# Every value is a 3-tuple: (instruction, problem_label, solution_label).
# In each language the instruction translates to:
#   "Solve the following math problem. Think step by step before giving
#    the final answer."
_FUSION_AYA_PROMPT_PARTS: dict[Language, tuple[str, str, str]] = {
    Language.ENGLISH: (
        "Solve the following math problem. Think step by step before giving the final answer.",
        "Problem:",
        "Solution:",
    ),
    Language.GERMAN: (
        "Löse die folgende Mathematikaufgabe. Denke Schritt für Schritt nach, bevor du die endgültige Antwort gibst.",
        "Aufgabe:",
        "Lösung:",
    ),
    Language.FRENCH: (
        "Résous le problème de mathématiques suivant. Réfléchis étape par étape avant de donner la réponse finale.",
        "Problème:",
        "Solution:",
    ),
    Language.UKRAINIAN: (
        "Розв'яжи наступну математичну задачу. Міркуй покроково, перш ніж дати остаточну відповідь.",
        "Задача:",
        "Розв'язання:",
    ),
}


# Additional stop strings per language, on top of the localized problem label.
# Each alias is that language's problem label with the trailing colon removed.
# Keep the lists short: every extra stop string raises the chance of cutting a
# generation off by accident.
# NOTE(review): these are bare words. If the backend matches stop strings as
# substrings, any generated occurrence of the word ends the answer - confirm
# that this is intended.
_FUSION_AYA_PROBLEM_LABEL_STOP_ALIASES: dict[Language, list[str]] = {
    Language.ENGLISH: ["Problem"],
    Language.GERMAN: ["Aufgabe"],
    Language.FRENCH: ["Problème"],
    Language.UKRAINIAN: ["Задача"],
}


def validate_fusion_aya_prompt_parts() -> None:
    """Check that every entry of ``_LANGUAGES`` has localized prompt parts.

    Raises:
        ValueError: If one or more languages have no entry in
            ``_FUSION_AYA_PROMPT_PARTS``; the message lists their values.
    """
    absent = []
    for lang in _LANGUAGES:
        if lang not in _FUSION_AYA_PROMPT_PARTS:
            absent.append(lang.value)

    if not absent:
        return

    missing = ", ".join(absent)
    raise ValueError(f"Missing Fusion Aya prompt parts for: {missing}")


def fusion_aya_cot_template(language: Language) -> str:
    """Build the chain-of-thought prompt template for ``language``.

    The result contains a literal ``{prompt}`` placeholder, to be filled with
    ``str.format(prompt=...)``. Layout: instruction, blank line, problem
    label, placeholder, blank line, solution label.

    Raises:
        ValueError: If ``language`` has no entry in
            ``_FUSION_AYA_PROMPT_PARTS``.
    """
    try:
        parts = _FUSION_AYA_PROMPT_PARTS[language]
    except KeyError as exc:
        raise ValueError(
            f"Missing Fusion Aya prompt parts for language: {language.value}"
        ) from exc

    instruction, problem_label, solution_label = parts
    # Empty strings become the blank lines once joined with newlines.
    pieces = [instruction, "", problem_label, "{prompt}", "", solution_label]
    return "\n".join(pieces)


def fusion_aya_stop_sequences(language: Language) -> list[str]:
    """Collect the stop strings used when generating for ``language``.

    The intent is to halt generation once the model looks like it is starting
    a new problem. The list holds, in order and without duplicates:

    1. the English label ``"Problem:"`` (models sometimes fall back to English
       even when the prompt is localized),
    2. the problem label of ``language``,
    3. any aliases from ``_FUSION_AYA_PROBLEM_LABEL_STOP_ALIASES``.

    Raises:
        ValueError: If ``language`` has no entry in
            ``_FUSION_AYA_PROMPT_PARTS``.
    """
    try:
        _, problem_label, _ = _FUSION_AYA_PROMPT_PARTS[language]
    except KeyError as exc:
        raise ValueError(
            f"Missing Fusion Aya prompt parts for language: {language.value}"
        ) from exc

    candidates = ["Problem:", problem_label]
    candidates.extend(_FUSION_AYA_PROBLEM_LABEL_STOP_ALIASES.get(language, []))

    # First occurrence wins, so the original ordering is kept.
    unique_stops = []
    for stop in candidates:
        if stop not in unique_stops:
            unique_stops.append(stop)
    return unique_stops


def fusion_aya_cot_prompt(language: Language):
    """Return a prompt function that turns a dataset row into a ``Doc``.

    The localized template is resolved once, here, so an unsupported language
    fails when the prompt function is created rather than on the first row.

    The returned function reads ``line["question"]`` for the query and
    ``line["answer"]`` as the single gold choice.
    """
    template = fusion_aya_cot_template(language)

    def prompt_fn(line, task_name: str = None):
        # str.format does not re-scan substituted values, so braces inside
        # the question text (e.g. LaTeX) are inserted verbatim.
        query = template.format(prompt=line["question"])
        gold_answer = line["answer"]
        return Doc(
            task_name=task_name,
            query=query,
            choices=[gold_answer],
            gold_index=0,
        )

    return prompt_fn


# Run the consistency check at import time, before any task config is built.
validate_fusion_aya_prompt_parts()


# One generative task per language. The dataset subset is the standardized
# language tag, and the only split evaluated is "benchmark" (no few-shot split).
TASKS_TABLE = [
    LightevalTaskConfig(
        name=f"fusion_aya_math:{lang.value}:gen",
        prompt_function=fusion_aya_cot_prompt(lang),
        hf_repo="tiny-aya-math-edition/fusion-aya-math-bench",
        hf_subset=standardize_tag(lang.value),
        evaluation_splits=("benchmark",),
        few_shots_split=None,
        generation_size=4096,
        metrics=[Metrics.expr_gold_metric],
        stop_sequence=fusion_aya_stop_sequences(lang),
    )
    for lang in _LANGUAGES
]
93 changes: 39 additions & 54 deletions src/lighteval/tasks/multilingual/tasks/mlmm_arc_challenge.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
Mlmm Arc Challenge

dataset:
jon-tow/okapi_arc_challenge
alexandrainst/m_arc

abstract:
ARC (AI2 Reasoning Challenge) is a dataset for question answering that requires
Expand Down Expand Up @@ -52,76 +52,62 @@
LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
]

# Languages evaluated by this task, in alphabetical order. English is left
# out on purpose (kept as a commented entry).
# Each language must appear exactly once: one task config is generated per
# entry, so a repeated entry would register the same task name twice.
_LANGUAGES = [
    Language.ARABIC,
    Language.ARMENIAN,
    Language.BASQUE,
    Language.BENGALI,
    Language.CATALAN,
    Language.CHINESE,
    Language.CROATIAN,
    Language.DANISH,
    Language.DUTCH,
    # Language.ENGLISH,
    Language.FRENCH,
    Language.GERMAN,
    Language.GUJARATI,
    Language.HINDI,
    Language.HUNGARIAN,
    Language.ICELANDIC,
    Language.INDONESIAN,
    Language.ITALIAN,
    Language.KANNADA,
    Language.MALAYALAM,
    Language.MARATHI,
    Language.NEPALI,
    Language.NORWEGIAN,
    Language.PORTUGUESE,
    Language.ROMANIAN,
    Language.RUSSIAN,
    Language.SERBIAN,
    Language.SLOVAK,
    Language.SPANISH,
    Language.SWEDISH,
    Language.TAMIL,
    Language.TELUGU,
    Language.UKRAINIAN,
    Language.VIETNAMESE,
]


def _arc_adapter(line):
if "question" in line and "choices" in line:
choices = line["choices"]["text"]
answer_key = line["answerKey"]
else:
choices = [
line[key]
for key in ("option_a", "option_b", "option_c", "option_d", "option_e")
if line.get(key)
]
answer_key = line["answer"]
return {
"question": line["instruction"],
"choices": choices,
"gold_idx": int(answer_key) - 1
if answer_key.isdigit()
else ascii_uppercase.index(answer_key),
}

def _m_arc_adapter(line):
raw_choices = [line.get(f"option_{letter}") for letter in "abcde"]
choices = [c for c in raw_choices if c is not None]
return {
"question": line["question"],
"question": line["instruction"],
"choices": choices,
"gold_idx": int(answer_key) - 1
if answer_key.isdigit()
else ascii_uppercase.index(answer_key),
"gold_idx": ascii_uppercase.index(line["answer"].strip().upper()),
}


TASKS_TABLE = [
LightevalTaskConfig(
name=f"mlmm_arc:{language.value}:{suffix}",
name=f"mlmm_arc_challenge:{language.value}:{suffix}",
prompt_function=get_mcq_prompt_function(
language,
_arc_adapter,
_m_arc_adapter,
formulation=formulation,
),
hf_repo="jon-tow/okapi_arc_challenge",
hf_repo="alexandrainst/m_arc",
hf_subset=standardize_tag(language.value),
hf_revision="823d5d7bfaf8974a3ab52a825b6cf4903b35dbc4",
evaluation_splits=("test",),
few_shots_split="train",
metrics=metrics,
Expand All @@ -136,15 +122,14 @@ def _arc_adapter(line):
# Greedy variant: MCF-style prompt, generate 1 token, exact match
TASKS_TABLE += [
LightevalTaskConfig(
name=f"mlmm_arc:{language.value}:mcf_em",
name=f"mlmm_arc_challenge:{language.value}:mcf_em",
prompt_function=get_mcq_prompt_function(
language,
_arc_adapter,
_m_arc_adapter,
formulation=MCFFormulation(),
),
hf_repo="jon-tow/okapi_arc_challenge",
hf_repo="alexandrainst/m_arc",
hf_subset=standardize_tag(language.value),
hf_revision="823d5d7bfaf8974a3ab52a825b6cf4903b35dbc4",
evaluation_splits=("test",),
few_shots_split="train",
generation_size=1,
Expand Down
2 changes: 1 addition & 1 deletion src/lighteval/tasks/multilingual/tasks/xcopa.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def _adapter(line):


def _hf_repo(language: Language) -> str:
    """Return the Hugging Face dataset repository id to load for ``language``.

    Arabic is read from the AlGhafa translated benchmark; every other
    language is read from the namespaced ``cambridgeltl/xcopa`` repository.
    """
    if language == Language.ARABIC:
        return "OALL/AlGhafa-Arabic-LLM-Benchmark-Translated"
    return "cambridgeltl/xcopa"


# Dataset subset codes that differ from standardize_tag(language.value).
Expand Down
4 changes: 2 additions & 2 deletions src/lighteval/tasks/multilingual/tasks/xquad.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def fn(line, task_name: str = None):
evaluation_splits=("validation",),
few_shots_split="validation",
generation_size=400,
stop_sequence=("\n",),
stop_sequence=["\n",],
metrics=(
MultilingualQuasiExactMatchMetric(language, "prefix"),
MultilingualQuasiF1ScoreMetric(language),
Expand All @@ -102,7 +102,7 @@ def fn(line, task_name: str = None):
evaluation_splits=("validation",),
few_shots_split="validation",
generation_size=-1,
stop_sequence=("\n",),
stop_sequence=["\n",],
metrics=[Metrics.target_bits_per_byte],
)
for language in _LANGUAGES
Expand Down
Loading