diff --git a/scripts/eval.py b/scripts/eval.py index 4fb0a353a..6b77eb37a 100644 --- a/scripts/eval.py +++ b/scripts/eval.py @@ -39,12 +39,19 @@ MAX_MODEL_LENGTH = 4096 MAX_NEW_TOKENS = None MAX_SAMPLES = None +SKIP_SPECIAL_TOKENS = True # vllm default +SPACES_BETWEEN_SPECIAL_TOKENS = True # vllm default # OVERRIDE_CHAT_TEMPLATE = True # False for base, don't forget to change for instruction-tuned models! +# SEED = 1234 # TEMPERATURE = 0.6 # TOP_P = 0.95 -# MAX_MODEL_LENGTH = 32768 -# MAX_NEW_TOKENS = 1024 +# MAX_MODEL_LENGTH = 65536 # 65536 // 32768 +# MAX_NEW_TOKENS = 32768 # 32768 // 16384 +# MAX_SAMPLES = None +# SKIP_SPECIAL_TOKENS = False +# SPACES_BETWEEN_SPECIAL_TOKENS = False + ######## MODEL NAMES ######## @@ -144,6 +151,8 @@ def eval_one(model_name: str, tasks: str): temperature=TEMPERATURE, top_p=TOP_P, max_new_tokens=MAX_NEW_TOKENS, + skip_special_tokens=SKIP_SPECIAL_TOKENS, + spaces_between_special_tokens=SPACES_BETWEEN_SPECIAL_TOKENS, ), ) @@ -179,7 +188,8 @@ def eval_one(model_name: str, tasks: str): print(f"World size: {world}") print(f"{'='*100}\n") - out_dir = Path("results") / _safe_name(model_name) / task / f'{SUBDIR_PREFIX}{_get_git_commit_short()}' + task_dir = RESULTS_DIR_NAME if RESULTS_DIR_NAME else _safe_name(task) + out_dir = Path("results") / _safe_name(model_name) / task_dir / f'{SUBDIR_PREFIX}{_get_git_commit_short()}' out_dir.mkdir(parents=True, exist_ok=True) eval_tracker = EvaluationTracker( output_dir=str(out_dir), diff --git a/src/lighteval/models/model_input.py b/src/lighteval/models/model_input.py index ad41c23eb..a3aa27724 100644 --- a/src/lighteval/models/model_input.py +++ b/src/lighteval/models/model_input.py @@ -46,6 +46,8 @@ class GenerationParameters(BaseModel, extra="forbid"): min_p: NonNegativeFloat | None = None # vllm, transformers, sglang top_p: NonNegativeFloat | None = None # vllm, transformers, tgi, litellm, sglang truncate_prompt: bool | None = None # vllm, tgi + skip_special_tokens: bool | None = None # vllm + 
spaces_between_special_tokens: bool | None = None # vllm cache_implementation: str | None = None # transformers diff --git a/src/lighteval/pipeline.py b/src/lighteval/pipeline.py index 1f5da9c14..75014f459 100644 --- a/src/lighteval/pipeline.py +++ b/src/lighteval/pipeline.py @@ -351,10 +351,12 @@ def _post_process_outputs(self, sampling_method_responses: dict[str, list[ModelR if self.pipeline_parameters.remove_reasoning_tags: for _, responses in sampling_method_responses.items(): for response in responses: + context = response.input if isinstance(response.input, str) else None response.text_post_processed = [ remove_reasoning_tags( text=text, tag_pairs=self.pipeline_parameters.reasoning_tags, + context=context, ) for text in response.text ] diff --git a/src/lighteval/tasks/multilingual/tasks/fusion_aya_math.py b/src/lighteval/tasks/multilingual/tasks/fusion_aya_math.py new file mode 100644 index 000000000..0d88e439f --- /dev/null +++ b/src/lighteval/tasks/multilingual/tasks/fusion_aya_math.py @@ -0,0 +1,167 @@ +""" +name: +Fusion Aya Math Bench + +dataset: +tiny-aya-math-edition/fusion-aya-math-bench + +abstract: +Fusion Aya Math Bench is a multilingual, olympiad-level mathematical reasoning +dataset. Each problem is paired with a high-quality chain-of-thought solution +fused from the reasoning traces of different frontier models, derived from the +open-ended English subset of OlympiadBench. + +Uses CoT-style prompts matching the mgsm.py setup ("Think step by step / +Problem: / Solution:"), with language-specific translations. + +Reports expr_gold_metric (math expression / LaTeX \\boxed{} parser). 
+ +languages: +english, german, french, ukrainian + +tags: +math, multilingual, reasoning, olympiad, chain-of-thought + +paper: +https://arxiv.org/abs/2510.00931 +""" + +from langcodes import standardize_tag + +from lighteval.metrics.metrics import Metrics +from lighteval.tasks.lighteval_task import LightevalTaskConfig +from lighteval.tasks.requests import Doc +from lighteval.utils.language import Language + + +_LANGUAGES = [ + Language.ENGLISH, + Language.GERMAN, + Language.FRENCH, + Language.UKRAINIAN, +] + + +# Each entry is: +# instruction, problem_label, solution_label +# +# The instruction means: +# "Solve the following math problem. Think step by step before giving +# the final answer." +_FUSION_AYA_PROMPT_PARTS: dict[Language, tuple[str, str, str]] = { + Language.ENGLISH: ( + "Solve the following math problem. Think step by step before giving the final answer.", + "Problem:", + "Solution:", + ), + Language.GERMAN: ( + "Löse die folgende Mathematikaufgabe. Denke Schritt für Schritt nach, bevor du die endgültige Antwort gibst.", + "Aufgabe:", + "Lösung:", + ), + Language.FRENCH: ( + "Résous le problème de mathématiques suivant. Réfléchis étape par étape avant de donner la réponse finale.", + "Problème:", + "Solution:", + ), + Language.UKRAINIAN: ( + "Розв'яжи наступну математичну задачу. Міркуй покроково, перш ніж дати остаточну відповідь.", + "Задача:", + "Розв'язання:", + ), +} + + +# Extra problem-label variants that models may generate. +# Keep this small: too many stop strings increase accidental truncation risk. 
# Colon-less spellings of each language's problem label, used as additional
# stop strings.
# NOTE(review): each alias is a prefix of the matching "<label>:" stop, so
# wherever the colon form matches the bare word matches too. Presumably this
# halts generation on any capitalised mention of the word -- confirm against
# the backend's stop-string handling before relying on it.
_FUSION_AYA_PROBLEM_LABEL_STOP_ALIASES: dict[Language, list[str]] = {
    Language.ENGLISH: ["Problem"],  # label without the trailing colon
    Language.GERMAN: ["Aufgabe"],  # label without the trailing colon
    Language.FRENCH: ["Problème"],  # label without the trailing colon
    Language.UKRAINIAN: ["Задача"],  # label without the trailing colon
}


def _fusion_aya_prompt_parts(language: Language) -> tuple[str, str, str]:
    """Look up (instruction, problem_label, solution_label) for a language.

    Raises:
        ValueError: If the language has no entry in
            ``_FUSION_AYA_PROMPT_PARTS`` (chained from the ``KeyError``).
    """
    try:
        return _FUSION_AYA_PROMPT_PARTS[language]
    except KeyError as exc:
        raise ValueError(
            f"Missing Fusion Aya prompt parts for language: {language.value}"
        ) from exc


def validate_fusion_aya_prompt_parts() -> None:
    """Check that every language in ``_LANGUAGES`` has localized prompt parts.

    Raises:
        ValueError: Listing the ``.value`` of each language without an entry
            in ``_FUSION_AYA_PROMPT_PARTS``.
    """
    absent_names = [
        language.value
        for language in _LANGUAGES
        if language not in _FUSION_AYA_PROMPT_PARTS
    ]
    if not absent_names:
        return

    raise ValueError(
        "Missing Fusion Aya prompt parts for: " + ", ".join(absent_names)
    )


def fusion_aya_cot_template(language: Language) -> str:
    """Build the localized chain-of-thought template for a language.

    The result keeps a literal ``{prompt}`` placeholder that is filled in
    later with ``str.format``.

    Raises:
        ValueError: If the language has no localized prompt parts.
    """
    instruction, problem_label, solution_label = _fusion_aya_prompt_parts(language)
    # Layout: instruction, blank line, problem label, the question itself,
    # blank line, solution label.
    return (
        instruction
        + "\n\n"
        + problem_label
        + "\n{prompt}\n\n"
        + solution_label
    )


def fusion_aya_stop_sequences(language: Language) -> list[str]:
    """Collect the stop strings for a language, without duplicates.

    The list is, in order: the English ``"Problem:"`` label, the language's
    own problem label, and any aliases registered for the language in
    ``_FUSION_AYA_PROBLEM_LABEL_STOP_ALIASES``.

    Raises:
        ValueError: If the language has no localized prompt parts.
    """
    problem_label = _fusion_aya_prompt_parts(language)[1]
    aliases = _FUSION_AYA_PROBLEM_LABEL_STOP_ALIASES.get(language, [])

    unique_stops: list[str] = []
    for candidate in ("Problem:", problem_label, *aliases):
        # First occurrence wins, so the original ordering is kept.
        if candidate not in unique_stops:
            unique_stops.append(candidate)
    return unique_stops


def fusion_aya_cot_prompt(language: Language):
    """Create the prompt function for Fusion Aya Math in one language.

    The returned callable takes a dataset row and a task name. It builds a
    ``Doc`` whose query is the localized template filled with the row's
    ``question``, and whose only choice (gold index 0) is the row's
    ``answer``.
    """
    cot_template = fusion_aya_cot_template(language)

    def prompt_fn(line, task_name: str = None):
        query = cot_template.format(prompt=line["question"])
        gold_answers = [line["answer"]]
        return Doc(
            task_name=task_name,
            query=query,
            choices=gold_answers,
            gold_index=0,
        )

    return prompt_fn


# Fail at import time instead of part-way through building the task table.
validate_fusion_aya_prompt_parts()


def _fusion_aya_task_config(language: Language) -> LightevalTaskConfig:
    """Build the generative Fusion Aya Math task config for one language."""
    language_code = language.value
    return LightevalTaskConfig(
        name=f"fusion_aya_math:{language_code}:gen",
        prompt_function=fusion_aya_cot_prompt(language),
        hf_repo="tiny-aya-math-edition/fusion-aya-math-bench",
        hf_subset=standardize_tag(language_code),
        evaluation_splits=("benchmark",),
        few_shots_split=None,
        generation_size=4096,
        metrics=[
            Metrics.expr_gold_metric,
        ],
        stop_sequence=fusion_aya_stop_sequences(language),
    )


TASKS_TABLE = [_fusion_aya_task_config(language) for language in _LANGUAGES]
Language.CHINESE, Language.CROATIAN, + Language.DANISH, + Language.DUTCH, + # Language.ENGLISH, + Language.FRENCH, + Language.GERMAN, + Language.GUJARATI, Language.HINDI, - Language.BENGALI, - Language.TAMIL, - Language.NEPALI, + Language.HUNGARIAN, + Language.ICELANDIC, + Language.INDONESIAN, + Language.ITALIAN, + Language.KANNADA, Language.MALAYALAM, Language.MARATHI, + Language.NEPALI, + Language.NORWEGIAN, + Language.PORTUGUESE, + Language.ROMANIAN, + Language.RUSSIAN, + Language.SERBIAN, + Language.SLOVAK, + Language.SPANISH, + Language.SWEDISH, + Language.TAMIL, Language.TELUGU, - Language.KANNADA, + Language.UKRAINIAN, + Language.VIETNAMESE, ] - -def _arc_adapter(line): - if "question" in line and "choices" in line: - choices = line["choices"]["text"] - answer_key = line["answerKey"] - else: - choices = [ - line[key] - for key in ("option_a", "option_b", "option_c", "option_d", "option_e") - if line.get(key) - ] - answer_key = line["answer"] - return { - "question": line["instruction"], - "choices": choices, - "gold_idx": int(answer_key) - 1 - if answer_key.isdigit() - else ascii_uppercase.index(answer_key), - } - +def _m_arc_adapter(line): + raw_choices = [line.get(f"option_{letter}") for letter in "abcde"] + choices = [c for c in raw_choices if c is not None] return { - "question": line["question"], + "question": line["instruction"], "choices": choices, - "gold_idx": int(answer_key) - 1 - if answer_key.isdigit() - else ascii_uppercase.index(answer_key), + "gold_idx": ascii_uppercase.index(line["answer"].strip().upper()), } - TASKS_TABLE = [ LightevalTaskConfig( - name=f"mlmm_arc:{language.value}:{suffix}", + name=f"mlmm_arc_challenge:{language.value}:{suffix}", prompt_function=get_mcq_prompt_function( language, - _arc_adapter, + _m_arc_adapter, formulation=formulation, ), - hf_repo="jon-tow/okapi_arc_challenge", + hf_repo="alexandrainst/m_arc", hf_subset=standardize_tag(language.value), - hf_revision="823d5d7bfaf8974a3ab52a825b6cf4903b35dbc4", 
evaluation_splits=("test",), few_shots_split="train", metrics=metrics, @@ -136,15 +122,14 @@ def _arc_adapter(line): # Greedy variant: MCF-style prompt, generate 1 token, exact match TASKS_TABLE += [ LightevalTaskConfig( - name=f"mlmm_arc:{language.value}:mcf_em", + name=f"mlmm_arc_challenge:{language.value}:mcf_em", prompt_function=get_mcq_prompt_function( language, - _arc_adapter, + _m_arc_adapter, formulation=MCFFormulation(), ), - hf_repo="jon-tow/okapi_arc_challenge", + hf_repo="alexandrainst/m_arc", hf_subset=standardize_tag(language.value), - hf_revision="823d5d7bfaf8974a3ab52a825b6cf4903b35dbc4", evaluation_splits=("test",), few_shots_split="train", generation_size=1, diff --git a/src/lighteval/tasks/multilingual/tasks/xcopa.py b/src/lighteval/tasks/multilingual/tasks/xcopa.py index 9d8507833..06d613a32 100644 --- a/src/lighteval/tasks/multilingual/tasks/xcopa.py +++ b/src/lighteval/tasks/multilingual/tasks/xcopa.py @@ -75,7 +75,7 @@ def _adapter(line): def _hf_repo(language: Language) -> str: - return "OALL/AlGhafa-Arabic-LLM-Benchmark-Translated" if language == Language.ARABIC else "xcopa" + return "OALL/AlGhafa-Arabic-LLM-Benchmark-Translated" if language == Language.ARABIC else "cambridgeltl/xcopa" # Dataset subset codes that differ from standardize_tag(language.value). 
diff --git a/src/lighteval/tasks/multilingual/tasks/xquad.py b/src/lighteval/tasks/multilingual/tasks/xquad.py index fe74e282e..50fa4b7a5 100644 --- a/src/lighteval/tasks/multilingual/tasks/xquad.py +++ b/src/lighteval/tasks/multilingual/tasks/xquad.py @@ -83,7 +83,7 @@ def fn(line, task_name: str = None): evaluation_splits=("validation",), few_shots_split="validation", generation_size=400, - stop_sequence=("\n",), + stop_sequence=["\n",], metrics=( MultilingualQuasiExactMatchMetric(language, "prefix"), MultilingualQuasiF1ScoreMetric(language), @@ -102,7 +102,7 @@ def fn(line, task_name: str = None): evaluation_splits=("validation",), few_shots_split="validation", generation_size=-1, - stop_sequence=("\n",), + stop_sequence=["\n",], metrics=[Metrics.target_bits_per_byte], ) for language in _LANGUAGES diff --git a/src/lighteval/tasks/tasks/mmlu.py b/src/lighteval/tasks/tasks/mmlu.py index c4b04c8f0..39667d696 100644 --- a/src/lighteval/tasks/tasks/mmlu.py +++ b/src/lighteval/tasks/tasks/mmlu.py @@ -278,7 +278,7 @@ def mmlu_redux_chat_prompt(line, task_name: str = None): evaluation_splits=["test"], few_shots_split="dev", few_shots_select=None, - generation_size=4096, + generation_size=32768, metrics=[Metrics.gpqa_instruct_metric], stop_sequence=["Question:"], version=0, diff --git a/src/lighteval/tasks/tasks/mmlu_pro.py b/src/lighteval/tasks/tasks/mmlu_pro.py index 2ede01f50..609717977 100644 --- a/src/lighteval/tasks/tasks/mmlu_pro.py +++ b/src/lighteval/tasks/tasks/mmlu_pro.py @@ -99,6 +99,7 @@ def record_to_sample(record): hf_revision="3373e0b32277875b8db2aa555a333b78a08477ea", evaluation_splits=("test",), few_shots_split="validation", + generation_size=32768, metrics=[Metrics.gpqa_instruct_metric], ) diff --git a/src/lighteval/utils/utils.py b/src/lighteval/utils/utils.py index 3ab5976d8..c2b91c953 100644 --- a/src/lighteval/utils/utils.py +++ b/src/lighteval/utils/utils.py @@ -276,7 +276,7 @@ def safe_divide(numerator: np.ndarray, denominator: float, 
default_value: float return np.where(denominator != 0, numerator / denominator, default_value) -def remove_reasoning_tags(text: str, tag_pairs: list[tuple[str, str]]) -> str: +def remove_reasoning_tags(text: str, tag_pairs: list[tuple[str, str]], context: str | None = None) -> str: """Removes all instances of reasoning tag pairs from text. Iteratively removes content between specified start and end tag pairs. @@ -288,6 +288,9 @@ def remove_reasoning_tags(text: str, tag_pairs: list[tuple[str, str]]) -> str: Args: text (str): The input text containing reasoning tags to remove. tag_pairs (list[tuple[str, str]]): List of (start_tag, end_tag) pairs to remove. + context (str | None): Optional preceding text, such as the prompt. If the + context ends inside a reasoning block, generated text before the first + closing tag is treated as reasoning and removed. Returns: str: The text with all reasoning tag content removed. @@ -306,6 +309,14 @@ def remove_reasoning_tags(text: str, tag_pairs: list[tuple[str, str]]) -> str: result = text for start_tag, end_tag in tag_pairs: + if context is not None: + last_start = context.rfind(start_tag) + last_end = context.rfind(end_tag) + if last_start != -1 and last_start > last_end: + end = result.find(end_tag) + if end != -1: + result = result[end + len(end_tag) :] + while start_tag in result and end_tag in result: start = result.find(start_tag) end = result.find(end_tag, start)