From d25d065394a00c71c70fc9d3d511bc27253e4fe0 Mon Sep 17 00:00:00 2001 From: Delwin Kim <139003345+DelwinKim@users.noreply.github.com> Date: Fri, 12 Jun 2026 17:55:55 +0000 Subject: [PATCH 1/6] evaluator: add LMMSEvaluator and ORT-GenAI multimodal adapter Adds LMMSEvaluator (olive/evaluator/olive_evaluator.py) and an ORT-GenAI multimodal adapter (olive/evaluator/lmms_ort.py) for evaluating multimodal ONNX models via lmms-eval. --- olive/evaluator/lmms_ort.py | 546 +++++++++++++++++++ olive/evaluator/olive_evaluator.py | 249 +++++++++ setup.py | 3 + test/evaluator/test_lmms_ort.py | 837 +++++++++++++++++++++++++++++ 4 files changed, 1635 insertions(+) create mode 100644 olive/evaluator/lmms_ort.py create mode 100644 test/evaluator/test_lmms_ort.py diff --git a/olive/evaluator/lmms_ort.py b/olive/evaluator/lmms_ort.py new file mode 100644 index 0000000000..63fc67fdb8 --- /dev/null +++ b/olive/evaluator/lmms_ort.py @@ -0,0 +1,546 @@ +# ------------------------------------------------------------------------- +# lmms-eval adapter for Olive-exported multimodal models. +# +# Added locally (not upstream) to support evaluating quantized multimodal ONNX +# models through the EvolvingLMMs-Lab/lmms-eval harness, mirroring how +# olive/evaluator/lmeval_ort.py wraps lm-evaluation-harness for text models. +# +# Registers an LMMSORTGenAIEvaluator class with lmms-eval's legacy +# @register_model registry under the name "ortgenai_mm". Consumers obtain it +# via lmms_eval.api.registry.get_model("ortgenai_mm"). 
+# ------------------------------------------------------------------------- +"""lmms-eval ORT-GenAI adapter for Olive-exported multimodal models.""" + +from __future__ import annotations + +import io +import json +import logging +import tempfile +from pathlib import Path +from typing import Any + +import numpy as np +import PIL.Image +from tqdm import tqdm + +try: + import onnxruntime_genai as og +except ImportError: # pragma: no cover - optional dep + og = None + +try: + from lmms_eval.api.instance import Instance + from lmms_eval.api.model import lmms + from lmms_eval.api.registry import register_model + + _LMMS_EVAL_IMPORT_ERROR = None +except ImportError as e: # pragma: no cover - optional dep + Instance = Any + _LMMS_EVAL_IMPORT_ERROR = e + + class lmms: # noqa: N801 + pass + + def register_model(_name): + def decorator(cls): + return cls + + return decorator + + +logger = logging.getLogger(__name__) + + +_PROVIDER_ALIASES = { + "cuda": "cuda", + "cudaexecutionprovider": "cuda", + "gpu": "cuda", + "cpu": "cpu", + "cpuexecutionprovider": "cpu", + "dml": "dml", + "dmlexecutionprovider": "dml", + "directml": "dml", + "webgpu": "webgpu", + "webgpuexecutionprovider": "webgpu", + "js": "web", + "jsexecutionprovider": "web", + "nvtensorrtrtx": "NvTensorRtRtx", + "nvtensorrtrtxexecutionprovider": "NvTensorRtRtx", +} + + +# ----------------------------------------------------------------------------- +# Helpers +# ----------------------------------------------------------------------------- + + +def _normalize_image(visual) -> PIL.Image.Image | None: + if isinstance(visual, PIL.Image.Image): + return visual.convert("RGB") + if isinstance(visual, (str, Path)): + p = Path(visual) + if p.suffix.lower() in {".jpg", ".jpeg", ".png", ".webp", ".bmp", ".gif"}: + return PIL.Image.open(p).convert("RGB") + return None + if isinstance(visual, dict): + if "bytes" in visual: + return PIL.Image.open(io.BytesIO(visual["bytes"])).convert("RGB") + if "path" in visual: + return 
PIL.Image.open(visual["path"]).convert("RGB") + if isinstance(visual, np.ndarray): + return PIL.Image.fromarray(np.uint8(visual)).convert("RGB") + return None + + +def _normalize_audio(visual) -> tuple[np.ndarray, int] | None: + if isinstance(visual, dict) and "array" in visual and "sampling_rate" in visual: + return np.asarray(visual["array"], dtype=np.float32), int(visual["sampling_rate"]) + if isinstance(visual, (str, Path)): + p = Path(visual) + if p.suffix.lower() in {".wav", ".mp3", ".flac", ".ogg", ".m4a"}: + try: + import librosa + except ImportError: + logger.warning("Audio file %s encountered but librosa not installed.", p) + return None + arr, sr = librosa.load(str(p), sr=None, mono=True) + return arr.astype(np.float32), int(sr) + return None + + +def _partition_visuals(visuals): + images, audios = [], [] + for v in visuals or []: + if v is None: + continue + img = _normalize_image(v) + if img is not None: + images.append(img) + continue + au = _normalize_audio(v) + if au is not None: + audios.append(au) + return images, audios + + +def _format_media_tokens(num_items: int, token_format: str) -> list[str]: + return [token_format.format(index=i + 1, zero_index=i) for i in range(num_items)] + + +def _build_prompt( + model_type: str, + num_images: int, + num_audios: int, + user_text: str, + system_prompt: str = "You are a helpful AI assistant.", + prompt_template: str | None = None, + image_token_format: str = "<|image_{index}|>", + audio_token_format: str = "<|audio_{index}|>", +) -> str: + """Build a Phi-4-multimodal-style chat prompt. + + Other multimodal architectures use different placeholder tags. Users can + override the media token formats and the full prompt template from the Olive + evaluator config without changing this adapter. 
+ """ + image_tokens = "".join(_format_media_tokens(num_images, image_token_format)) + audio_tokens = "".join(_format_media_tokens(num_audios, audio_token_format)) + parts = [image_tokens, audio_tokens, user_text] + user_content = "".join(parts) + if prompt_template: + return prompt_template.format( + system_prompt=system_prompt, + user_content=user_content, + text=user_text, + image_tokens=image_tokens, + audio_tokens=audio_tokens, + model_type=model_type, + ) + + return f"<|system|>{system_prompt}<|end|><|user|>{user_content}<|end|><|assistant|>" + + +def _normalize_execution_provider(execution_provider: Any | None) -> str: + if not execution_provider: + return "follow_config" + if isinstance(execution_provider, (tuple, list)): + execution_provider = execution_provider[0] if execution_provider else None + if not execution_provider: + return "follow_config" + ep = str(execution_provider).lower().replace("_", "") + return _PROVIDER_ALIASES.get(ep, str(execution_provider)) + + +# ----------------------------------------------------------------------------- +# Main adapter +# ----------------------------------------------------------------------------- + + +@register_model("ortgenai_mm") +class LMMSORTGenAIEvaluator(lmms): + r"""lmms-eval model wrapper for an ORT-GenAI multimodal package. 
+ + Example:: + + lmms_eval --model ortgenai_mm \\ + --model_args pretrained=/path/to/ort_genai_dir,batch_size=1 \\ + --tasks mmmu_val --limit 4 + """ + + is_simple = True + + def __init__( + self, + pretrained: str, + batch_size: int = 1, + max_new_tokens: int = 256, + max_length: int = 8192, + system_prompt: str = "You are a helpful AI assistant.", + execution_provider: str | None = None, + provider_options: dict | None = None, + fail_on_error: bool = True, + prompt_template: str | None = None, + image_token_format: str = "<|image_{index}|>", + audio_token_format: str = "<|audio_{index}|>", + **kwargs, + ) -> None: + if _LMMS_EVAL_IMPORT_ERROR is not None: + raise ImportError( + "lmms-eval is required for ortgenai_mm. Install lmms-eval before using LMMSEvaluator." + ) from _LMMS_EVAL_IMPORT_ERROR + if og is None: + raise ImportError( + "onnxruntime-genai is required for ortgenai_mm. " + "Install with: pip install onnxruntime-genai (or -cuda variant)." + ) + super().__init__() + if kwargs: + logger.warning("Unused kwargs: %s", kwargs) + + model_dir = Path(pretrained).resolve() + if not model_dir.is_dir(): + raise ValueError(f"ORT-GenAI model directory does not exist: {model_dir}") + if not (model_dir / "genai_config.json").is_file(): + raise ValueError(f"LMMSEvaluator requires genai_config.json in ORT-GenAI package: {model_dir}") + if int(batch_size) < 1: + raise ValueError("batch_size must be >= 1") + if int(max_new_tokens) < 1: + raise ValueError("max_new_tokens must be >= 1") + if int(max_length) < 1: + raise ValueError("max_length must be >= 1") + + self.model_dir = str(model_dir) + self.max_new_tokens = int(max_new_tokens) + self.max_length = int(max_length) + self.batch_size_per_gpu = int(batch_size) + self.system_prompt = system_prompt + self.fail_on_error = fail_on_error + self.prompt_template = prompt_template + self.image_token_format = image_token_format + self.audio_token_format = audio_token_format + + logger.info("Loading ORT-GenAI model from: %s", 
self.model_dir) + ep = _normalize_execution_provider(execution_provider) + # CUDA GenAI packages often carry provider-specific options in genai_config.json. + # Clearing/re-adding CUDA can drop those options and fail to load on otherwise + # working packages, so follow the package config unless options are overridden. + if ep == "follow_config" or (ep == "cuda" and not provider_options): + self._model = og.Model(self.model_dir) + else: + config = og.Config(self.model_dir) + config.clear_providers() + if ep != "cpu": + config.append_provider(ep) + for key, value in (provider_options or {}).items(): + config.set_provider_option(ep, key, value) + self._model = og.Model(config) + self._tokenizer = og.Tokenizer(self._model) + self._processor = self._model.create_multimodal_processor() + + eos_ids = self._tokenizer.eos_token_ids + self._eos_token_ids = {int(t) for t in (eos_ids if eos_ids is not None else [])} + + try: + cfg = json.loads((Path(self.model_dir) / "genai_config.json").read_text(encoding="utf-8")) + self._model_type = cfg.get("model", {}).get("type", "phi4mm") + except json.JSONDecodeError as e: + raise ValueError(f"Invalid genai_config.json in {self.model_dir}") from e + + self._rank = 0 + self._world_size = 1 + logger.info("Model loaded. 
Model type: %s", self._model_type) + + # ------------------------------------------------------------------------- + # lmms-eval required properties + # ------------------------------------------------------------------------- + @property + def batch_size(self): + return self.batch_size_per_gpu + + @property + def rank(self): + return self._rank + + @property + def world_size(self): + return self._world_size + + # ------------------------------------------------------------------------- + # ORT-GenAI input plumbing + # ------------------------------------------------------------------------- + def _build_og_images(self, images, tmp_dir: Path): + if not images: + return None + paths = [] + for i, img in enumerate(images): + path = tmp_dir / f"image_{i}.png" + img.save(path, format="PNG") + paths.append(str(path)) + return og.Images.open(*paths) + + def _build_og_audios(self, audios, tmp_dir: Path): + if not audios: + return None + import soundfile as sf + + paths = [] + for i, (arr, sr) in enumerate(audios): + path = tmp_dir / f"audio_{i}.wav" + sf.write(path, arr, sr) + paths.append(str(path)) + return og.Audios.open(*paths) + + def _handle_error(self, message: str, exc: Exception, default): + if self.fail_on_error: + raise RuntimeError(message) from exc + logger.exception("%s", message) + return default + + # ------------------------------------------------------------------------- + # Single-request inference primitives + # ------------------------------------------------------------------------- + def _run_generation( + self, prompt: str, images, audios, max_new_tokens: int, stop_strings: list[str] | None = None + ) -> str: + params = og.GeneratorParams(self._model) + # `max_length` is total (prompt + completion). Image prompts can be huge + # (Phi-4-MM image embeds are 1000+ tokens), so default generously. 
+ params.set_search_options( + max_length=self.max_length, + do_sample=False, + ) + generator = og.Generator(self._model, params) + + with tempfile.TemporaryDirectory() as tmp: + tmp_dir = Path(tmp) + og_images = self._build_og_images(images, tmp_dir) + og_audios = self._build_og_audios(audios, tmp_dir) + + try: + inputs = self._processor(prompt, images=og_images, audios=og_audios) + except Exception as e: # pragma: no cover + del generator + return self._handle_error("ORT-GenAI multimodal processor failed.", e, "") + + try: + generator.set_inputs(inputs) + except RuntimeError as e: + del generator + return self._handle_error( + "ORT-GenAI generator input setup failed. The prompt may exceed max_length.", e, "" + ) + + decoded = "" + stream = self._tokenizer.create_stream() + steps = 0 + while not generator.is_done() and steps < max_new_tokens: + generator.generate_next_token() + tok = int(generator.get_next_tokens()[0]) + if tok in self._eos_token_ids: + break + decoded += stream.decode(tok) + if stop_strings: + for s in stop_strings: + if s in decoded: + decoded = decoded.split(s, 1)[0] + del generator + return decoded + steps += 1 + + del generator + return decoded + + def _score_continuation(self, prompt: str, continuation: str, images, audios) -> tuple[float, bool]: + # Tokenize prompt and prompt+continuation jointly, then slice to obtain + # the continuation token IDs as they would actually appear extending the + # prompt. Critical for sentencepiece/BPE tokenizers where ``encode("A")`` + # differs from the suffix of ``encode("prompt A")`` (leading-space + # handling, BOS injection, etc.). + prompt_tokens = list(self._tokenizer.encode(prompt)) + full_tokens = list(self._tokenizer.encode(prompt + continuation)) + cont_tokens = full_tokens[len(prompt_tokens) :] + if len(cont_tokens) == 0: + return 0.0, True + + params = og.GeneratorParams(self._model) + # `max_length` is total (prompt + completion) including image-embed tokens. 
+ params.set_search_options( + max_length=self.max_length, + do_sample=False, + ) + generator = og.Generator(self._model, params) + + with tempfile.TemporaryDirectory() as tmp: + tmp_dir = Path(tmp) + og_images = self._build_og_images(images, tmp_dir) + og_audios = self._build_og_audios(audios, tmp_dir) + try: + inputs = self._processor(prompt, images=og_images, audios=og_audios) + except Exception as e: # pragma: no cover + del generator + return self._handle_error("ORT-GenAI multimodal processor failed in loglikelihood.", e, (-1e9, False)) + + try: + generator.set_inputs(inputs) + except RuntimeError as e: + del generator + return self._handle_error("ORT-GenAI generator input setup failed in loglikelihood.", e, (-1e9, False)) + + # ORT-GenAI's ``set_inputs`` only loads the prompt + multimodal embeds; + # it does NOT run the decoder forward pass. ``get_logits()`` therefore + # returns an undefined buffer before any compute step. Trigger the + # prompt-fill forward pass with ``generate_next_token()`` (the sampled + # token is discarded via ``rewind_to`` after the first scoring + # iteration, before our chosen continuation token is appended). + token_count_after_prefill = generator.token_count() + generator.generate_next_token() + + total_logprob = 0.0 + all_greedy = True + for i, tok_id in enumerate(cont_tokens): + if generator.is_done(): + total_logprob += -50.0 + all_greedy = False + continue + logits = np.asarray(generator.get_logits(), dtype=np.float64).reshape(-1) + if tok_id >= logits.shape[0]: + del generator + raise ValueError(f"Token id {tok_id} is outside logits vocabulary size {logits.shape[0]}.") + log_denom = np.logaddexp.reduce(logits) + total_logprob += float(logits[tok_id] - log_denom) + if int(np.argmax(logits)) != tok_id: + all_greedy = False + if i == 0: + # Drop the throwaway token sampled by ``generate_next_token`` + # above so ``append_tokens`` lands at end-of-prompt + cont[0], + # not end-of-prompt + sampled + cont[0]. 
+ generator.rewind_to(token_count_after_prefill) + generator.append_tokens(np.array([tok_id], dtype=np.int32)) + + del generator + return total_logprob, all_greedy + + def _get_doc_and_visuals(self, doc_to_visual, doc_id, task, split): + try: + doc = self.task_dict[task][split][doc_id] + except (KeyError, IndexError, TypeError) as e: + raise KeyError( + f"Failed to find lmms-eval document task={task!r}, split={split!r}, doc_id={doc_id!r}" + ) from e + + visuals = doc_to_visual(doc) if doc_to_visual else [] + if visuals is None: + visuals = [] + if not isinstance(visuals, list): + visuals = [visuals] + return doc, visuals + + # ------------------------------------------------------------------------- + # lmms-eval Model interface + # ------------------------------------------------------------------------- + def generate_until(self, requests: list[Instance], disable_tqdm: bool = False) -> list[str]: + results = [] + pbar = tqdm(total=len(requests), desc="ortgenai_mm generate_until", disable=disable_tqdm) + for req in requests: + contexts, gen_kwargs, doc_to_visual, doc_id, task, split = req.args + _, visuals = self._get_doc_and_visuals(doc_to_visual, doc_id, task, split) + images, audios = _partition_visuals(visuals) + + gen_kwargs = gen_kwargs or {} + max_new = int(gen_kwargs.get("max_new_tokens", self.max_new_tokens)) + stop = gen_kwargs.get("until", None) + if isinstance(stop, str): + stop = [stop] + + prompt = _build_prompt( + self._model_type, + len(images), + len(audios), + contexts, + self.system_prompt, + self.prompt_template, + self.image_token_format, + self.audio_token_format, + ) + text = self._run_generation(prompt, images, audios, max_new, stop) + results.append(text) + self.cache_hook.add_partial("generate_until", (contexts, gen_kwargs), text) + pbar.update(1) + pbar.close() + return results + + def loglikelihood(self, requests: list[Instance], disable_tqdm: bool = False) -> list[tuple[float, bool]]: + results = [] + pbar = tqdm(total=len(requests), 
desc="ortgenai_mm loglikelihood", disable=disable_tqdm) + for req in requests: + contexts, doc_to_target, doc_to_visual, doc_id, task, split = req.args + doc, visuals = self._get_doc_and_visuals(doc_to_visual, doc_id, task, split) + images, audios = _partition_visuals(visuals) + continuation = str(doc_to_target(doc)) + + prompt = _build_prompt( + self._model_type, + len(images), + len(audios), + contexts, + self.system_prompt, + self.prompt_template, + self.image_token_format, + self.audio_token_format, + ) + logprob, is_greedy = self._score_continuation(prompt, continuation, images, audios) + results.append((logprob, is_greedy)) + self.cache_hook.add_partial("loglikelihood", (contexts, continuation), (logprob, is_greedy)) + pbar.update(1) + pbar.close() + return results + + def generate_until_multi_round(self, requests) -> list[str]: + raise NotImplementedError("ortgenai_mm does not support lmms-eval multi-round generation yet.") + + +# ----------------------------------------------------------------------------- +# lmms-eval MODEL_REGISTRY_V2 entry-point factory. +# +# Exposed via setup.py entry_points["lmms_eval.models"], so a fresh install of +# olive-ai makes ``--model ortgenai_mm`` discoverable from the lmms-eval CLI +# (e.g. ``python -m lmms_eval --model ortgenai_mm ...``) without requiring the +# caller to import this module first. +# +# lmms-eval's ``ModelRegistryV2.load_entrypoint_manifests`` accepts a +# ``Callable`` payload, so we keep the import of ``ModelManifest`` lazy. That +# way ``olive`` (and the rest of this module) stays importable when lmms-eval +# is not installed. +# ----------------------------------------------------------------------------- +def _model_manifest(): + """Return the lmms-eval ModelManifest for ``ortgenai_mm``. + + Used as an entry-point payload for lmms-eval's MODEL_REGISTRY_V2. 
+ """ + from lmms_eval.models.registry_v2 import ModelManifest + + return ModelManifest( + model_id="ortgenai_mm", + simple_class_path="olive.evaluator.lmms_ort.LMMSORTGenAIEvaluator", + ) diff --git a/olive/evaluator/olive_evaluator.py b/olive/evaluator/olive_evaluator.py index 96378de129..af73fabe64 100644 --- a/olive/evaluator/olive_evaluator.py +++ b/olive/evaluator/olive_evaluator.py @@ -2010,6 +2010,255 @@ def evaluate( return flatten_metric_result(metrics) +@Registry.register("LMMSEvaluator") +class LMMSEvaluator(OliveEvaluator): + """Evaluator for multimodal models using lmms-eval (EvolvingLMMs-Lab/lmms-eval). + + Supports two model handler types: + + 1. :class:`ONNXModelHandler` whose path is an ORT-GenAI multimodal package + (directory containing ``genai_config.json`` plus quantized ONNX files, + typically produced by ``MobiusBuilder`` + ``OnnxKQuantQuantization``). + Dispatches to the ``ortgenai_mm`` adapter in :mod:`olive.evaluator.lmms_ort`. + + 2. :class:`HfModelHandler` for HuggingFace PyTorch multimodal models. + Dispatches to lmms-eval's native wrapper for the model architecture + (e.g. ``phi4_multimodal``, ``qwen2_5_vl``). The wrapper is auto-detected + from the HF ``model_type`` field; pass ``model_class`` in the recipe to + override. + + Raw single-file ONNX models are intentionally not supported: multimodal + inference requires the vision/audio preprocessing pipeline that ORT-GenAI's + multimodal processor provides; a bare ``onnxruntime.InferenceSession`` + cannot do image or audio tokenization on its own. + + Example recipe config:: + + "evaluators": { + "evaluator": { + "type": "LMMSEvaluator", + "tasks": ["ai2d_lite", "ocrbench"], + "batch_size": 1, + "limit": 4 + } + }, + "evaluator": "evaluator" + """ + + # HuggingFace model_type -> lmms-eval model_class (canonical id). + # Covers the multimodal architectures most relevant to Olive sweeps; other + # architectures still work if ``model_class`` is set explicitly in the + # recipe. 
Verified against lmms-eval's AVAILABLE_SIMPLE_MODELS / + # AVAILABLE_CHAT_TEMPLATE_MODELS registries. + _HF_MODEL_TYPE_TO_LMMS_CLASS: ClassVar[dict[str, str]] = { + "phi4mm": "phi4_multimodal", + "phi3_v": "phi3v", + "qwen2_vl": "qwen2_vl", + "qwen2_5_vl": "qwen2_5_vl", + "qwen3_vl": "qwen3_vl", + "qwen2_audio": "qwen2_audio", + "qwen2_5_omni": "qwen2_5_omni", + "qwen3_omni": "qwen3_omni", + "whisper": "whisper", + "gemma3": "gemma3", + "minicpm_o": "minicpm_o", + "llava": "llava", + "llava_onevision": "llava_onevision", + "internvl_chat": "internvl", + } + + def __init__(self, tasks: list[str], **kwargs): + super().__init__(**kwargs) + self.tasks = tasks + self.limit = kwargs.get("limit") + self.model_class = kwargs.get("model_class") + self.batch_size = kwargs.get("batch_size", 1) + self.max_new_tokens = kwargs.get("max_new_tokens", 256) + self.max_length = kwargs.get("max_length", 32768) + self.system_prompt = kwargs.get("system_prompt", "You are a helpful AI assistant.") + self.ep = kwargs.get("execution_provider") + self.ep_options = kwargs.get("provider_options") + self.log_samples = bool(kwargs.get("log_samples", False)) + self.output_path = kwargs.get("output_path") + self.fail_on_error = bool(kwargs.get("fail_on_error", True)) + self.prompt_template = kwargs.get("prompt_template") + self.image_token_format = kwargs.get("image_token_format", "<|image_{index}|>") + self.audio_token_format = kwargs.get("audio_token_format", "<|audio_{index}|>") + # HF-only knobs (forwarded to lmms-eval's native wrapper if present). + # ``trust_remote_code`` defaults to False to match the rest of Olive + # (e.g. olive/common/hf/utils.py, olive/data/component/load_dataset.py) + # and avoid silently executing arbitrary Hub code at load time. Users + # who need Phi-4-MM, MiniCPM-o, etc. opt in explicitly via the recipe. 
+ self.dtype = kwargs.get("dtype", "auto") + self.trust_remote_code = bool(kwargs.get("trust_remote_code", False)) + self.hf_model_kwargs = kwargs.get("hf_model_kwargs") or {} + + @staticmethod + def _resolve_model_dir(model: ONNXModelHandler) -> Path: + model_path = Path(model.model_path) + return model_path if model_path.is_dir() else model_path.parent + + @staticmethod + def _resolve_execution_provider(execution_providers: Optional[Union[str, list[str]]]): + if not execution_providers: + return None + if isinstance(execution_providers, list): + return execution_providers[0] if execution_providers else None + return execution_providers + + @staticmethod + def _device_for_hf(device: Device) -> str: + # lmms-eval's HF wrappers accept "cuda", "cpu", or a torch.device. + return "cuda" if device == Device.GPU else "cpu" + + def _build_ortgenai_mm_lm( + self, + model: ONNXModelHandler, + execution_providers: Optional[Union[str, list[str]]], + ): + from olive.evaluator.lmms_ort import LMMSORTGenAIEvaluator + + genai_config = self._resolve_model_dir(model) / "genai_config.json" + if not genai_config.exists(): + raise ValueError( + "LMMSEvaluator requires an ORT-GenAI package " + "(directory containing genai_config.json) for ONNXModelHandler input. " + f"Got ONNXModelHandler without genai_config at {model.model_path}. " + "Raw single-file ONNX is not supported for multimodal evaluation because " + "the vision/audio preprocessing pipeline lives in ORT-GenAI's multimodal " + "processor; use HfModelHandler or an ORT-GenAI package instead." 
+ ) + + model_dir = str(self._resolve_model_dir(model)) + logger.info("Running lmms-eval (model_class=ortgenai_mm, model_dir=%s)", model_dir) + return LMMSORTGenAIEvaluator( + pretrained=model_dir, + batch_size=self.batch_size, + max_new_tokens=self.max_new_tokens, + max_length=self.max_length, + system_prompt=self.system_prompt, + execution_provider=self.ep or self._resolve_execution_provider(execution_providers), + provider_options=self.ep_options, + fail_on_error=self.fail_on_error, + prompt_template=self.prompt_template, + image_token_format=self.image_token_format, + audio_token_format=self.audio_token_format, + ) + + def _resolve_hf_model_class(self, model: HfModelHandler) -> str: + if self.model_class: + return self.model_class + hf_model_type = model.get_hf_model_type() + lmms_class = self._HF_MODEL_TYPE_TO_LMMS_CLASS.get(hf_model_type) + if not lmms_class: + raise ValueError( + f"Could not auto-detect lmms-eval model_class for HF model_type={hf_model_type!r}. " + f"Pass 'model_class' in the evaluator config (e.g. one of " + f"{sorted(self._HF_MODEL_TYPE_TO_LMMS_CLASS.values())}, or any other " + f"name registered with lmms-eval)." + ) + return lmms_class + + def _build_hf_lm(self, model: HfModelHandler, device: Device): + import inspect + + from lmms_eval.models import get_model as lmms_get_model + + model_class = self._resolve_hf_model_class(model) + lm_cls = lmms_get_model(model_class) + + # lmms-eval wrappers have inconsistent constructor signatures: phi4_multimodal + # accepts dtype/trust_remote_code as named params, qwen2_5_vl asserts + # ``kwargs == {}`` at runtime even though it has ``**kwargs`` in its signature. + # Because the signature alone cannot tell "absorbs unknown kwargs" from + # "rejects unknown kwargs at runtime", we conservatively only forward + # ``device/dtype/trust_remote_code`` to wrappers that name them explicitly + # as parameters. For wrappers that take a different set of kwargs (e.g. 
+ # backend-specific knobs), users pass them through ``hf_model_kwargs``, + # which is always forwarded unfiltered as an explicit user opt-in. + try: + accepted = set(inspect.signature(lm_cls.__init__).parameters) + except (TypeError, ValueError): + accepted = set() + + optional_kwargs = { + "device": self._device_for_hf(device), + "dtype": self.dtype, + "trust_remote_code": self.trust_remote_code, + } + forwarded = {k: v for k, v in optional_kwargs.items() if k in accepted} + + init_kwargs = { + "pretrained": str(model.model_name_or_path), + "batch_size": self.batch_size, + **forwarded, + **self.hf_model_kwargs, + } + logger.info( + "Running lmms-eval (model_class=%s, pretrained=%s, forwarded_kwargs=%s)", + model_class, + init_kwargs["pretrained"], + sorted(set(init_kwargs) - {"pretrained", "batch_size"}), + ) + return lm_cls(**init_kwargs) + + def evaluate( + self, + model: "OliveModelHandler", + metrics: list[Metric], + device: Device = Device.CPU, + execution_providers: Optional[Union[str, list[str]]] = None, + ) -> MetricResult: + from lmms_eval.evaluator import simple_evaluate + + if isinstance(model, ONNXModelHandler): + lm = self._build_ortgenai_mm_lm(model, execution_providers) + elif isinstance(model, HfModelHandler): + lm = self._build_hf_lm(model, device) + else: + raise ValueError( + "LMMSEvaluator supports ONNXModelHandler (ORT-GenAI multimodal package) " + f"and HfModelHandler. Got {type(model).__name__}." 
+ ) + + results = simple_evaluate( + model=lm, + tasks=self.tasks, + batch_size=self.batch_size, + limit=self.limit, + log_samples=self.log_samples, + ) + + if self.output_path: + import json as _json + + out = Path(self.output_path) + out.parent.mkdir(parents=True, exist_ok=True) + compact = { + "results": results.get("results", {}), + "configs": {k: str(v) for k, v in results.get("configs", {}).items()}, + } + out.write_text(_json.dumps(compact, indent=2, default=str), encoding="utf-8") + logger.info("Wrote lmms-eval results to %s", out) + + # Convert lmms-eval results into Olive's MetricResult shape (mirrors LMEvaluator) + metrics_dict: dict[str, MetricResult] = {} + for task_name in sorted(results.get("results", {}).keys()): + task_results = results["results"][task_name] + task_metrics = {} + for mf, v in sorted(task_results.items()): + if mf == "alias" or not isinstance(v, (int, float)): + continue + m, _, _ = mf.partition(",") + if m.endswith("_stderr"): + continue + task_metrics[m] = SubMetricResult(value=float(v), priority=-1, higher_is_better=True) + if task_metrics: + metrics_dict[task_name] = MetricResult.model_validate(task_metrics) + + return flatten_metric_result(metrics_dict) + + @Registry.register("MTEBEvaluator") class MTEBEvaluator(OliveEvaluator): """Evaluator for embedding models using the MTEB (Massive Text Embedding Benchmark) library. 
diff --git a/setup.py b/setup.py index b4aebf070a..798010301d 100644 --- a/setup.py +++ b/setup.py @@ -88,5 +88,8 @@ def get_extra_deps(rel_path): data_files=[], entry_points={ "console_scripts": ["olive=olive.cli.launcher:main"], + "lmms_eval.models": [ + "ortgenai_mm = olive.evaluator.lmms_ort:_model_manifest", + ], }, ) diff --git a/test/evaluator/test_lmms_ort.py b/test/evaluator/test_lmms_ort.py new file mode 100644 index 0000000000..408be890ad --- /dev/null +++ b/test/evaluator/test_lmms_ort.py @@ -0,0 +1,837 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +# Tests intentionally exercise "protected" runtime methods (_score_continuation, +# _run_generation) and configure fake collaborators by setting attributes +# directly on the fake. Both are normal in unit tests, so suppress pylint's +# protected-access / attribute-defined-outside-init warnings for this file. 
+# pylint: disable=protected-access,attribute-defined-outside-init +import sys +from types import ModuleType, SimpleNamespace +from typing import ClassVar +from unittest.mock import MagicMock, patch + +import numpy as np +import PIL.Image +import pytest + +from olive.evaluator.lmms_ort import ( + LMMSORTGenAIEvaluator, + _build_prompt, + _model_manifest, + _normalize_execution_provider, +) +from olive.evaluator.olive_evaluator import LMMSEvaluator +from olive.model import ONNXModelHandler + + +def test_build_prompt_uses_default_phi4mm_tokens(): + prompt = _build_prompt("phi4mm", 1, 1, "What happened?", "System prompt.") + + assert prompt == "<|system|>System prompt.<|end|><|user|><|image_1|><|audio_1|>What happened?<|end|><|assistant|>" + + +def test_build_prompt_uses_custom_template_and_token_formats(): + prompt = _build_prompt( + "custom", + 2, + 1, + "Question", + "System", + prompt_template="{system_prompt}\n{image_tokens}{audio_tokens}\n{text}", + image_token_format="", + audio_token_format="", + ) + + assert prompt == "System\n\nQuestion" + + +@pytest.mark.parametrize( + ("execution_provider", "expected"), + [ + ("CUDAExecutionProvider", "cuda"), + ("CPUExecutionProvider", "cpu"), + ("DmlExecutionProvider", "dml"), + ("gpu", "cuda"), + (None, "follow_config"), + (("CUDAExecutionProvider", {"device_id": "0"}), "cuda"), + ], +) +def test_normalize_execution_provider(execution_provider, expected): + assert _normalize_execution_provider(execution_provider) == expected + + +def test_lmms_evaluator_converts_lmms_results(tmp_path): + model_dir = tmp_path / "model" + model_dir.mkdir() + model_path = model_dir / "text.onnx" + model_path.touch() + (model_dir / "genai_config.json").write_text('{"model": {"type": "phi4mm"}}', encoding="utf-8") + + output_path = tmp_path / "results.json" + evaluator = LMMSEvaluator( + tasks=["ai2d_lite"], + batch_size=1, + limit=2, + output_path=str(output_path), + fail_on_error=False, + prompt_template="{user_content}", + 
image_token_format="", + ) + model = ONNXModelHandler(model_path=str(model_path)) + + simple_evaluate_result = { + "results": { + "ai2d_lite": { + "alias": "AI2D Lite", + "exact_match,none": 0.5, + "exact_match_stderr,none": 0.1, + "samples": [{"ignored": True}], + } + }, + "configs": {"ai2d_lite": {"task": "ai2d_lite"}}, + } + + simple_evaluate_mock = MagicMock(return_value=simple_evaluate_result) + lmms_eval_module = ModuleType("lmms_eval") + lmms_eval_evaluator_module = ModuleType("lmms_eval.evaluator") + lmms_eval_evaluator_module.simple_evaluate = simple_evaluate_mock + + with ( + patch.dict( + sys.modules, + {"lmms_eval": lmms_eval_module, "lmms_eval.evaluator": lmms_eval_evaluator_module}, + ), + patch("olive.evaluator.lmms_ort.LMMSORTGenAIEvaluator", return_value=SimpleNamespace()) as lm_mock, + ): + result = evaluator.evaluate(model, [], execution_providers=["CUDAExecutionProvider"]) + + lm_mock.assert_called_once_with( + pretrained=str(model_dir), + batch_size=1, + max_new_tokens=256, + max_length=32768, + system_prompt="You are a helpful AI assistant.", + execution_provider="CUDAExecutionProvider", + provider_options=None, + fail_on_error=False, + prompt_template="{user_content}", + image_token_format="", + audio_token_format="<|audio_{index}|>", + ) + simple_evaluate_mock.assert_called_once() + assert result.get_value("ai2d_lite", "exact_match") == 0.5 + assert output_path.exists() + + +def test_lmms_evaluator_requires_genai_config(tmp_path): + model_path = tmp_path / "text.onnx" + model_path.touch() + evaluator = LMMSEvaluator(tasks=["ai2d_lite"]) + model = ONNXModelHandler(model_path=str(model_path)) + + with pytest.raises(ValueError, match="requires an ORT-GenAI package"): + evaluator.evaluate(model, []) + + +# ----------------------------------------------------------------------------- +# HuggingFace dispatch +# ----------------------------------------------------------------------------- + + +def _make_hf_model_handler_stub(model_name_or_path: 
str, hf_model_type: str): + handler = MagicMock(name="HfModelHandler") + handler.model_name_or_path = model_name_or_path + handler.get_hf_model_type.return_value = hf_model_type + return handler + + +def _patch_isinstance_for_hf(handler_stub, monkeypatch): + """Force isinstance(handler_stub, HfModelHandler) to True for the test path. + + Avoids constructing a real HfModelHandler (which would require a real HF + model on disk) while still exercising the dispatch logic. + """ + import olive.evaluator.olive_evaluator as oe + + real_isinstance = isinstance + + def _isinstance(obj, cls): + if obj is handler_stub and cls is oe.HfModelHandler: + return True + if obj is handler_stub and cls is oe.ONNXModelHandler: + return False + return real_isinstance(obj, cls) + + monkeypatch.setattr(oe, "isinstance", _isinstance, raising=False) + oe.isinstance = _isinstance + + +class _FakePhi4Wrapper: + """Fake lmms-eval wrapper with a phi4_multimodal-style signature. + + Accepts dtype + trust_remote_code (mirrors lmms-eval's phi4_multimodal class). + """ + + last_kwargs: ClassVar[dict] = {} + + def __init__(self, pretrained, device="cuda", dtype="auto", batch_size=1, trust_remote_code=True, **kwargs): + type(self).last_kwargs = { + "pretrained": pretrained, + "device": device, + "dtype": dtype, + "batch_size": batch_size, + "trust_remote_code": trust_remote_code, + **kwargs, + } + + +class _FakeQwenWrapper: + """Fake lmms-eval wrapper with a qwen2_5_vl-style signature. + + Does NOT accept dtype or trust_remote_code (mirrors lmms-eval's qwen2_5_vl + class which asserts kwargs == {}). 
+ """ + + last_kwargs: ClassVar[dict] = {} + + def __init__(self, pretrained, device="cuda", device_map="auto", batch_size=1, **kwargs): + if kwargs: + raise AssertionError(f"Unexpected kwargs: {kwargs}") + type(self).last_kwargs = { + "pretrained": pretrained, + "device": device, + "device_map": device_map, + "batch_size": batch_size, + } + + +class _FakeKwargsWrapper: + """Fake lmms-eval wrapper that absorbs ALL options via ``**kwargs``. + + Mirrors lmms-eval wrappers (and HF model wrappers more generally) that take + only the required ``pretrained`` argument by name and pass everything else + through ``**kwargs`` to the underlying HF transformers model. Used to verify + LMMSEvaluator forwards optional kwargs (dtype, trust_remote_code, device) to + such wrappers instead of silently dropping them because they aren't in + ``inspect.signature(...).parameters`` as named params. + """ + + last_kwargs: ClassVar[dict] = {} + + def __init__(self, pretrained, **kwargs): + type(self).last_kwargs = {"pretrained": pretrained, **kwargs} + + +def test_lmms_evaluator_auto_detects_hf_model_class_from_model_type(tmp_path, monkeypatch): + """When model_class is unset, auto-detect from HfModelHandler.get_hf_model_type().""" + handler_stub = _make_hf_model_handler_stub("/local/path/Phi-4-multimodal-instruct", "phi4mm") + _patch_isinstance_for_hf(handler_stub, monkeypatch) + + output_path = tmp_path / "results.json" + evaluator = LMMSEvaluator(tasks=["ai2d_lite"], batch_size=2, limit=4, output_path=str(output_path)) + + simple_evaluate_mock = MagicMock( + return_value={"results": {"ai2d_lite": {"alias": "AI2D", "exact_match,none": 0.75}}, "configs": {}} + ) + lmms_eval_module = ModuleType("lmms_eval") + lmms_eval_evaluator_module = ModuleType("lmms_eval.evaluator") + lmms_eval_evaluator_module.simple_evaluate = simple_evaluate_mock + lmms_eval_models_module = ModuleType("lmms_eval.models") + lmms_eval_models_module.get_model = MagicMock(return_value=_FakePhi4Wrapper) + + with 
patch.dict( + sys.modules, + { + "lmms_eval": lmms_eval_module, + "lmms_eval.evaluator": lmms_eval_evaluator_module, + "lmms_eval.models": lmms_eval_models_module, + }, + ): + result = evaluator.evaluate(handler_stub, []) + + lmms_eval_models_module.get_model.assert_called_once_with("phi4_multimodal") + assert _FakePhi4Wrapper.last_kwargs["pretrained"] == "/local/path/Phi-4-multimodal-instruct" + assert _FakePhi4Wrapper.last_kwargs["batch_size"] == 2 + # trust_remote_code defaults to False (see olive/evaluator/olive_evaluator.py + # LMMSEvaluator.__init__); users opt in explicitly in the recipe. + assert _FakePhi4Wrapper.last_kwargs["trust_remote_code"] is False + assert _FakePhi4Wrapper.last_kwargs["dtype"] == "auto" + simple_evaluate_mock.assert_called_once() + assert result.get_value("ai2d_lite", "exact_match") == 0.75 + + +def test_lmms_evaluator_filters_kwargs_for_qwen_style_wrapper(monkeypatch): + """Wrappers like qwen2_5_vl reject unknown kwargs. + + LMMSEvaluator must inspect the wrapper signature and only forward kwargs + the wrapper actually declares as named parameters. 
+ """ + handler_stub = _make_hf_model_handler_stub("/p/Qwen2.5-VL-3B-Instruct", "qwen2_5_vl") + _patch_isinstance_for_hf(handler_stub, monkeypatch) + + evaluator = LMMSEvaluator(tasks=["mmstar"], batch_size=1, dtype="bfloat16", trust_remote_code=True) + + simple_evaluate_mock = MagicMock(return_value={"results": {}, "configs": {}}) + lmms_eval_module = ModuleType("lmms_eval") + lmms_eval_evaluator_module = ModuleType("lmms_eval.evaluator") + lmms_eval_evaluator_module.simple_evaluate = simple_evaluate_mock + lmms_eval_models_module = ModuleType("lmms_eval.models") + lmms_eval_models_module.get_model = MagicMock(return_value=_FakeQwenWrapper) + + with patch.dict( + sys.modules, + { + "lmms_eval": lmms_eval_module, + "lmms_eval.evaluator": lmms_eval_evaluator_module, + "lmms_eval.models": lmms_eval_models_module, + }, + ): + evaluator.evaluate(handler_stub, []) + + # dtype + trust_remote_code must NOT have been forwarded (Qwen wrapper would error) + assert "dtype" not in _FakeQwenWrapper.last_kwargs + assert "trust_remote_code" not in _FakeQwenWrapper.last_kwargs + # but pretrained, device, batch_size MUST have been forwarded + assert _FakeQwenWrapper.last_kwargs["pretrained"] == "/p/Qwen2.5-VL-3B-Instruct" + assert _FakeQwenWrapper.last_kwargs["device"] == "cpu" # Device.CPU default + assert _FakeQwenWrapper.last_kwargs["batch_size"] == 1 + + +def test_lmms_evaluator_does_not_forward_to_pure_var_keyword_wrappers(monkeypatch): + """Verify ``device``/``dtype``/``trust_remote_code`` are NOT auto-forwarded to ``**kwargs`` wrappers. + + Rationale: the signature alone cannot tell "absorbs unknowns" from "rejects + unknowns at runtime" (qwen2_5_vl has ``**kwargs`` and asserts ``kwargs == + {}``). To stay safe, only kwargs named explicitly as parameters are + auto-forwarded. Users who want to forward additional kwargs to a pure + ``**kwargs`` wrapper must use ``hf_model_kwargs`` (explicit user opt-in). 
+ """ + handler_stub = _make_hf_model_handler_stub("/p/some-vlm", "qwen2_5_vl") + _patch_isinstance_for_hf(handler_stub, monkeypatch) + + evaluator = LMMSEvaluator( + tasks=["mmstar"], + batch_size=1, + dtype="bfloat16", + trust_remote_code=True, + # The escape hatch for forwarding arbitrary kwargs to a wrapper that + # absorbs them via **kwargs: + hf_model_kwargs={"custom_backend_opt": "value"}, + ) + + simple_evaluate_mock = MagicMock(return_value={"results": {}, "configs": {}}) + lmms_eval_module = ModuleType("lmms_eval") + lmms_eval_evaluator_module = ModuleType("lmms_eval.evaluator") + lmms_eval_evaluator_module.simple_evaluate = simple_evaluate_mock + lmms_eval_models_module = ModuleType("lmms_eval.models") + lmms_eval_models_module.get_model = MagicMock(return_value=_FakeKwargsWrapper) + + with patch.dict( + sys.modules, + { + "lmms_eval": lmms_eval_module, + "lmms_eval.evaluator": lmms_eval_evaluator_module, + "lmms_eval.models": lmms_eval_models_module, + }, + ): + evaluator.evaluate(handler_stub, []) + + # Required kwargs are always forwarded. + assert _FakeKwargsWrapper.last_kwargs["pretrained"] == "/p/some-vlm" + assert _FakeKwargsWrapper.last_kwargs["batch_size"] == 1 + # Optional kwargs are NOT forwarded to pure **kwargs wrappers. + assert "dtype" not in _FakeKwargsWrapper.last_kwargs + assert "trust_remote_code" not in _FakeKwargsWrapper.last_kwargs + assert "device" not in _FakeKwargsWrapper.last_kwargs + # The explicit hf_model_kwargs escape hatch IS forwarded. 
+ assert _FakeKwargsWrapper.last_kwargs["custom_backend_opt"] == "value" + + +def test_lmms_evaluator_uses_explicit_model_class_when_set(monkeypatch): + """An explicit ``model_class`` in the recipe overrides auto-detection.""" + handler_stub = _make_hf_model_handler_stub("/p/some-vlm", "some-unknown-vlm-type") + _patch_isinstance_for_hf(handler_stub, monkeypatch) + + evaluator = LMMSEvaluator(tasks=["mmstar"], model_class="qwen2_5_vl", batch_size=1) + + simple_evaluate_mock = MagicMock(return_value={"results": {}, "configs": {}}) + lmms_eval_module = ModuleType("lmms_eval") + lmms_eval_evaluator_module = ModuleType("lmms_eval.evaluator") + lmms_eval_evaluator_module.simple_evaluate = simple_evaluate_mock + lmms_eval_models_module = ModuleType("lmms_eval.models") + lmms_eval_models_module.get_model = MagicMock(return_value=_FakeQwenWrapper) + + with patch.dict( + sys.modules, + { + "lmms_eval": lmms_eval_module, + "lmms_eval.evaluator": lmms_eval_evaluator_module, + "lmms_eval.models": lmms_eval_models_module, + }, + ): + evaluator.evaluate(handler_stub, []) + + lmms_eval_models_module.get_model.assert_called_once_with("qwen2_5_vl") + handler_stub.get_hf_model_type.assert_not_called() + + +def test_lmms_evaluator_raises_when_hf_model_type_is_unmapped(monkeypatch): + """If we can't auto-detect and the user didn't set model_class, fail loudly.""" + handler_stub = _make_hf_model_handler_stub("/p/exotic-model", "some-exotic-vlm") + _patch_isinstance_for_hf(handler_stub, monkeypatch) + + evaluator = LMMSEvaluator(tasks=["mmstar"]) + + # Even with lmms_eval modules mocked, the error fires before reaching them. 
+ lmms_eval_module = ModuleType("lmms_eval") + lmms_eval_evaluator_module = ModuleType("lmms_eval.evaluator") + lmms_eval_evaluator_module.simple_evaluate = MagicMock() + lmms_eval_models_module = ModuleType("lmms_eval.models") + lmms_eval_models_module.get_model = MagicMock() + with ( + patch.dict( + sys.modules, + { + "lmms_eval": lmms_eval_module, + "lmms_eval.evaluator": lmms_eval_evaluator_module, + "lmms_eval.models": lmms_eval_models_module, + }, + ), + pytest.raises(ValueError, match=r"Could not auto-detect lmms-eval model_class"), + ): + evaluator.evaluate(handler_stub, []) + + +def test_lmms_evaluator_rejects_unsupported_handler_type(): + """LMMSEvaluator only supports HfModelHandler and ONNXModelHandler-as-ortgenai.""" + evaluator = LMMSEvaluator(tasks=["mmstar"]) + bogus = SimpleNamespace() # neither HfModelHandler nor ONNXModelHandler + + lmms_eval_module = ModuleType("lmms_eval") + lmms_eval_evaluator_module = ModuleType("lmms_eval.evaluator") + lmms_eval_evaluator_module.simple_evaluate = MagicMock() + with ( + patch.dict(sys.modules, {"lmms_eval": lmms_eval_module, "lmms_eval.evaluator": lmms_eval_evaluator_module}), + pytest.raises(ValueError, match=r"ONNXModelHandler.*HfModelHandler"), + ): + evaluator.evaluate(bogus, []) + + +# ----------------------------------------------------------------------------- +# lmms-eval MODEL_REGISTRY_V2 entry-point integration +# ----------------------------------------------------------------------------- + + +def test_lmms_ort_genai_evaluator_is_simple_flag_matches_registration(): + """Verify is_simple matches the lmms-eval registration type. + + MODEL_REGISTRY_V2._validate_model_class requires the class' ``is_simple`` + flag to match the registered model_type (``simple`` vs ``chat``). Our + adapter is registered with ``simple_class_path``, so ``is_simple`` must + be ``True``. 
+ """ + assert LMMSORTGenAIEvaluator.is_simple is True + + +def test_model_manifest_factory_returns_expected_manifest(): + """Verify the entry-point payload points at LMMSORTGenAIEvaluator.""" + pytest.importorskip("lmms_eval") + + manifest = _model_manifest() + + assert manifest.model_id == "ortgenai_mm" + assert manifest.simple_class_path == "olive.evaluator.lmms_ort.LMMSORTGenAIEvaluator" + assert manifest.chat_class_path is None + + +def test_olive_ai_registers_ortgenai_mm_entry_point(): + """Verify olive-ai exposes ortgenai_mm via the lmms_eval.models entry-point group.""" + from importlib.metadata import entry_points + + eps = {ep.name: ep.value for ep in entry_points(group="lmms_eval.models")} + assert eps.get("ortgenai_mm") == "olive.evaluator.lmms_ort:_model_manifest" + + +def test_model_registry_v2_resolves_ortgenai_mm(): + """Verify lmms-eval's MODEL_REGISTRY_V2 resolves ortgenai_mm via the entry point.""" + pytest.importorskip("lmms_eval") + from lmms_eval.models import MODEL_REGISTRY_V2 + + resolved = MODEL_REGISTRY_V2.resolve("ortgenai_mm") + assert resolved.model_id == "ortgenai_mm" + assert resolved.model_type == "simple" + assert resolved.class_path == "olive.evaluator.lmms_ort.LMMSORTGenAIEvaluator" + + cls = MODEL_REGISTRY_V2.get_model_class("ortgenai_mm") + assert cls is LMMSORTGenAIEvaluator + + +# ----------------------------------------------------------------------------- +# Visual partitioning +# ----------------------------------------------------------------------------- + + +def test_partition_visuals_separates_images_and_audios_and_skips_nones(): + from olive.evaluator.lmms_ort import _partition_visuals + + img1 = PIL.Image.new("RGB", (4, 4)) + img2_dict = {"bytes": _png_bytes(PIL.Image.new("RGB", (2, 2)))} + audio_dict = {"array": np.zeros(16, dtype=np.float32), "sampling_rate": 16000} + + images, audios = _partition_visuals([img1, None, img2_dict, audio_dict, None]) + + assert len(images) == 2 + assert all(isinstance(img, 
PIL.Image.Image) for img in images) + assert len(audios) == 1 + arr, sr = audios[0] + assert arr.shape == (16,) + assert sr == 16000 + + +def test_partition_visuals_handles_none_input(): + from olive.evaluator.lmms_ort import _partition_visuals + + assert _partition_visuals(None) == ([], []) + assert _partition_visuals([]) == ([], []) + + +def _png_bytes(image): + import io as _io + + buf = _io.BytesIO() + image.save(buf, format="PNG") + return buf.getvalue() + + +# ----------------------------------------------------------------------------- +# Runtime: _score_continuation and _run_generation +# +# These tests construct a LMMSORTGenAIEvaluator with the onnxruntime_genai +# module wholesale-mocked, so the generation/scoring flows are exercised +# end-to-end without needing an actual ORT-GenAI model on disk. +# ----------------------------------------------------------------------------- + + +class _FakeTokenStream: + def decode(self, tok): + return f"" + + +class _FakeTokenizer: + """Minimal stub for og.Tokenizer used by _score_continuation/_run_generation. + + ``tokens_for`` is a dict mapping the exact input string to the list of + token ids encode() should return; this lets each test inject specific + prompt + (prompt+continuation) tokenizations to drive the slicing logic. + """ + + def __init__(self, tokens_for, eos_token_ids=(99,)): + self._tokens_for = tokens_for + self.eos_token_ids = list(eos_token_ids) + + def encode(self, text): + if text not in self._tokens_for: + raise KeyError(f"_FakeTokenizer: no canned tokenization for text={text!r}") + return list(self._tokens_for[text]) + + def create_stream(self): + return _FakeTokenStream() + + +class _FakeGenerator: + """Records call order so tests can assert the score/generation protocol. + + ``logits_queue`` is a list of numpy arrays consumed in order, one per forward + pass: ``generate_next_token`` and ``append_tokens`` each consume one entry + into ``_current_logits``, which ``get_logits()`` returns. 
+ """ + + instances: ClassVar[list] = [] + + def __init__(self, model, params): + type(self).instances.append(self) + self._model = model + self._params = params + self._logits_queue = list(model._next_logits_queue) + self._sampled_queue = list(model._next_sampled_queue) + self._current_logits = None + self._token_count = 0 + self._done = False + self._last_sampled = -1 + self.calls = [] + + def _consume_forward_pass(self): + if not self._logits_queue: + raise RuntimeError("_FakeGenerator: forward pass exhausted (logits_queue empty)") + self._current_logits = self._logits_queue.pop(0) + + def set_inputs(self, inputs): + # set_inputs only loads inputs; it does NOT trigger a forward pass and + # therefore does NOT populate _current_logits. This is the behavior the + # production code at lmms_ort.py:_score_continuation has to compensate + # for by calling generate_next_token() to force the prompt-fill compute. + self.calls.append(("set_inputs", inputs)) + + def generate_next_token(self): + self._consume_forward_pass() + sampled = self._sampled_queue.pop(0) if self._sampled_queue else -1 + self._last_sampled = sampled + self._token_count += 1 + self.calls.append(("generate_next_token", sampled)) + + def get_logits(self): + if self._current_logits is None: + raise RuntimeError("_FakeGenerator: get_logits called before any forward pass") + return np.asarray(self._current_logits, dtype=np.float32) + + def get_next_tokens(self): + return np.array([self._last_sampled], dtype=np.int32) + + def append_tokens(self, tok_array): + toks = [int(t) for t in np.asarray(tok_array).reshape(-1)] + self.calls.append(("append_tokens", toks)) + self._token_count += len(toks) + # Each appended token batch is one forward pass (computes new last-position logits). 
+ self._consume_forward_pass() + + def token_count(self): + return self._token_count + + def is_done(self): + return self._done + + def rewind_to(self, n): + self.calls.append(("rewind_to", n)) + self._token_count = n + + +class _FakeGeneratorParams: + def __init__(self, model): + self._model = model + + def set_search_options(self, **kwargs): + pass + + +class _FakeProcessor: + def __call__(self, prompt, images=None, audios=None): + return SimpleNamespace(_prompt=prompt, _images=images, _audios=audios) + + +class _FakeOgModel: + def __init__(self, tokenizer=None, next_logits_queue=None, next_sampled_queue=None): + self.tokenizer = tokenizer + self._next_logits_queue = list(next_logits_queue or []) + self._next_sampled_queue = list(next_sampled_queue or []) + + def create_multimodal_processor(self): + return _FakeProcessor() + + +def _make_fake_og(model): + """Build a SimpleNamespace mimicking the onnxruntime_genai module surface. + + Covers the API surface used by LMMSORTGenAIEvaluator's __init__ + runtime paths. + """ + return SimpleNamespace( + Model=lambda *a, **kw: model, + Tokenizer=lambda m: model.tokenizer, + Generator=_FakeGenerator, + GeneratorParams=_FakeGeneratorParams, + Images=SimpleNamespace(open=lambda *paths: ("IMG", list(paths))), + Audios=SimpleNamespace(open=lambda *paths: ("AUDIO", list(paths))), + Config=lambda *a, **kw: SimpleNamespace( + clear_providers=lambda: None, + append_provider=lambda *a, **kw: None, + set_provider_option=lambda *a, **kw: None, + ), + ) + + +def _build_lmms_ortgenai_evaluator(tmp_path, fake_model): + """Construct an LMMSORTGenAIEvaluator wired to a fake onnxruntime_genai. + + Returns ``(evaluator, og_patcher)``. The patcher is a context manager that + swaps in the fake ``og`` module; tests should call evaluator methods inside + ``with og_patcher: ...`` so runtime paths (``_score_continuation``, + ``_run_generation``) also use the fake instead of the real ORT-GenAI. 
+ """ + from olive.evaluator import lmms_ort as lmms_ort_mod + + model_dir = tmp_path / "model" + model_dir.mkdir() + (model_dir / "genai_config.json").write_text('{"model": {"type": "phi4mm"}}', encoding="utf-8") + + fake_og = _make_fake_og(fake_model) + _FakeGenerator.instances = [] + og_patcher = patch.object(lmms_ort_mod, "og", fake_og) + with og_patcher: + evaluator = LMMSORTGenAIEvaluator( + pretrained=str(model_dir), + batch_size=1, + max_new_tokens=8, + max_length=64, + execution_provider="cpu", + fail_on_error=True, + ) + # Return a fresh patcher for the test to use during runtime calls. + return evaluator, patch.object(lmms_ort_mod, "og", fake_og) + + +def test_score_continuation_uses_joint_tokenization_to_slice_continuation(tmp_path): + """Encoding ``continuation`` standalone is wrong for sentencepiece/BPE tokenizers. + + The adapter must encode ``prompt + continuation`` jointly and slice off the + prompt-aligned prefix so the scored tokens are the ones the model would + actually produce extending the prompt. Verify by giving the tokenizer + DIFFERENT tokens for ``continuation`` vs the prompt-suffix of + ``prompt + continuation``: only the latter should land in append_tokens. + """ + prompt = "<|user|>What is in the image?<|end|><|assistant|>" + continuation = "A" + + fake_tokenizer = _FakeTokenizer( + { + prompt: [1, 2, 3, 4], + prompt + continuation: [1, 2, 3, 4, 17, 18], # cont tokens = [17, 18] + continuation: [99999], # standalone-encoded - MUST NOT be used + } + ) + + # Three forward passes: one prompt-fill (generate_next_token) + two cont tokens. 
+ vocab_size = 50 + logits_prompt_end = np.full(vocab_size, -10.0, dtype=np.float32) + logits_prompt_end[17] = 5.0 # greedy = 17 + logits_after_17 = np.full(vocab_size, -10.0, dtype=np.float32) + logits_after_17[18] = 5.0 # greedy = 18 + logits_after_18 = np.full(vocab_size, -10.0, dtype=np.float32) + + fake_model = _FakeOgModel( + tokenizer=fake_tokenizer, + next_logits_queue=[logits_prompt_end, logits_after_17, logits_after_18], + next_sampled_queue=[42], + ) + + evaluator, og_patcher = _build_lmms_ortgenai_evaluator(tmp_path, fake_model) + + with og_patcher: + logprob, is_greedy = evaluator._score_continuation(prompt, continuation, images=[], audios=[]) + + gen = _FakeGenerator.instances[-1] + set_inputs_idx = next(i for i, c in enumerate(gen.calls) if c[0] == "set_inputs") + next_token_idx = next(i for i, c in enumerate(gen.calls) if c[0] == "generate_next_token") + append_calls = [c for c in gen.calls if c[0] == "append_tokens"] + + # set_inputs runs BEFORE the prompt-fill forward pass. + assert set_inputs_idx < next_token_idx + # The throwaway sample is rewound after the first iteration, before the first + # real continuation token is appended. + rewind_calls = [c for c in gen.calls if c[0] == "rewind_to"] + assert len(rewind_calls) == 1 + # cont_tokens were correctly sliced from prompt+continuation, NOT taken from + # encode(continuation) standalone (which would have been [99999]). + assert [tok for _, tok in append_calls] == [[17], [18]] + # Both predicted tokens were greedy (== argmax of their position's logits). + assert is_greedy is True + assert logprob < 0.0 # softmax(logits)[tok] is a probability in (0, 1) -> log negative + + +def test_score_continuation_triggers_forward_pass_before_first_get_logits(tmp_path): + """Trigger compute after ``set_inputs`` before reading ``get_logits()``. + + ``set_inputs()`` does not run the decoder forward pass. 
The adapter must + explicitly trigger it (``generate_next_token``) before the first + ``get_logits()`` call, or that read returns undefined data. + """ + prompt = "<|user|>x<|end|><|assistant|>" + continuation = "y" + fake_tokenizer = _FakeTokenizer({prompt: [1, 2], prompt + continuation: [1, 2, 5], continuation: [777]}) + logits = np.zeros(10, dtype=np.float32) + logits[5] = 1.0 + fake_model = _FakeOgModel( + tokenizer=fake_tokenizer, + next_logits_queue=[logits, np.zeros(10, dtype=np.float32)], + next_sampled_queue=[0], + ) + + evaluator, og_patcher = _build_lmms_ortgenai_evaluator(tmp_path, fake_model) + with og_patcher: + evaluator._score_continuation(prompt, continuation, images=[], audios=[]) + + gen = _FakeGenerator.instances[-1] + # The order must be: set_inputs, generate_next_token (prompt-fill compute), + # then the loop's append_tokens. get_logits is read implicitly between + # generate_next_token and the first append_tokens. + op_names = [c[0] for c in gen.calls] + set_inputs_idx = op_names.index("set_inputs") + next_token_idx = op_names.index("generate_next_token") + first_append_idx = op_names.index("append_tokens") + assert set_inputs_idx < next_token_idx < first_append_idx + + +def test_score_continuation_returns_zero_when_continuation_tokenizes_to_empty_suffix(tmp_path): + """Short-circuit cleanly when continuation contributes no tokens. + + Happens when prompt+cont tokenizes to the same length as prompt (e.g. cont + is just whitespace absorbed by tokenizer normalization). The adapter must + short-circuit instead of feeding an empty cont_tokens list to the loop. 
+ """ + prompt = "<|user|>x<|end|><|assistant|>" + continuation = "" + fake_tokenizer = _FakeTokenizer({prompt: [1, 2, 3], prompt + continuation: [1, 2, 3]}) + fake_model = _FakeOgModel(tokenizer=fake_tokenizer) + + evaluator, og_patcher = _build_lmms_ortgenai_evaluator(tmp_path, fake_model) + with og_patcher: + logprob, is_greedy = evaluator._score_continuation(prompt, continuation, images=[], audios=[]) + + assert (logprob, is_greedy) == (0.0, True) + # No generator should have been constructed for an empty continuation. + assert not _FakeGenerator.instances + + +def test_run_generation_stops_on_eos_token(tmp_path): + """Verify ``_run_generation`` stops at EOS tokens. + + ``_run_generation`` must respect ``_eos_token_ids`` and stop emitting text + once the model samples an EOS token, even before max_new_tokens is reached. + """ + fake_tokenizer = _FakeTokenizer({}, eos_token_ids=[99]) + logits = np.zeros(100, dtype=np.float32) + fake_model = _FakeOgModel( + tokenizer=fake_tokenizer, + next_logits_queue=[logits] * 5, + next_sampled_queue=[7, 8, 99, 10, 11], # third sample is EOS + ) + + evaluator, og_patcher = _build_lmms_ortgenai_evaluator(tmp_path, fake_model) + with og_patcher: + out = evaluator._run_generation("p", images=[], audios=[], max_new_tokens=5) + + # Stream produces "" per non-EOS token; EOS stops generation. + assert out == "" + gen = _FakeGenerator.instances[-1] + op_names = [c[0] for c in gen.calls] + # Exactly 3 generate_next_token calls happened (7, 8, then EOS stops loop). 
+ assert op_names.count("generate_next_token") == 3 + + +def test_run_generation_stops_on_explicit_stop_string(tmp_path): + """``stop_strings`` should truncate output as soon as a stop sequence appears.""" + fake_tokenizer = _FakeTokenizer({}, eos_token_ids=[]) + logits = np.zeros(10, dtype=np.float32) + fake_model = _FakeOgModel( + tokenizer=fake_tokenizer, + next_logits_queue=[logits] * 4, + next_sampled_queue=[1, 2, 3, 4], + ) + + evaluator, og_patcher = _build_lmms_ortgenai_evaluator(tmp_path, fake_model) + # _FakeTokenStream emits "..."; "" appears after the 2nd token. + with og_patcher: + out = evaluator._run_generation("p", images=[], audios=[], max_new_tokens=10, stop_strings=[""]) + + # Output is truncated at (but not including) the stop string. + assert out == "" From ba5d1604c9c8123ca869ee11560eb9d30f8406b6 Mon Sep 17 00:00:00 2001 From: Delwin Kim <139003345+DelwinKim@users.noreply.github.com> Date: Mon, 15 Jun 2026 20:09:01 +0000 Subject: [PATCH 2/6] evaluator: extend LMMSEvaluator HF dispatch + add CompositeToOnnxPackage Build on top of the LMMSEvaluator + ORT-GenAI multimodal adapter foundation: - LMMSEvaluator now dispatches HfModelHandler inputs to lmms-eval's native per-architecture wrappers (phi4_multimodal, qwen2_5_vl, whisper, ...), with auto-detection from HF model_type and a forwarded-kwargs filter that only passes args the target wrapper actually declares (handles wrappers like qwen2_5_vl which assert kwargs == {}). Enables FP-vs-quantized comparison in a single recipe via evaluate_input_model. - lmms_ort.py adapter: tolerant audio/image disambiguation (audio dicts with "path" no longer get mis-routed to PIL.Image.open), Whisper-specific prompt + EOS-collision handling so ASR works end-to-end through ortgenai_mm without the Phi-4-MM chat-template scaffolding interfering. - New CompositeToOnnxPackage pass: flattens nested CompositeModel ORT-GenAI packages (subdir-per-component or root-level) into the flat layout LMMSEvaluator expects. 
Tolerates extensionless component filenames produced by some upstream quant passes. - Tests: 32 in test_lmms_ort.py (entry-point/registry, HF dispatch, kwargs filter, prompt builder, score_continuation, partition_visuals, run_generation), 9 in test_composite_to_onnx_package.py (flatten + external-data rewrites + fallback entry-point). Validated end-to-end: - whisper-large-v3 via HfModel -> ModelBuilder fp16 -> KQuant int8 -> CompositeToOnnxPackage -> ortgenai_mm eval on LibriSpeech. FP HF WER 1.52/2.26 (clean/other), INT8 ONNX WER 1.68/2.36. --- olive/evaluator/lmms_ort.py | 172 ++++++--- olive/evaluator/olive_evaluator.py | 7 + olive/olive_config.json | 8 + .../passes/onnx/composite_to_onnx_package.py | 309 +++++++++++++++++ test/evaluator/test_lmms_ort.py | 84 +++++ .../onnx/test_composite_to_onnx_package.py | 325 ++++++++++++++++++ 6 files changed, 863 insertions(+), 42 deletions(-) create mode 100644 olive/passes/onnx/composite_to_onnx_package.py create mode 100644 test/passes/onnx/test_composite_to_onnx_package.py diff --git a/olive/evaluator/lmms_ort.py b/olive/evaluator/lmms_ort.py index 63fc67fdb8..635f20cf70 100644 --- a/olive/evaluator/lmms_ort.py +++ b/olive/evaluator/lmms_ort.py @@ -75,52 +75,71 @@ def decorator(cls): # ----------------------------------------------------------------------------- +_IMAGE_SUFFIXES = {".jpg", ".jpeg", ".png", ".webp", ".bmp", ".gif"} +_AUDIO_SUFFIXES = {".wav", ".mp3", ".flac", ".ogg", ".m4a"} + + def _normalize_image(visual) -> PIL.Image.Image | None: if isinstance(visual, PIL.Image.Image): return visual.convert("RGB") if isinstance(visual, (str, Path)): p = Path(visual) - if p.suffix.lower() in {".jpg", ".jpeg", ".png", ".webp", ".bmp", ".gif"}: + if p.suffix.lower() in _IMAGE_SUFFIXES: return PIL.Image.open(p).convert("RGB") return None if isinstance(visual, dict): + # Audio dicts typically include "sampling_rate" or "array"; skip those. 
+ if "sampling_rate" in visual or "array" in visual: + return None if "bytes" in visual: return PIL.Image.open(io.BytesIO(visual["bytes"])).convert("RGB") if "path" in visual: - return PIL.Image.open(visual["path"]).convert("RGB") + p = Path(visual["path"]) if visual["path"] else None + if p is not None and p.suffix.lower() in _IMAGE_SUFFIXES: + return PIL.Image.open(p).convert("RGB") + return None if isinstance(visual, np.ndarray): return PIL.Image.fromarray(np.uint8(visual)).convert("RGB") return None def _normalize_audio(visual) -> tuple[np.ndarray, int] | None: - if isinstance(visual, dict) and "array" in visual and "sampling_rate" in visual: - return np.asarray(visual["array"], dtype=np.float32), int(visual["sampling_rate"]) + if isinstance(visual, dict): + if "array" in visual and "sampling_rate" in visual: + return np.asarray(visual["array"], dtype=np.float32), int(visual["sampling_rate"]) + if visual.get("path"): + return _load_audio_file(Path(visual["path"])) if isinstance(visual, (str, Path)): - p = Path(visual) - if p.suffix.lower() in {".wav", ".mp3", ".flac", ".ogg", ".m4a"}: - try: - import librosa - except ImportError: - logger.warning("Audio file %s encountered but librosa not installed.", p) - return None - arr, sr = librosa.load(str(p), sr=None, mono=True) - return arr.astype(np.float32), int(sr) + return _load_audio_file(Path(visual)) return None +def _load_audio_file(p: Path) -> tuple[np.ndarray, int] | None: + if p.suffix.lower() not in _AUDIO_SUFFIXES or not p.exists(): + return None + try: + import librosa + except ImportError: + logger.warning("Audio file %s encountered but librosa not installed.", p) + return None + arr, sr = librosa.load(str(p), sr=None, mono=True) + return arr.astype(np.float32), int(sr) + + def _partition_visuals(visuals): images, audios = [], [] for v in visuals or []: if v is None: continue - img = _normalize_image(v) - if img is not None: - images.append(img) - continue + # Try audio first since its signature 
("array"+"sampling_rate") is more + # distinctive than the image path/bytes/PIL signatures. au = _normalize_audio(v) if au is not None: audios.append(au) + continue + img = _normalize_image(v) + if img is not None: + images.append(img) return images, audios @@ -138,12 +157,23 @@ def _build_prompt( image_token_format: str = "<|image_{index}|>", audio_token_format: str = "<|audio_{index}|>", ) -> str: - """Build a Phi-4-multimodal-style chat prompt. + """Build a chat-style prompt for the model. + + Defaults to Phi-4-multimodal's chat template. For Whisper (pure ASR; no + chat template), ``user_text`` is ignored and a fixed decoder-start token + sequence (``<|startoftranscript|><|en|><|transcribe|><|notimestamps|>``) + is returned instead. Other multimodal architectures use different placeholder tags. Users can - override the media token formats and the full prompt template from the Olive - evaluator config without changing this adapter. + override the media token formats and the full prompt template from the + Olive evaluator config without changing this adapter. """ + if model_type == "whisper": + # Whisper has no chat template; the "prompt" is just the decoder-start + # token sequence that conditions the model on language + task. This + # matches ORT-GenAI's benchmark_multimodal.py reference.
+ # Source: microsoft/onnxruntime-genai benchmark/python/benchmark_multimodal.py + return "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>" image_tokens = "".join(_format_media_tokens(num_images, image_token_format)) audio_tokens = "".join(_format_media_tokens(num_audios, audio_token_format)) parts = [image_tokens, audio_tokens, user_text] @@ -258,6 +288,18 @@ def __init__( self._tokenizer = og.Tokenizer(self._model) self._processor = self._model.create_multimodal_processor() + # Default prompt-builder path: og.Tokenizer.apply_chat_template (matches + # PR #2488's OnnxEvaluator._inference_vision_genai and the olive-recipes + # eval scripts for Qwen2.5-VL, Qwen3-VL, and google-gemma-4). Older + # onnxruntime-genai versions don't expose this method, in which case we + # fall back to the legacy format-string path (_build_prompt below). + self._has_chat_template = hasattr(self._tokenizer, "apply_chat_template") + if not self._has_chat_template: + logger.warning( + "ORT-GenAI tokenizer does not expose apply_chat_template; falling back to " + "legacy format-string prompt building. Consider upgrading onnxruntime-genai." + ) + eos_ids = self._tokenizer.eos_token_ids self._eos_token_ids = {int(t) for t in (eos_ids if eos_ids is not None else [])} @@ -338,7 +380,10 @@ def _run_generation( og_audios = self._build_og_audios(audios, tmp_dir) try: - inputs = self._processor(prompt, images=og_images, audios=og_audios) + # ORT-GenAI processors accept either a bare string or a list of + # strings depending on backend; benchmark_multimodal.py wraps in + # a list, which matches both the whisper and phi4mm paths. + inputs = self._processor([prompt], images=og_images, audios=og_audios) except Exception as e: # pragma: no cover del generator return self._handle_error("ORT-GenAI multimodal processor failed.", e, "") @@ -351,14 +396,23 @@ def _run_generation( "ORT-GenAI generator input setup failed. 
The prompt may exceed max_length.", e, "" ) + # Whisper's BOS == EOS (token 50257 = <|startoftranscript|> = <|endoftext|>), + # so the very first generated token can collide with EOS. Skip the + # EOS check until we've emitted at least one non-EOS token. decoded = "" stream = self._tokenizer.create_stream() steps = 0 + generated_any = False while not generator.is_done() and steps < max_new_tokens: generator.generate_next_token() tok = int(generator.get_next_tokens()[0]) if tok in self._eos_token_ids: - break + if generated_any: + break + # First-step EOS collision with BOS; skip and keep generating. + steps += 1 + continue + generated_any = True decoded += stream.decode(tok) if stop_strings: for s in stop_strings: @@ -441,6 +495,58 @@ def _score_continuation(self, prompt: str, continuation: str, images, audios) -> del generator return total_logprob, all_greedy + def _build_prompt_for_request(self, user_text: str, num_images: int, num_audios: int) -> str: + """Build the final prompt string fed to ``og.MultiModalProcessor``. + + Default path: pre-render image/audio markers into the user content + string using ``image_token_format`` / ``audio_token_format``, then call + ``og.Tokenizer.apply_chat_template`` to add the model-specific chat + scaffolding (system/user/assistant turn markers). + + Pure content-parts (``{"type": "image"}``) is what PR #2488 and the + olive-recipes Qwen2.5-VL eval scripts do, and it works for chat + templates that understand structured content (Qwen2.5-VL, Qwen3-VL, + Gemma-4). However, Phi-4-MM's chat template stringifies content lists + as Python repr (verified: produces + ``<|user|>[{'type': 'image'}, ...]<|end|>`` instead of injecting + ``<|image_1|>``). Pre-rendering the markers ourselves before + ``apply_chat_template`` works for both conventions, since templates + that just pass through user content render identically either way. + + Fallback path: ``_build_prompt`` legacy format-string. 
Used when the + user has explicitly set ``prompt_template`` in the evaluator config + (to override per-benchmark) or when the underlying onnxruntime-genai + version predates ``apply_chat_template`` on ``og.Tokenizer``. + """ + if self._model_type == "whisper": + # Whisper has no chat template; the "prompt" is just the decoder-start + # token sequence that conditions on language + task. user_text from + # lmms-eval tasks (e.g. "Please recognize the speech...") is ignored. + return _build_prompt(self._model_type, num_images, num_audios, user_text) + + if self.prompt_template or not self._has_chat_template: + return _build_prompt( + self._model_type, + num_images, + num_audios, + user_text, + self.system_prompt, + self.prompt_template, + self.image_token_format, + self.audio_token_format, + ) + + image_markers = "".join(_format_media_tokens(num_images, self.image_token_format)) + audio_markers = "".join(_format_media_tokens(num_audios, self.audio_token_format)) + user_content = f"{image_markers}{audio_markers}{user_text}" + + messages: list[dict[str, Any]] = [] + if self.system_prompt: + messages.append({"role": "system", "content": self.system_prompt}) + messages.append({"role": "user", "content": user_content}) + + return self._tokenizer.apply_chat_template(json.dumps(messages), add_generation_prompt=True) + def _get_doc_and_visuals(self, doc_to_visual, doc_id, task, split): try: doc = self.task_dict[task][split][doc_id] @@ -473,16 +579,7 @@ def generate_until(self, requests: list[Instance], disable_tqdm: bool = False) - if isinstance(stop, str): stop = [stop] - prompt = _build_prompt( - self._model_type, - len(images), - len(audios), - contexts, - self.system_prompt, - self.prompt_template, - self.image_token_format, - self.audio_token_format, - ) + prompt = self._build_prompt_for_request(contexts, len(images), len(audios)) text = self._run_generation(prompt, images, audios, max_new, stop) results.append(text) self.cache_hook.add_partial("generate_until", 
(contexts, gen_kwargs), text) @@ -499,16 +596,7 @@ def loglikelihood(self, requests: list[Instance], disable_tqdm: bool = False) -> images, audios = _partition_visuals(visuals) continuation = str(doc_to_target(doc)) - prompt = _build_prompt( - self._model_type, - len(images), - len(audios), - contexts, - self.system_prompt, - self.prompt_template, - self.image_token_format, - self.audio_token_format, - ) + prompt = self._build_prompt_for_request(contexts, len(images), len(audios)) logprob, is_greedy = self._score_continuation(prompt, continuation, images, audios) results.append((logprob, is_greedy)) self.cache_hook.add_partial("loglikelihood", (contexts, continuation), (logprob, is_greedy)) diff --git a/olive/evaluator/olive_evaluator.py b/olive/evaluator/olive_evaluator.py index af73fabe64..ee7d63286e 100644 --- a/olive/evaluator/olive_evaluator.py +++ b/olive/evaluator/olive_evaluator.py @@ -2084,6 +2084,13 @@ def __init__(self, tasks: list[str], **kwargs): self.prompt_template = kwargs.get("prompt_template") self.image_token_format = kwargs.get("image_token_format", "<|image_{index}|>") self.audio_token_format = kwargs.get("audio_token_format", "<|audio_{index}|>") + # NOTE: ``prompt_template`` / ``image_token_format`` / ``audio_token_format`` + # are legacy format-string knobs and should rarely be needed. By default + # the ortgenai_mm adapter calls ``og.Tokenizer.apply_chat_template`` (same + # path used by PR #2488 and olive-recipes eval scripts), which reads the + # package's ``chat_template.jinja`` and produces the correct chat format + # for every supported model automatically. Setting ``prompt_template`` + # forces the adapter into the legacy hand-templated path. # HF-only knobs (forwarded to lmms-eval's native wrapper if present). # ``trust_remote_code`` defaults to False to match the rest of Olive # (e.g. 
olive/common/hf/utils.py, olive/data/component/load_dataset.py) diff --git a/olive/olive_config.json b/olive/olive_config.json index 1978e61dcb..bac23f921c 100644 --- a/olive/olive_config.json +++ b/olive/olive_config.json @@ -8,6 +8,14 @@ "supported_algorithms": [ ], "supported_quantization_encodings": [ ] }, + "CompositeToOnnxPackage": { + "module_path": "olive.passes.onnx.composite_to_onnx_package.CompositeToOnnxPackage", + "supported_providers": [ "*" ], + "supported_accelerators": [ "*" ], + "supported_precisions": [ "*" ], + "supported_algorithms": [ ], + "supported_quantization_encodings": [ ] + }, "AimetQuantization": { "module_path": "olive.passes.onnx.aimet_quantization.AimetQuantization", "supported_providers": [ "*" ], diff --git a/olive/passes/onnx/composite_to_onnx_package.py b/olive/passes/onnx/composite_to_onnx_package.py new file mode 100644 index 0000000000..5f0814164f --- /dev/null +++ b/olive/passes/onnx/composite_to_onnx_package.py @@ -0,0 +1,309 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +"""Convert a multi-component CompositeModel ORT-GenAI package into a flat ONNX package.""" + +from __future__ import annotations + +import json +import logging +from pathlib import Path +from typing import TYPE_CHECKING + +import onnx + +from olive.common.utils import hardlink_copy_file +from olive.model import ONNXModelHandler +from olive.model.handler.composite import CompositeModelHandler +from olive.passes import Pass +from olive.passes.pass_config import BasePassConfig, PassConfigParam + +if TYPE_CHECKING: + from olive.hardware.accelerator import AcceleratorSpec + +logger = logging.getLogger(__name__) + + +class CompositeToOnnxPackage(Pass): + """Flatten a CompositeModel ORT-GenAI package into a single ONNXModel handler. 
+ + MobiusBuilder and similar passes emit multi-component ORT-GenAI packages as a + :class:`CompositeModelHandler` whose components live in subdirectories:: + + output_dir/ + genai_config.json + tokenizer.json + decoder/model.onnx + vision_encoder/model.onnx + audio_encoder/model.onnx + embedding/model.onnx + + Olive's evaluators (e.g. ``OnnxEvaluator._inference_vision_genai`` and the + ``LMMSEvaluator``) detect ORT-GenAI packages by looking for ``genai_config.json`` + next to an ONNX file referenced by an :class:`ONNXModelHandler`. The nested + subdirectory layout above defeats that detection because the entry-point ONNX + file's parent (e.g. ``output_dir/decoder/``) does not contain + ``genai_config.json``. + + This pass produces an equivalent flat layout:: + + output_dir/ + genai_config.json + tokenizer.json + decoder.onnx + vision_encoder.onnx + audio_encoder.onnx + embedding.onnx + + by writing each component to the package root (re-serialized with a renamed + ``.data`` sidecar if it has one, hardlinked otherwise) and rewriting + ``genai_config.json`` to match. The returned :class:`ONNXModelHandler` points at + the entry-point component (defaults to ``decoder``), so evaluators can auto-detect + the package via ``Path(model.model_path).parent / "genai_config.json"``. + """ + + _accepts_composite_model = True + + @classmethod + def _default_config(cls, accelerator_spec: AcceleratorSpec) -> dict[str, PassConfigParam]: + return { + "entry_point_component": PassConfigParam( + type_=str, + default_value="decoder", + description=( + "Name of the genai_config 'model' subsection (e.g. 'decoder', 'text') " + "whose ONNX file the returned ONNXModelHandler will point at. If the " + "name is not found, falls back to the first component with a 'filename' field." + ), + ), + } + + @classmethod + def is_accelerator_agnostic(cls, accelerator_spec: AcceleratorSpec) -> bool: + # Pure file-system / config rewrite — no EP-specific behavior.
+ return True + + def _run_for_config( + self, + model: CompositeModelHandler, + config: type[BasePassConfig], + output_model_path: str, + ) -> ONNXModelHandler: + if not isinstance(model, CompositeModelHandler): + raise ValueError( + f"CompositeToOnnxPackage expects a CompositeModelHandler input, got {type(model).__name__}." + ) + + src_dir = Path(model.model_path).resolve() + if not src_dir.is_dir(): + raise ValueError(f"CompositeModel model_path is not a directory: {src_dir}") + + src_genai_config = src_dir / "genai_config.json" + if not src_genai_config.is_file(): + raise ValueError( + f"CompositeToOnnxPackage requires genai_config.json at the package root: {src_genai_config} not found." + ) + + dst_dir = self._resolve_output_dir(output_model_path) + dst_dir.mkdir(parents=True, exist_ok=True) + + genai_config = json.loads(src_genai_config.read_text(encoding="utf-8")) + model_section = genai_config.get("model") + if not isinstance(model_section, dict): + raise ValueError(f"Invalid genai_config.json at {src_genai_config}: missing 'model' section.") + + rewrite_map = self._build_rewrite_map(model_section) + if not rewrite_map: + raise ValueError( + f"No component subsections with 'filename' found in genai_config.json at {src_genai_config}." + ) + + # Copy each component ONNX into the flat layout, rewriting external-data + # references so each initializer points at the renamed sidecar. + # + # We can't just hardlink the .onnx file and its .data sidecar to the new + # names, because each ONNX file embeds the external-data filename + # ("location" entry in the proto). After renaming, those embedded + # pointers still reference the old name (e.g. "model.onnx.data") and + # ONNX Runtime fails at load with "External data path does not exist". + # ``onnx.save_model(..., save_as_external_data=True, location=...)`` + # serializes a new ONNX file whose embedded location matches the new + # filename, and writes the corresponding .data file alongside. 
+ for old_rel, new_name in rewrite_map.items(): + src_file = self._resolve_component_source(src_dir, old_rel) + if src_file is None: + raise ValueError(f"Component file referenced by genai_config not found: {src_dir / old_rel}") + + src_data = self._resolve_component_data(src_file) + dst_file = dst_dir / new_name + dst_data_name = f"{new_name}.data" + dst_data_file = dst_dir / dst_data_name + + if src_data is not None: + # Load model and resolve external initializer tensors so we can + # re-serialize them under the new filename. ``load_external_data + # =True`` (the default) materializes initializer bytes into the + # in-memory proto via the source directory layout, after which + # we can write them back out with a new ``location``. + onnx_model = onnx.load(str(src_file), load_external_data=True) + # Remove any pre-existing destination files to avoid onnx + # appending to a stale .data sidecar on rerun. + if dst_file.exists(): + dst_file.unlink() + if dst_data_file.exists(): + dst_data_file.unlink() + onnx.save_model( + onnx_model, + str(dst_file), + save_as_external_data=True, + all_tensors_to_one_file=True, + location=dst_data_name, + ) + else: + # No external data sidecar — model is self-contained, plain copy. + hardlink_copy_file(src_file, dst_file) + + # Copy every top-level shared sidecar (tokenizer, processor configs, chat template, etc.). + # genai_config.json is rewritten below, so skip it here. + for src_file in src_dir.iterdir(): + if not src_file.is_file() or src_file.name == "genai_config.json": + continue + dst_file = dst_dir / src_file.name + if not dst_file.exists(): + hardlink_copy_file(src_file, dst_file) + + # Update filename references and write the rewritten config. 
+ for component_cfg in model_section.values(): + if isinstance(component_cfg, dict): + old_name = component_cfg.get("filename") + if isinstance(old_name, str) and old_name in rewrite_map: + component_cfg["filename"] = rewrite_map[old_name] + + (dst_dir / "genai_config.json").write_text( + json.dumps(genai_config, indent=2), + encoding="utf-8", + ) + + entry_filename = self._select_entry_filename(model_section, config.entry_point_component) + if entry_filename is None: + raise ValueError( + "Failed to determine an entry-point component for CompositeToOnnxPackage. " + f"Requested '{config.entry_point_component}', no component matched and no fallback available." + ) + + logger.info( + "CompositeToOnnxPackage: flattened %d components into '%s' (entry_point=%s)", + len(rewrite_map), + dst_dir, + entry_filename, + ) + + return ONNXModelHandler( + model_path=str(dst_dir), + onnx_file_name=entry_filename, + model_attributes={ + "ort_genai_package": True, + "entry_point_component": config.entry_point_component, + "flattened_from_composite": True, + **(model.model_attributes or {}), + }, + ) + + @staticmethod + def _resolve_output_dir(output_model_path: str) -> Path: + """Olive sometimes passes a `.onnx` file path; in that case use its stem as the directory.""" + output_path = Path(output_model_path) + if output_path.suffix == ".onnx": + return output_path.parent / output_path.stem + return output_path + + @staticmethod + def _resolve_component_source(src_dir: Path, old_rel: str) -> Path | None: + """Resolve the on-disk source file for a component referenced by genai_config. + + Some upstream Olive passes (notably ``OnnxKQuantQuantization`` when given a + component already named ``decoder.onnx``) save the quantized model with + the ``.onnx`` extension stripped — producing ``decoder``/``encoder`` files + next to ``decoder.data``/``encoder.data`` while ``genai_config.json`` still + references the original ``decoder.onnx``/``encoder.onnx``. 
Accept the + extensionless variant so we can still flatten such packages without + requiring an upstream fix. + """ + candidate = src_dir / old_rel + if candidate.is_file(): + return candidate + stripped = src_dir / Path(old_rel).stem + if stripped.is_file(): + return stripped + return None + + @staticmethod + def _resolve_component_data(src_file: Path) -> Path | None: + """Resolve the external-data sidecar for a component source file. + + Tries ``<name>.data`` first (matches ONNX's default ``<model>.onnx.data`` + sidecar). For extensionless source files emitted by buggy upstream + passes, also accepts ``<stem>.data`` (e.g. ``decoder`` + ``decoder.data``). + """ + primary = src_file.with_name(src_file.name + ".data") + if primary.is_file(): + return primary + if src_file.suffix == "": + alt = src_file.with_name(src_file.stem + ".data") + if alt.is_file(): + return alt + return None + + @staticmethod + def _build_rewrite_map(model_section: dict) -> dict[str, str]: + """Map each old relative filename to a unique flat root-level filename. + + Uses the immediate parent directory name when the component lives in a + subdirectory (``decoder/model.onnx`` -> ``decoder.onnx``). Falls back to + the genai_config key when the file is already flat. If the resulting + name collides, guarantees uniqueness by appending a numeric suffix.
+ """ + used_names: set[str] = set() + rewrite_map: dict[str, str] = {} + + for component_key, component_cfg in model_section.items(): + if not isinstance(component_cfg, dict): + continue + old_path = component_cfg.get("filename") + if not isinstance(old_path, str) or not old_path: + continue + if old_path in rewrite_map: + continue + + old_path_obj = Path(old_path) + parent_name = old_path_obj.parent.name + candidate_base = parent_name or component_key + candidate = f"{candidate_base}.onnx" + + if candidate in used_names: + suffix = 1 + while f"{candidate_base}_{suffix}.onnx" in used_names: + suffix += 1 + candidate = f"{candidate_base}_{suffix}.onnx" + + used_names.add(candidate) + rewrite_map[old_path] = candidate + + return rewrite_map + + @staticmethod + def _select_entry_filename(model_section: dict, entry_point_component: str) -> str | None: + """Pick the flat filename for the entry-point component, falling back if missing.""" + preferred = model_section.get(entry_point_component) + if isinstance(preferred, dict): + filename = preferred.get("filename") + if isinstance(filename, str): + return filename + + for component_cfg in model_section.values(): + if isinstance(component_cfg, dict): + filename = component_cfg.get("filename") + if isinstance(filename, str): + return filename + return None diff --git a/test/evaluator/test_lmms_ort.py b/test/evaluator/test_lmms_ort.py index 408be890ad..595b158224 100644 --- a/test/evaluator/test_lmms_ort.py +++ b/test/evaluator/test_lmms_ort.py @@ -62,6 +62,90 @@ def test_normalize_execution_provider(execution_provider, expected): assert _normalize_execution_provider(execution_provider) == expected +def _make_evaluator_for_prompt_tests(): + """Construct an LMMSORTGenAIEvaluator with __init__ skipped. + + Lets us unit-test the prompt-building path without needing a real ORT-GenAI + model on disk. 
+ """ + inst = LMMSORTGenAIEvaluator.__new__(LMMSORTGenAIEvaluator) + inst._tokenizer = MagicMock(name="og.Tokenizer") + inst._model_type = "test_model" + inst.system_prompt = "You are helpful." + inst.prompt_template = None + inst.image_token_format = "<|image_{index}|>" + inst.audio_token_format = "<|audio_{index}|>" + inst._has_chat_template = True + return inst + + +def test_build_prompt_for_request_uses_apply_chat_template_by_default(): + inst = _make_evaluator_for_prompt_tests() + inst._tokenizer.apply_chat_template.return_value = "" + + out = inst._build_prompt_for_request("What is in the image?", num_images=1, num_audios=0) + + assert out == "" + inst._tokenizer.apply_chat_template.assert_called_once() + messages_json_arg = inst._tokenizer.apply_chat_template.call_args.args[0] + assert inst._tokenizer.apply_chat_template.call_args.kwargs.get("add_generation_prompt") is True + + import json as _json + + messages = _json.loads(messages_json_arg) + assert messages[0] == {"role": "system", "content": "You are helpful."} + assert messages[1] == {"role": "user", "content": "<|image_1|>What is in the image?"} + + +def test_build_prompt_for_request_skips_system_when_empty(): + inst = _make_evaluator_for_prompt_tests() + inst.system_prompt = "" + inst._tokenizer.apply_chat_template.return_value = "out" + + inst._build_prompt_for_request("Q", num_images=0, num_audios=0) + + import json as _json + + messages = _json.loads(inst._tokenizer.apply_chat_template.call_args.args[0]) + assert all(m["role"] != "system" for m in messages) + assert messages[-1] == {"role": "user", "content": "Q"} + + +def test_build_prompt_for_request_includes_audio_markers(): + inst = _make_evaluator_for_prompt_tests() + inst.system_prompt = "" + inst._tokenizer.apply_chat_template.return_value = "out" + + inst._build_prompt_for_request("Q", num_images=2, num_audios=1) + + import json as _json + + messages = _json.loads(inst._tokenizer.apply_chat_template.call_args.args[0]) + assert 
messages[0]["content"] == "<|image_1|><|image_2|><|audio_1|>Q" + + +def test_build_prompt_for_request_falls_back_to_legacy_when_prompt_template_set(): + inst = _make_evaluator_for_prompt_tests() + inst.prompt_template = "{system_prompt}|{user_content}" + + out = inst._build_prompt_for_request("Q", num_images=1, num_audios=0) + + assert out == "You are helpful.|<|image_1|>Q" + inst._tokenizer.apply_chat_template.assert_not_called() + + +def test_build_prompt_for_request_falls_back_when_chat_template_unavailable(): + inst = _make_evaluator_for_prompt_tests() + inst._has_chat_template = False # simulate older onnxruntime-genai + + out = inst._build_prompt_for_request("Q", num_images=1, num_audios=0) + + # Default legacy template wraps with Phi-4-MM-style tokens. + assert "<|image_1|>" in out + assert "Q" in out + inst._tokenizer.apply_chat_template.assert_not_called() + + def test_lmms_evaluator_converts_lmms_results(tmp_path): model_dir = tmp_path / "model" model_dir.mkdir() diff --git a/test/passes/onnx/test_composite_to_onnx_package.py b/test/passes/onnx/test_composite_to_onnx_package.py new file mode 100644 index 0000000000..8a6ea02b96 --- /dev/null +++ b/test/passes/onnx/test_composite_to_onnx_package.py @@ -0,0 +1,325 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- +"""Tests for the CompositeToOnnxPackage pass.""" + +import json +from pathlib import Path + +import numpy as np +import onnx +import pytest +from onnx import TensorProto, helper, numpy_helper + +from olive.model import ONNXModelHandler +from olive.model.handler.composite import CompositeModelHandler +from olive.passes.olive_pass import create_pass_from_dict +from olive.passes.onnx.composite_to_onnx_package import CompositeToOnnxPackage + + +def _write_tiny_onnx_with_external_data(onnx_path: Path, data_filename: str = "model.onnx.data") -> None: + """Write a minimal valid ONNX model whose single initializer lives in an external data sidecar. + + The initializer is sized above onnx's default external-data size threshold + (1024 bytes) so the .data sidecar actually gets written. The model itself + stays tiny (one Identity node) so the test fixture remains cheap. + """ + # 1024 floats = 4096 bytes, well above the default 1024-byte threshold for + # promoting an initializer to external storage. 
+ data = np.arange(1024, dtype=np.float32) + init_tensor = numpy_helper.from_array(data, name="weight") + output = helper.make_tensor_value_info("y", TensorProto.FLOAT, [1024]) + node = helper.make_node("Identity", inputs=["weight"], outputs=["y"]) + graph = helper.make_graph([node], "g", inputs=[], outputs=[output], initializer=[init_tensor]) + model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 17)]) + + onnx_path.parent.mkdir(parents=True, exist_ok=True) + onnx.save_model( + model, + str(onnx_path), + save_as_external_data=True, + all_tensors_to_one_file=True, + location=data_filename, + ) + + +def _write_tiny_inline_onnx(onnx_path: Path) -> None: + """Write a minimal self-contained (no external data) ONNX model.""" + init_tensor = numpy_helper.from_array(np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float32), name="weight") + output = helper.make_tensor_value_info("y", TensorProto.FLOAT, [4]) + node = helper.make_node("Identity", inputs=["weight"], outputs=["y"]) + graph = helper.make_graph([node], "g", inputs=[], outputs=[output], initializer=[init_tensor]) + model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 17)]) + onnx_path.parent.mkdir(parents=True, exist_ok=True) + onnx.save_model(model, str(onnx_path)) + + +def _make_nested_genai_package( + root: Path, + components: dict[str, str], + *, + with_external_data: bool = True, +) -> Path: + """Build a fake nested ORT-GenAI package at ``root``. + + ``components`` maps genai_config component keys (e.g. ``decoder``) to the + relative ONNX filename under ``root`` (e.g. ``decoder/model.onnx``). Each + component file is a real (tiny) ONNX model so the pass exercises real + external-data rewriting rather than file rename only. 
+ """ + root.mkdir(parents=True, exist_ok=True) + + model_section: dict[str, dict[str, str]] = {} + for key, rel_path in components.items(): + component_file = root / rel_path + if with_external_data: + _write_tiny_onnx_with_external_data(component_file) + else: + _write_tiny_inline_onnx(component_file) + model_section[key] = {"filename": rel_path} + + # Shared root-level sidecars. + (root / "tokenizer.json").write_text("{}", encoding="utf-8") + (root / "chat_template.jinja").write_text("template", encoding="utf-8") + + (root / "genai_config.json").write_text( + json.dumps({"model": {"type": "gemma4", **model_section}}, indent=2), + encoding="utf-8", + ) + return root + + +def _make_composite_handler(root: Path, components: dict[str, str]) -> CompositeModelHandler: + component_handlers = [ONNXModelHandler(model_path=str(root / rel_path)) for rel_path in components.values()] + return CompositeModelHandler( + model_components=component_handlers, + model_component_names=list(components.keys()), + model_path=str(root), + ) + + +class TestCompositeToOnnxPackage: + def test_flattens_nested_package_to_root_level_filenames(self, tmp_path): + src_root = _make_nested_genai_package( + tmp_path / "src", + { + "decoder": "decoder/model.onnx", + "vision": "vision_encoder/model.onnx", + "audio": "audio_encoder/model.onnx", + "embedding": "embedding/model.onnx", + }, + ) + composite = _make_composite_handler( + src_root, + { + "decoder": "decoder/model.onnx", + "vision_encoder": "vision_encoder/model.onnx", + "audio_encoder": "audio_encoder/model.onnx", + "embedding": "embedding/model.onnx", + }, + ) + + p = create_pass_from_dict(CompositeToOnnxPackage, {}, disable_search=True) + out = p.run(composite, str(tmp_path / "out")) + + out_dir = Path(out.model_path).parent + assert (out_dir / "decoder.onnx").is_file() + assert (out_dir / "vision_encoder.onnx").is_file() + assert (out_dir / "audio_encoder.onnx").is_file() + assert (out_dir / "embedding.onnx").is_file() + assert (out_dir / 
"genai_config.json").is_file() + assert (out_dir / "tokenizer.json").is_file() + assert (out_dir / "chat_template.jinja").is_file() + + def test_rewrites_genai_config_filenames(self, tmp_path): + src_root = _make_nested_genai_package( + tmp_path / "src", + {"decoder": "decoder/model.onnx", "vision": "vision_encoder/model.onnx"}, + ) + composite = _make_composite_handler( + src_root, + { + "decoder": "decoder/model.onnx", + "vision_encoder": "vision_encoder/model.onnx", + }, + ) + + p = create_pass_from_dict(CompositeToOnnxPackage, {}, disable_search=True) + out = p.run(composite, str(tmp_path / "out")) + + rewritten = json.loads((Path(out.model_path).parent / "genai_config.json").read_text(encoding="utf-8")) + assert rewritten["model"]["decoder"]["filename"] == "decoder.onnx" + assert rewritten["model"]["vision"]["filename"] == "vision_encoder.onnx" + + def test_returns_onnx_handler_with_entry_point_next_to_genai_config(self, tmp_path): + src_root = _make_nested_genai_package( + tmp_path / "src", + {"decoder": "decoder/model.onnx", "vision": "vision_encoder/model.onnx"}, + ) + composite = _make_composite_handler( + src_root, + { + "decoder": "decoder/model.onnx", + "vision_encoder": "vision_encoder/model.onnx", + }, + ) + + p = create_pass_from_dict(CompositeToOnnxPackage, {}, disable_search=True) + out = p.run(composite, str(tmp_path / "out")) + + assert isinstance(out, ONNXModelHandler) + # Evaluator-style auto-detection: parent of model_path must contain genai_config.json + parent = Path(out.model_path).parent + assert (parent / "genai_config.json").is_file() + assert Path(out.model_path).name == "decoder.onnx" + + def test_rewrites_external_data_location_to_new_filename(self, tmp_path): + """External-data references inside each component ONNX must point at the renamed sidecar. 
+ + Regression test for a real-world failure: hardlinking a .onnx file + + its .data sidecar to new names left the embedded "location" pointer + inside the proto pointing at the old name (e.g. "model.onnx.data"), + causing ONNX Runtime to fail at load with "External data path does not + exist". The pass must rewrite each component model's external-data + location to match its new flat filename, and produce a real .data file + with the new name alongside it. + """ + src_root = _make_nested_genai_package(tmp_path / "src", {"decoder": "decoder/model.onnx"}) + composite = _make_composite_handler(src_root, {"decoder": "decoder/model.onnx"}) + + p = create_pass_from_dict(CompositeToOnnxPackage, {}, disable_search=True) + out = p.run(composite, str(tmp_path / "out")) + + out_dir = Path(out.model_path).parent + # Both the flat ONNX file and the matching renamed sidecar must exist. + assert (out_dir / "decoder.onnx").is_file() + assert (out_dir / "decoder.onnx.data").is_file() + + # The embedded external-data location inside the rewritten ONNX file + # must reference the new sidecar name, not the source layout's + # "model.onnx.data". Load without materializing external data so the + # initializer keeps its ``external_data`` pointer rather than getting + # the bytes inlined as ``raw_data``. + proto_only = onnx.load(str(out_dir / "decoder.onnx"), load_external_data=False) + weight = next(t for t in proto_only.graph.initializer if t.name == "weight") + location_entries = [entry.value for entry in weight.external_data if entry.key == "location"] + assert location_entries == ["decoder.onnx.data"], ( + f"expected location='decoder.onnx.data', got external_data={list(weight.external_data)}" + ) + + # And the bytes should actually load through that new pointer (catches + # the case where the .data file was written under the right name but + # corrupted, or vice versa). 
+ materialized = onnx.load(str(out_dir / "decoder.onnx"), load_external_data=True) + loaded_weight = next(t for t in materialized.graph.initializer if t.name == "weight") + loaded_array = numpy_helper.to_array(loaded_weight) + assert loaded_array.shape == (1024,) + assert loaded_array[0] == 0.0 + assert loaded_array[-1] == 1023.0 + + def test_handles_inline_onnx_without_external_data(self, tmp_path): + """Self-contained ONNX models (no .data sidecar) should still flatten correctly.""" + src_root = _make_nested_genai_package( + tmp_path / "src", + {"decoder": "decoder/model.onnx"}, + with_external_data=False, + ) + composite = _make_composite_handler(src_root, {"decoder": "decoder/model.onnx"}) + + p = create_pass_from_dict(CompositeToOnnxPackage, {}, disable_search=True) + out = p.run(composite, str(tmp_path / "out")) + + out_dir = Path(out.model_path).parent + assert (out_dir / "decoder.onnx").is_file() + # No external-data sidecar should be present since the source had none. + assert not (out_dir / "decoder.onnx.data").exists() + + def test_uses_fallback_entry_point_when_requested_one_missing(self, tmp_path): + src_root = _make_nested_genai_package( + tmp_path / "src", + {"vision": "vision_encoder/model.onnx", "embedding": "embedding/model.onnx"}, + ) + composite = _make_composite_handler( + src_root, + { + "vision_encoder": "vision_encoder/model.onnx", + "embedding": "embedding/model.onnx", + }, + ) + + # The default entry_point_component is "decoder", which doesn't exist here. 
+ p = create_pass_from_dict(CompositeToOnnxPackage, {}, disable_search=True) + out = p.run(composite, str(tmp_path / "out")) + + assert Path(out.model_path).name in {"vision_encoder.onnx", "embedding.onnx"} + + def test_honors_explicit_entry_point_component(self, tmp_path): + src_root = _make_nested_genai_package( + tmp_path / "src", + {"decoder": "decoder/model.onnx", "embedding": "embedding/model.onnx"}, + ) + composite = _make_composite_handler( + src_root, + { + "decoder": "decoder/model.onnx", + "embedding": "embedding/model.onnx", + }, + ) + + p = create_pass_from_dict( + CompositeToOnnxPackage, + {"entry_point_component": "embedding"}, + disable_search=True, + ) + out = p.run(composite, str(tmp_path / "out")) + + assert Path(out.model_path).name == "embedding.onnx" + + def test_rejects_package_without_genai_config(self, tmp_path): + src_root = tmp_path / "src" + src_root.mkdir() + _write_tiny_inline_onnx(src_root / "decoder" / "model.onnx") + composite = _make_composite_handler(src_root, {"decoder": "decoder/model.onnx"}) + + p = create_pass_from_dict(CompositeToOnnxPackage, {}, disable_search=True) + with pytest.raises(ValueError, match=r"genai_config\.json"): + p.run(composite, str(tmp_path / "out")) + + def test_handles_unique_collision_in_subdir_names(self, tmp_path): + # Two components living in subdirs with the same internal filename shouldn't collide. 
+ src_root = tmp_path / "src" + src_root.mkdir() + _write_tiny_inline_onnx(src_root / "model_a" / "model.onnx") + _write_tiny_inline_onnx(src_root / "model_b" / "model.onnx") + (src_root / "genai_config.json").write_text( + json.dumps( + { + "model": { + "first": {"filename": "model_a/model.onnx"}, + "second": {"filename": "model_b/model.onnx"}, + } + } + ), + encoding="utf-8", + ) + + composite = CompositeModelHandler( + model_components=[ + ONNXModelHandler(model_path=str(src_root / "model_a" / "model.onnx")), + ONNXModelHandler(model_path=str(src_root / "model_b" / "model.onnx")), + ], + model_component_names=["first", "second"], + model_path=str(src_root), + ) + + p = create_pass_from_dict( + CompositeToOnnxPackage, + {"entry_point_component": "first"}, + disable_search=True, + ) + out = p.run(composite, str(tmp_path / "out")) + + out_dir = Path(out.model_path).parent + assert (out_dir / "model_a.onnx").is_file() + assert (out_dir / "model_b.onnx").is_file() From c958c04ad97fc2d8cc1ea0cbfd536775ad088ff3 Mon Sep 17 00:00:00 2001 From: Delwin Kim <139003345+DelwinKim@users.noreply.github.com> Date: Tue, 16 Jun 2026 08:40:38 +0000 Subject: [PATCH 3/6] mobiusbuilder: add EP override; lmms_ort: AudioDecoder + per-model processor args - MobiusBuilder: add `mobius_ep_override` config knob. Lets a workflow force the mobius execution_provider (e.g. "default") independent of the Olive accelerator EP. Needed because mobius's cuda-EP attention fusions (PackedMultiHeadAttention for Qwen2.5-VL vision, GQA for Gemma-4 decoder) produce graphs the ORT-GenAI fused-attention kernels reject. "default" EP skips those fusions; the resulting INT4 graph is numerically equivalent. - lmms_ort: support torchcodec.AudioDecoder visuals (HF datasets 5.x audio feature) in _normalize_audio via duck-typed get_all_samples(). - lmms_ort: branch processor-arg shape on model type - Phi-4-MM needs a bare string, Whisper needs [prompt]. 
Passing a list to Phi-4-MM raised "Number of image tokens does not match the number of images". Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- olive/evaluator/lmms_ort.py | 31 ++++++++++++++++++++--- olive/passes/onnx/mobius_model_builder.py | 30 ++++++++++++++++++++++ 2 files changed, 57 insertions(+), 4 deletions(-) diff --git a/olive/evaluator/lmms_ort.py b/olive/evaluator/lmms_ort.py index 635f20cf70..b4ef57f473 100644 --- a/olive/evaluator/lmms_ort.py +++ b/olive/evaluator/lmms_ort.py @@ -111,6 +111,22 @@ def _normalize_audio(visual) -> tuple[np.ndarray, int] | None: return _load_audio_file(Path(visual["path"])) if isinstance(visual, (str, Path)): return _load_audio_file(Path(visual)) + # torchcodec.decoders.AudioDecoder — HF datasets 5.x returns this for the + # "audio" feature instead of the legacy {"array", "sampling_rate"} dict. + # Detect by duck-typing the get_all_samples() method to avoid a hard + # torchcodec import (it's an optional install). + if hasattr(visual, "get_all_samples"): + try: + samples = visual.get_all_samples() + # samples.data is a torch.Tensor of shape [channels, num_samples]. + # ORT-GenAI's processor wants mono float32; downmix if multichannel. + arr = samples.data.detach().cpu().numpy().astype(np.float32) + if arr.ndim == 2: + arr = arr.mean(axis=0) + return arr, int(samples.sample_rate) + except Exception as e: # pragma: no cover + logger.warning("Failed to decode AudioDecoder visual: %s", e) + return None return None @@ -380,10 +396,17 @@ def _run_generation( og_audios = self._build_og_audios(audios, tmp_dir) try: - # ORT-GenAI processors accept either a bare string or a list of - # strings depending on backend; benchmark_multimodal.py wraps in - # a list, which matches both the whisper and phi4mm paths. - inputs = self._processor([prompt], images=og_images, audios=og_audios) + # ORT-GenAI multimodal processors disagree on argument shape: + # - Phi-4-MM expects a bare string. 
Passing [prompt] raises + # "Number of image tokens does not match the number of images" + # because the processor interprets the list as one prompt per + # image (verified against pre-built phi4mm INT4 package). + # - Whisper's processor (per ORT-GenAI's reference + # benchmark_multimodal.py) is exercised with a list of + # prompts. + # Branch on model type rather than guess. + processor_input = [prompt] if self._model_type == "whisper" else prompt + inputs = self._processor(processor_input, images=og_images, audios=og_audios) except Exception as e: # pragma: no cover del generator return self._handle_error("ORT-GenAI multimodal processor failed.", e, "") diff --git a/olive/passes/onnx/mobius_model_builder.py b/olive/passes/onnx/mobius_model_builder.py index e2d10fc0c4..df11cc4ca4 100644 --- a/olive/passes/onnx/mobius_model_builder.py +++ b/olive/passes/onnx/mobius_model_builder.py @@ -91,6 +91,25 @@ def _default_config(cls, accelerator_spec: AcceleratorSpec) -> dict[str, PassCon "quantization pass (e.g. OnnxMatMulNBits) after this pass." ), ), + "mobius_ep_override": PassConfigParam( + type_=str, + required=False, + default_value=None, + description=( + "Override the mobius execution_provider regardless of the " + "Olive accelerator EP. Useful as a workaround when mobius's " + "EP-specific attention fusions produce graphs that the " + "ORT-GenAI fused-attention kernels reject at runtime. For " + "example, the mobius cuda EP fuses PackedMultiHeadAttention " + "in the Qwen2.5-VL vision encoder (whose internally-computed " + "cumulative_sequence_length does not satisfy the kernel's " + "expected shape) and GroupQueryAttention in the Gemma-4 " + "decoder (where the bidirectional vision-block mask makes GQA " + "invalid). Set to 'default' to skip all EP-specific fusions. " + "The resulting INT4 graph is numerically equivalent; only " + "perf-related fusions are dropped." 
+ ), + ), } def _run_for_config( @@ -120,6 +139,17 @@ def _run_for_config( self.accelerator_spec.accelerator_type, ) + # Honor the explicit override. Logged at WARNING so the workaround is + # visible in run output. + if config.mobius_ep_override is not None: + logger.warning( + "MobiusBuilder: mobius_ep_override set; overriding mobius EP " + "from '%s' to '%s' (auto-derived EP discarded).", + ep_str, + config.mobius_ep_override, + ) + ep_str = config.mobius_ep_override + dtype_str: str = _PRECISION_TO_DTYPE.get(config.precision, "f32") model_id: str = model.model_name_or_path From ff38a07da735e128430b75c1da66f398ec1b8efe Mon Sep 17 00:00:00 2001 From: Delwin Kim <139003345+DelwinKim@users.noreply.github.com> Date: Thu, 18 Jun 2026 19:20:01 +0000 Subject: [PATCH 4/6] remove mobius_ep_override (obsolete) --- olive/passes/onnx/mobius_model_builder.py | 30 ----------------------- 1 file changed, 30 deletions(-) diff --git a/olive/passes/onnx/mobius_model_builder.py b/olive/passes/onnx/mobius_model_builder.py index df11cc4ca4..e2d10fc0c4 100644 --- a/olive/passes/onnx/mobius_model_builder.py +++ b/olive/passes/onnx/mobius_model_builder.py @@ -91,25 +91,6 @@ def _default_config(cls, accelerator_spec: AcceleratorSpec) -> dict[str, PassCon "quantization pass (e.g. OnnxMatMulNBits) after this pass." ), ), - "mobius_ep_override": PassConfigParam( - type_=str, - required=False, - default_value=None, - description=( - "Override the mobius execution_provider regardless of the " - "Olive accelerator EP. Useful as a workaround when mobius's " - "EP-specific attention fusions produce graphs that the " - "ORT-GenAI fused-attention kernels reject at runtime. 
For " - "example, the mobius cuda EP fuses PackedMultiHeadAttention " - "in the Qwen2.5-VL vision encoder (whose internally-computed " - "cumulative_sequence_length does not satisfy the kernel's " - "expected shape) and GroupQueryAttention in the Gemma-4 " - "decoder (where the bidirectional vision-block mask makes GQA " - "invalid). Set to 'default' to skip all EP-specific fusions. " - "The resulting INT4 graph is numerically equivalent; only " - "perf-related fusions are dropped." - ), - ), } def _run_for_config( @@ -139,17 +120,6 @@ def _run_for_config( self.accelerator_spec.accelerator_type, ) - # Honor the explicit override. Logged at WARNING so the workaround is - # visible in run output. - if config.mobius_ep_override is not None: - logger.warning( - "MobiusBuilder: mobius_ep_override set; overriding mobius EP " - "from '%s' to '%s' (auto-derived EP discarded).", - ep_str, - config.mobius_ep_override, - ) - ep_str = config.mobius_ep_override - dtype_str: str = _PRECISION_TO_DTYPE.get(config.precision, "f32") model_id: str = model.model_name_or_path From c13a68340aa1d1983888d314769648d98dfb36df Mon Sep 17 00:00:00 2001 From: Delwin Kim <139003345+DelwinKim@users.noreply.github.com> Date: Fri, 19 Jun 2026 17:59:19 +0000 Subject: [PATCH 5/6] evaluator: drop lmms-eval entry point to minimize packaging surface Remove the setup.py lmms_eval.models entry point, the _model_manifest factory, and its registration tests. The Olive LMMSEvaluator path imports LMMSORTGenAIEvaluator directly, so the entry point only affected the standalone lmms-eval CLI; dropping it keeps setup.py out of this change. 
--- olive/evaluator/lmms_ort.py | 26 ------------------------- setup.py | 3 --- test/evaluator/test_lmms_ort.py | 34 --------------------------------- 3 files changed, 63 deletions(-) diff --git a/olive/evaluator/lmms_ort.py b/olive/evaluator/lmms_ort.py index b4ef57f473..3ee00fd685 100644 --- a/olive/evaluator/lmms_ort.py +++ b/olive/evaluator/lmms_ort.py @@ -629,29 +629,3 @@ def loglikelihood(self, requests: list[Instance], disable_tqdm: bool = False) -> def generate_until_multi_round(self, requests) -> list[str]: raise NotImplementedError("ortgenai_mm does not support lmms-eval multi-round generation yet.") - - -# ----------------------------------------------------------------------------- -# lmms-eval MODEL_REGISTRY_V2 entry-point factory. -# -# Exposed via setup.py entry_points["lmms_eval.models"], so a fresh install of -# olive-ai makes ``--model ortgenai_mm`` discoverable from the lmms-eval CLI -# (e.g. ``python -m lmms_eval --model ortgenai_mm ...``) without requiring the -# caller to import this module first. -# -# lmms-eval's ``ModelRegistryV2.load_entrypoint_manifests`` accepts a -# ``Callable`` payload, so we keep the import of ``ModelManifest`` lazy. That -# way ``olive`` (and the rest of this module) stays importable when lmms-eval -# is not installed. -# ----------------------------------------------------------------------------- -def _model_manifest(): - """Return the lmms-eval ModelManifest for ``ortgenai_mm``. - - Used as an entry-point payload for lmms-eval's MODEL_REGISTRY_V2. 
- """ - from lmms_eval.models.registry_v2 import ModelManifest - - return ModelManifest( - model_id="ortgenai_mm", - simple_class_path="olive.evaluator.lmms_ort.LMMSORTGenAIEvaluator", - ) diff --git a/setup.py b/setup.py index 798010301d..b4aebf070a 100644 --- a/setup.py +++ b/setup.py @@ -88,8 +88,5 @@ def get_extra_deps(rel_path): data_files=[], entry_points={ "console_scripts": ["olive=olive.cli.launcher:main"], - "lmms_eval.models": [ - "ortgenai_mm = olive.evaluator.lmms_ort:_model_manifest", - ], }, ) diff --git a/test/evaluator/test_lmms_ort.py b/test/evaluator/test_lmms_ort.py index 595b158224..630b444cba 100644 --- a/test/evaluator/test_lmms_ort.py +++ b/test/evaluator/test_lmms_ort.py @@ -19,7 +19,6 @@ from olive.evaluator.lmms_ort import ( LMMSORTGenAIEvaluator, _build_prompt, - _model_manifest, _normalize_execution_provider, ) from olive.evaluator.olive_evaluator import LMMSEvaluator @@ -519,39 +518,6 @@ def test_lmms_ort_genai_evaluator_is_simple_flag_matches_registration(): assert LMMSORTGenAIEvaluator.is_simple is True -def test_model_manifest_factory_returns_expected_manifest(): - """Verify the entry-point payload points at LMMSORTGenAIEvaluator.""" - pytest.importorskip("lmms_eval") - - manifest = _model_manifest() - - assert manifest.model_id == "ortgenai_mm" - assert manifest.simple_class_path == "olive.evaluator.lmms_ort.LMMSORTGenAIEvaluator" - assert manifest.chat_class_path is None - - -def test_olive_ai_registers_ortgenai_mm_entry_point(): - """Verify olive-ai exposes ortgenai_mm via the lmms_eval.models entry-point group.""" - from importlib.metadata import entry_points - - eps = {ep.name: ep.value for ep in entry_points(group="lmms_eval.models")} - assert eps.get("ortgenai_mm") == "olive.evaluator.lmms_ort:_model_manifest" - - -def test_model_registry_v2_resolves_ortgenai_mm(): - """Verify lmms-eval's MODEL_REGISTRY_V2 resolves ortgenai_mm via the entry point.""" - pytest.importorskip("lmms_eval") - from lmms_eval.models import 
MODEL_REGISTRY_V2 - - resolved = MODEL_REGISTRY_V2.resolve("ortgenai_mm") - assert resolved.model_id == "ortgenai_mm" - assert resolved.model_type == "simple" - assert resolved.class_path == "olive.evaluator.lmms_ort.LMMSORTGenAIEvaluator" - - cls = MODEL_REGISTRY_V2.get_model_class("ortgenai_mm") - assert cls is LMMSORTGenAIEvaluator - - # ----------------------------------------------------------------------------- # Visual partitioning # ----------------------------------------------------------------------------- From 10eee66f6a21de242f255ff370e41951b742a8d9 Mon Sep 17 00:00:00 2001 From: Delwin Kim <139003345+DelwinKim@users.noreply.github.com> Date: Tue, 23 Jun 2026 17:25:57 +0000 Subject: [PATCH 6/6] evaluator: auto-detect structured-content chat template support via probe --- olive/evaluator/lmms_ort.py | 113 ++++++++++++++++++++++------- olive/evaluator/olive_evaluator.py | 17 ++--- test/evaluator/test_lmms_ort.py | 87 +++++++++++++++++++++- 3 files changed, 182 insertions(+), 35 deletions(-) diff --git a/olive/evaluator/lmms_ort.py b/olive/evaluator/lmms_ort.py index 3ee00fd685..03a0e0bbae 100644 --- a/olive/evaluator/lmms_ort.py +++ b/olive/evaluator/lmms_ort.py @@ -247,8 +247,8 @@ def __init__( provider_options: dict | None = None, fail_on_error: bool = True, prompt_template: str | None = None, - image_token_format: str = "<|image_{index}|>", - audio_token_format: str = "<|audio_{index}|>", + image_token_format: str | None = None, + audio_token_format: str | None = None, **kwargs, ) -> None: if _LMMS_EVAL_IMPORT_ERROR is not None: @@ -325,6 +325,14 @@ def __init__( except json.JSONDecodeError as e: raise ValueError(f"Invalid genai_config.json in {self.model_dir}") from e + # Probe (once) whether this model's chat template injects media tokens + # from structured content parts (``{"type": "image"}``). Well-behaved + # templates (Gemma-4, Qwen2.5-VL, Qwen3-VL) do; Phi-4-MM's template + # stringifies the content list as Python repr instead. 
When supported, + # the adapter lets the template emit the correct per-model media tokens + # automatically, so the user does not need to set ``image_token_format``. + self._supports_structured_content = self._probe_structured_content_support() + self._rank = 0 self._world_size = 1 logger.info("Model loaded. Model type: %s", self._model_type) @@ -518,28 +526,72 @@ def _score_continuation(self, prompt: str, continuation: str, images, audios) -> del generator return total_logprob, all_greedy + _DEFAULT_IMAGE_TOKEN_FORMAT = "<|image_{index}|>" + _DEFAULT_AUDIO_TOKEN_FORMAT = "<|audio_{index}|>" + + def _probe_structured_content_support(self) -> bool: + """Detect whether the model's chat template injects media tokens from structured content. + + Renders a probe message whose content is a list of typed parts + (``[{"type": "image"}, {"type": "text", ...}]``) and checks the result: + + - Well-behaved templates (Gemma-4, Qwen2.5-VL, Qwen3-VL) replace the + image part with the model's own media token (e.g. ``<|image|>`` or + ``<|vision_start|>...``), so the rendered string contains no Python + dict repr. + - Broken templates (Phi-4-MM) stringify the list as Python repr, so the + rendered string contains ``{'type'`` / ``"type"``. + + Probed once at load. Returns False if the tokenizer has no + ``apply_chat_template`` or the probe raises. + """ + if not self._has_chat_template: + return False + try: + probe = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "x"}]}] + rendered = self._tokenizer.apply_chat_template(json.dumps(probe), add_generation_prompt=True) + except Exception as e: # pragma: no cover - defensive + logger.debug("Structured-content probe failed; falling back to pre-render: %s", e) + return False + # A broken template leaks the Python dict repr of the content parts. 
+ return "{'type'" not in rendered and '"type"' not in rendered + + def _build_structured_chat_prompt(self, user_text: str, num_images: int, num_audios: int) -> str: + """Build the prompt via structured content parts so the template injects media tokens. + + Only used when :meth:`_probe_structured_content_support` returned True + and the user did not override the media token formats. + """ + content: list[dict[str, Any]] = [] + content.extend({"type": "image"} for _ in range(num_images)) + content.extend({"type": "audio"} for _ in range(num_audios)) + content.append({"type": "text", "text": user_text}) + + messages: list[dict[str, Any]] = [] + if self.system_prompt: + messages.append({"role": "system", "content": self.system_prompt}) + messages.append({"role": "user", "content": content}) + return self._tokenizer.apply_chat_template(json.dumps(messages), add_generation_prompt=True) + def _build_prompt_for_request(self, user_text: str, num_images: int, num_audios: int) -> str: """Build the final prompt string fed to ``og.MultiModalProcessor``. - Default path: pre-render image/audio markers into the user content - string using ``image_token_format`` / ``audio_token_format``, then call - ``og.Tokenizer.apply_chat_template`` to add the model-specific chat - scaffolding (system/user/assistant turn markers). - - Pure content-parts (``{"type": "image"}``) is what PR #2488 and the - olive-recipes Qwen2.5-VL eval scripts do, and it works for chat - templates that understand structured content (Qwen2.5-VL, Qwen3-VL, - Gemma-4). However, Phi-4-MM's chat template stringifies content lists - as Python repr (verified: produces - ``<|user|>[{'type': 'image'}, ...]<|end|>`` instead of injecting - ``<|image_1|>``). Pre-rendering the markers ourselves before - ``apply_chat_template`` works for both conventions, since templates - that just pass through user content render identically either way. - - Fallback path: ``_build_prompt`` legacy format-string. 
Used when the - user has explicitly set ``prompt_template`` in the evaluator config - (to override per-benchmark) or when the underlying onnxruntime-genai - version predates ``apply_chat_template`` on ``og.Tokenizer``. + Path selection: + + 1. **Whisper**: no chat template; return the decoder-start token sequence. + 2. **Explicit override / no chat template**: legacy ``_build_prompt`` + format-string path (when the user set ``prompt_template`` or the + onnxruntime-genai version predates ``apply_chat_template``). + 3. **Structured content** (preferred): when the model's chat template + injects media tokens from structured content parts (auto-detected by + :meth:`_probe_structured_content_support`) and the user did not set + ``image_token_format`` / ``audio_token_format``. The template emits + the correct per-model media tokens (e.g. ``<|image|>`` for Gemma-4, + ``<|vision_start|>...`` for Qwen2.5-VL) — no per-model config needed. + 4. **Pre-render** (fallback): pre-render media markers into a flat user + string, then ``apply_chat_template``. Used for templates that + stringify structured content (Phi-4-MM) or when the user explicitly + supplies a media token format. """ if self._model_type == "whisper": # Whisper has no chat template; the "prompt" is just the decoder-start @@ -555,12 +607,23 @@ def _build_prompt_for_request(self, user_text: str, num_images: int, num_audios: user_text, self.system_prompt, self.prompt_template, - self.image_token_format, - self.audio_token_format, + self.image_token_format or self._DEFAULT_IMAGE_TOKEN_FORMAT, + self.audio_token_format or self._DEFAULT_AUDIO_TOKEN_FORMAT, ) - image_markers = "".join(_format_media_tokens(num_images, self.image_token_format)) - audio_markers = "".join(_format_media_tokens(num_audios, self.audio_token_format)) + # Prefer structured content when the template supports it AND the user + # did not pin a specific media token format. 
This lets well-behaved + # templates inject their own correct tokens without per-model config. + user_pinned_tokens = self.image_token_format is not None or self.audio_token_format is not None + if self._supports_structured_content and not user_pinned_tokens: + return self._build_structured_chat_prompt(user_text, num_images, num_audios) + + image_markers = "".join( + _format_media_tokens(num_images, self.image_token_format or self._DEFAULT_IMAGE_TOKEN_FORMAT) + ) + audio_markers = "".join( + _format_media_tokens(num_audios, self.audio_token_format or self._DEFAULT_AUDIO_TOKEN_FORMAT) + ) user_content = f"{image_markers}{audio_markers}{user_text}" messages: list[dict[str, Any]] = [] diff --git a/olive/evaluator/olive_evaluator.py b/olive/evaluator/olive_evaluator.py index ee7d63286e..d2f27c07d6 100644 --- a/olive/evaluator/olive_evaluator.py +++ b/olive/evaluator/olive_evaluator.py @@ -2082,15 +2082,14 @@ def __init__(self, tasks: list[str], **kwargs): self.output_path = kwargs.get("output_path") self.fail_on_error = bool(kwargs.get("fail_on_error", True)) self.prompt_template = kwargs.get("prompt_template") - self.image_token_format = kwargs.get("image_token_format", "<|image_{index}|>") - self.audio_token_format = kwargs.get("audio_token_format", "<|audio_{index}|>") - # NOTE: ``prompt_template`` / ``image_token_format`` / ``audio_token_format`` - # are legacy format-string knobs and should rarely be needed. By default - # the ortgenai_mm adapter calls ``og.Tokenizer.apply_chat_template`` (same - # path used by PR #2488 and olive-recipes eval scripts), which reads the - # package's ``chat_template.jinja`` and produces the correct chat format - # for every supported model automatically. Setting ``prompt_template`` - # forces the adapter into the legacy hand-templated path. 
+ # Default to None (auto): the ortgenai_mm adapter probes the model's chat + # template once at load and, when it injects media tokens from structured + # content parts (Gemma-4, Qwen2.5-VL, Qwen3-VL), emits the correct + # per-model token automatically — no override needed. Set these only to + # force a specific media token format (e.g. for a template that + # stringifies structured content, like Phi-4-MM). + self.image_token_format = kwargs.get("image_token_format") + self.audio_token_format = kwargs.get("audio_token_format") # HF-only knobs (forwarded to lmms-eval's native wrapper if present). # ``trust_remote_code`` defaults to False to match the rest of Olive # (e.g. olive/common/hf/utils.py, olive/data/component/load_dataset.py) diff --git a/test/evaluator/test_lmms_ort.py b/test/evaluator/test_lmms_ort.py index 630b444cba..3bd52104d6 100644 --- a/test/evaluator/test_lmms_ort.py +++ b/test/evaluator/test_lmms_ort.py @@ -75,6 +75,7 @@ def _make_evaluator_for_prompt_tests(): inst.image_token_format = "<|image_{index}|>" inst.audio_token_format = "<|audio_{index}|>" inst._has_chat_template = True + inst._supports_structured_content = False return inst @@ -145,6 +146,90 @@ def test_build_prompt_for_request_falls_back_when_chat_template_unavailable(): inst._tokenizer.apply_chat_template.assert_not_called() +def test_build_prompt_for_request_uses_structured_content_when_supported_and_not_pinned(): + """Use structured content parts when supported and the user did not pin a token format.""" + inst = _make_evaluator_for_prompt_tests() + inst._supports_structured_content = True + inst.image_token_format = None # auto + inst.audio_token_format = None # auto + inst._tokenizer.apply_chat_template.return_value = "" + + inst._build_prompt_for_request("What is this?", num_images=1, num_audios=1) + + import json as _json + + messages = _json.loads(inst._tokenizer.apply_chat_template.call_args.args[0]) + user_msg = messages[-1] + assert user_msg["role"] == "user" + # Content 
is a list of typed parts, NOT a pre-rendered string. + assert user_msg["content"] == [ + {"type": "image"}, + {"type": "audio"}, + {"type": "text", "text": "What is this?"}, + ] + + +def test_build_prompt_for_request_pre_renders_when_token_format_pinned(): + """An explicit image_token_format forces the pre-render path despite structured support.""" + inst = _make_evaluator_for_prompt_tests() + inst._supports_structured_content = True + inst.image_token_format = "<|vision_start|>" # user pinned + inst.audio_token_format = None + inst._tokenizer.apply_chat_template.return_value = "" + + inst._build_prompt_for_request("Q", num_images=1, num_audios=0) + + import json as _json + + messages = _json.loads(inst._tokenizer.apply_chat_template.call_args.args[0]) + # Pre-rendered flat string, not structured parts. + assert messages[-1]["content"] == "<|vision_start|>Q" + + +def test_build_prompt_for_request_pre_renders_when_structured_unsupported(): + """Fall back to pre-rendering when the template stringifies structured content (Phi-4-MM).""" + inst = _make_evaluator_for_prompt_tests() + inst._supports_structured_content = False + inst.image_token_format = None # auto + inst.audio_token_format = None + inst._tokenizer.apply_chat_template.return_value = "" + + inst._build_prompt_for_request("Q", num_images=1, num_audios=0) + + import json as _json + + messages = _json.loads(inst._tokenizer.apply_chat_template.call_args.args[0]) + # Falls back to default Phi-4-MM-style pre-rendered markers. 
+ assert messages[-1]["content"] == "<|image_1|>Q" + + +def test_probe_structured_content_support_detects_injection(): + """A template that injects the media token (no dict repr) -> supported.""" + inst = LMMSORTGenAIEvaluator.__new__(LMMSORTGenAIEvaluator) + inst._has_chat_template = True + inst._tokenizer = MagicMock() + inst._tokenizer.apply_chat_template.return_value = "<|im_start|>user\n<|image|>x<|im_end|>" + + assert inst._probe_structured_content_support() is True + + +def test_probe_structured_content_support_detects_stringified_repr(): + """A template that leaks the Python dict repr -> not supported.""" + inst = LMMSORTGenAIEvaluator.__new__(LMMSORTGenAIEvaluator) + inst._has_chat_template = True + inst._tokenizer = MagicMock() + inst._tokenizer.apply_chat_template.return_value = "<|user|>[{'type': 'image'}, ...]<|end|>" + + assert inst._probe_structured_content_support() is False + + +def test_probe_structured_content_support_false_when_no_chat_template(): + inst = LMMSORTGenAIEvaluator.__new__(LMMSORTGenAIEvaluator) + inst._has_chat_template = False + + assert inst._probe_structured_content_support() is False + + def test_lmms_evaluator_converts_lmms_results(tmp_path): model_dir = tmp_path / "model" model_dir.mkdir() @@ -201,7 +286,7 @@ def test_lmms_evaluator_converts_lmms_results(tmp_path): fail_on_error=False, prompt_template="{user_content}", image_token_format="", - audio_token_format="<|audio_{index}|>", + audio_token_format=None, ) simple_evaluate_mock.assert_called_once() assert result.get_value("ai2d_lite", "exact_match") == 0.5