From d25d065394a00c71c70fc9d3d511bc27253e4fe0 Mon Sep 17 00:00:00 2001
From: Delwin Kim <139003345+DelwinKim@users.noreply.github.com>
Date: Fri, 12 Jun 2026 17:55:55 +0000
Subject: [PATCH 1/6] evaluator: add LMMSEvaluator and ORT-GenAI multimodal
 adapter

 Adds LMMSEvaluator (olive/evaluator/olive_evaluator.py) and an
 ORT-GenAI multimodal adapter (olive/evaluator/lmms_ort.py) for
 evaluating multimodal ONNX models via lmms-eval.
---
 olive/evaluator/lmms_ort.py        | 546 +++++++++++++++++++
 olive/evaluator/olive_evaluator.py | 249 +++++++++
 setup.py                           |   3 +
 test/evaluator/test_lmms_ort.py    | 837 +++++++++++++++++++++++++++++
 4 files changed, 1635 insertions(+)
 create mode 100644 olive/evaluator/lmms_ort.py
 create mode 100644 test/evaluator/test_lmms_ort.py

diff --git a/olive/evaluator/lmms_ort.py b/olive/evaluator/lmms_ort.py
new file mode 100644
index 0000000000..63fc67fdb8
--- /dev/null
+++ b/olive/evaluator/lmms_ort.py
@@ -0,0 +1,546 @@
+# -------------------------------------------------------------------------
+# lmms-eval adapter for Olive-exported multimodal models.
+#
+# Added locally (not upstream) to support evaluating quantized multimodal ONNX
+# models through the EvolvingLMMs-Lab/lmms-eval harness, mirroring how
+# olive/evaluator/lmeval_ort.py wraps lm-evaluation-harness for text models.
+#
+# Registers an LMMSORTGenAIEvaluator class with lmms-eval's legacy
+# @register_model registry under the name "ortgenai_mm". Consumers obtain it
+# via lmms_eval.api.registry.get_model("ortgenai_mm").
+# -------------------------------------------------------------------------
+"""lmms-eval ORT-GenAI adapter for Olive-exported multimodal models."""
+
+from __future__ import annotations
+
+import io
+import json
+import logging
+import tempfile
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import PIL.Image
+from tqdm import tqdm
+
+try:
+    import onnxruntime_genai as og
+except ImportError:  # pragma: no cover - optional dep
+    og = None
+
+try:
+    from lmms_eval.api.instance import Instance
+    from lmms_eval.api.model import lmms
+    from lmms_eval.api.registry import register_model
+
+    _LMMS_EVAL_IMPORT_ERROR = None
+except ImportError as e:  # pragma: no cover - optional dep
+    Instance = Any
+    _LMMS_EVAL_IMPORT_ERROR = e
+
+    class lmms:  # noqa: N801
+        pass
+
+    def register_model(_name):
+        def decorator(cls):
+            return cls
+
+        return decorator
+
+
+logger = logging.getLogger(__name__)
+
+
+_PROVIDER_ALIASES = {
+    "cuda": "cuda",
+    "cudaexecutionprovider": "cuda",
+    "gpu": "cuda",
+    "cpu": "cpu",
+    "cpuexecutionprovider": "cpu",
+    "dml": "dml",
+    "dmlexecutionprovider": "dml",
+    "directml": "dml",
+    "webgpu": "webgpu",
+    "webgpuexecutionprovider": "webgpu",
+    "js": "web",
+    "jsexecutionprovider": "web",
+    "nvtensorrtrtx": "NvTensorRtRtx",
+    "nvtensorrtrtxexecutionprovider": "NvTensorRtRtx",
+}
+
+
+# -----------------------------------------------------------------------------
+# Helpers
+# -----------------------------------------------------------------------------
+
+
+def _normalize_image(visual) -> PIL.Image.Image | None:
+    """Return ``visual`` as an RGB PIL image, or None when it is not an image."""
+    if isinstance(visual, PIL.Image.Image):
+        return visual.convert("RGB")
+    if isinstance(visual, (str, Path)):
+        is_image = Path(visual).suffix.lower() in {".jpg", ".jpeg", ".png", ".webp", ".bmp", ".gif"}
+        return PIL.Image.open(Path(visual)).convert("RGB") if is_image else None
+    # Dicts with "array" + "sampling_rate" are audio payloads; a key set to None counts as absent.
+    if isinstance(visual, dict) and not ("array" in visual and "sampling_rate" in visual):
+        if visual.get("bytes") is not None:
+            return PIL.Image.open(io.BytesIO(visual["bytes"])).convert("RGB")
+        if visual.get("path"):
+            return PIL.Image.open(visual["path"]).convert("RGB")
+    if isinstance(visual, np.ndarray):
+        return PIL.Image.fromarray(np.uint8(visual)).convert("RGB")
+    return None
+
+
+def _normalize_audio(visual) -> tuple[np.ndarray, int] | None:
+    """Return ``(float32 samples, sample rate)`` for audio-like input, else None."""
+    if isinstance(visual, dict) and "array" in visual and "sampling_rate" in visual:
+        return np.asarray(visual["array"], dtype=np.float32), int(visual["sampling_rate"])
+    audio_suffixes = {".wav", ".mp3", ".flac", ".ogg", ".m4a"}
+    if not isinstance(visual, (str, Path)) or Path(visual).suffix.lower() not in audio_suffixes:
+        return None
+    try:
+        import librosa
+    except ImportError:
+        logger.warning("Audio file %s encountered but librosa not installed.", Path(visual))
+        return None
+    samples, rate = librosa.load(str(Path(visual)), sr=None, mono=True)
+    return samples.astype(np.float32), int(rate)
+
+
+def _partition_visuals(visuals):
+    """Split raw visuals into ``(images, audios)``; unrecognized items are dropped."""
+    images: list = []
+    audios: list = []
+    for item in visuals or []:
+        if item is None:
+            continue
+        # Image detection wins; audio is only attempted when it yields nothing.
+        if (image := _normalize_image(item)) is not None:
+            images.append(image)
+        elif (audio := _normalize_audio(item)) is not None:
+            audios.append(audio)
+    return images, audios
+
+
+def _format_media_tokens(num_items: int, token_format: str) -> list[str]:  # index is 1-based
+    return [token_format.format(index=n, zero_index=n - 1) for n in range(1, num_items + 1)]
+
+
+def _build_prompt(
+    model_type: str,
+    num_images: int,
+    num_audios: int,
+    user_text: str,
+    system_prompt: str = "You are a helpful AI assistant.",
+    prompt_template: str | None = None,
+    image_token_format: str = "<|image_{index}|>",
+    audio_token_format: str = "<|audio_{index}|>",
+) -> str:
+    """Assemble the chat prompt, defaulting to the Phi-4-multimodal layout.
+
+    Media placeholders are emitted images first, then audios, ahead of the user
+    text. A non-empty ``prompt_template`` replaces the default layout and may use
+    the fields ``system_prompt``, ``user_content``, ``text``, ``image_tokens``,
+    ``audio_tokens`` and ``model_type``.
+    """
+    image_tokens = "".join(_format_media_tokens(num_images, image_token_format))
+    audio_tokens = "".join(_format_media_tokens(num_audios, audio_token_format))
+    user_content = image_tokens + audio_tokens + user_text
+    if not prompt_template:
+        return f"<|system|>{system_prompt}<|end|><|user|>{user_content}<|end|><|assistant|>"
+    template_fields = {
+        "system_prompt": system_prompt,
+        "user_content": user_content,
+        "text": user_text,
+        "image_tokens": image_tokens,
+        "audio_tokens": audio_tokens,
+        "model_type": model_type,
+    }
+    return prompt_template.format(**template_fields)
+
+
+def _normalize_execution_provider(execution_provider: Any | None) -> str:
+    """Map an EP name (or the first entry of a sequence) to its alias, or ``follow_config``."""
+    chosen = execution_provider
+    if chosen and isinstance(chosen, (tuple, list)):
+        chosen = chosen[0]
+    if not chosen:
+        return "follow_config"
+    alias_key = str(chosen).lower().replace("_", "")
+    return _PROVIDER_ALIASES.get(alias_key, str(chosen))
+
+
+# -----------------------------------------------------------------------------
+# Main adapter
+# -----------------------------------------------------------------------------
+
+
+@register_model("ortgenai_mm")
+class LMMSORTGenAIEvaluator(lmms):
+    r"""lmms-eval model wrapper for an ORT-GenAI multimodal package.
+
+    Example::
+
+        lmms_eval --model ortgenai_mm \\
+            --model_args pretrained=/path/to/ort_genai_dir,batch_size=1 \\
+            --tasks mmmu_val --limit 4
+    """
+
+    is_simple = True
+
+    def __init__(
+        self,
+        pretrained: str,
+        batch_size: int = 1,
+        max_new_tokens: int = 256,
+        max_length: int = 8192,
+        system_prompt: str = "You are a helpful AI assistant.",
+        execution_provider: str | None = None,
+        provider_options: dict | None = None,
+        fail_on_error: bool = True,
+        prompt_template: str | None = None,
+        image_token_format: str = "<|image_{index}|>",
+        audio_token_format: str = "<|audio_{index}|>",
+        **kwargs,
+    ) -> None:
+        if _LMMS_EVAL_IMPORT_ERROR is not None:
+            raise ImportError(
+                "lmms-eval is required for ortgenai_mm. Install lmms-eval before using LMMSEvaluator."
+            ) from _LMMS_EVAL_IMPORT_ERROR
+        if og is None:
+            raise ImportError(
+                "onnxruntime-genai is required for ortgenai_mm. "
+                "Install with: pip install onnxruntime-genai (or -cuda variant)."
+            )
+        super().__init__()
+        if kwargs:
+            logger.warning("Unused kwargs: %s", kwargs)
+
+        model_dir = Path(pretrained).resolve()
+        if not model_dir.is_dir():
+            raise ValueError(f"ORT-GenAI model directory does not exist: {model_dir}")
+        if not (model_dir / "genai_config.json").is_file():
+            raise ValueError(f"LMMSEvaluator requires genai_config.json in ORT-GenAI package: {model_dir}")
+        if int(batch_size) < 1:
+            raise ValueError("batch_size must be >= 1")
+        if int(max_new_tokens) < 1:
+            raise ValueError("max_new_tokens must be >= 1")
+        if int(max_length) < 1:
+            raise ValueError("max_length must be >= 1")
+
+        self.model_dir = str(model_dir)
+        self.max_new_tokens = int(max_new_tokens)
+        self.max_length = int(max_length)
+        self.batch_size_per_gpu = int(batch_size)
+        self.system_prompt = system_prompt
+        self.fail_on_error = fail_on_error
+        self.prompt_template = prompt_template
+        self.image_token_format = image_token_format
+        self.audio_token_format = audio_token_format
+
+        logger.info("Loading ORT-GenAI model from: %s", self.model_dir)
+        ep = _normalize_execution_provider(execution_provider)
+        # CUDA GenAI packages often carry provider-specific options in genai_config.json.
+        # Clearing/re-adding CUDA can drop those options and fail to load on otherwise
+        # working packages, so follow the package config unless options are overridden.
+        if ep == "follow_config" or (ep == "cuda" and not provider_options):
+            self._model = og.Model(self.model_dir)
+        else:
+            config = og.Config(self.model_dir)
+            config.clear_providers()
+            if ep != "cpu":
+                config.append_provider(ep)
+            for key, value in (provider_options or {}).items():
+                config.set_provider_option(ep, key, value)
+            self._model = og.Model(config)
+        self._tokenizer = og.Tokenizer(self._model)
+        self._processor = self._model.create_multimodal_processor()
+
+        eos_ids = self._tokenizer.eos_token_ids
+        self._eos_token_ids = {int(t) for t in (eos_ids if eos_ids is not None else [])}
+
+        try:
+            cfg = json.loads((Path(self.model_dir) / "genai_config.json").read_text(encoding="utf-8"))
+            self._model_type = cfg.get("model", {}).get("type", "phi4mm")
+        except json.JSONDecodeError as e:
+            raise ValueError(f"Invalid genai_config.json in {self.model_dir}") from e
+
+        self._rank = 0
+        self._world_size = 1
+        logger.info("Model loaded. Model type: %s", self._model_type)
+
+    # -------------------------------------------------------------------------
+    # lmms-eval required properties
+    # -------------------------------------------------------------------------
+    @property
+    def batch_size(self):
+        return self.batch_size_per_gpu
+
+    @property
+    def rank(self):
+        return self._rank
+
+    @property
+    def world_size(self):
+        return self._world_size
+
+    # -------------------------------------------------------------------------
+    # ORT-GenAI input plumbing
+    # -------------------------------------------------------------------------
+    def _build_og_images(self, images, tmp_dir: Path):
+        if not images:
+            return None
+        paths = []
+        for i, img in enumerate(images):
+            path = tmp_dir / f"image_{i}.png"
+            img.save(path, format="PNG")
+            paths.append(str(path))
+        return og.Images.open(*paths)
+
+    def _build_og_audios(self, audios, tmp_dir: Path):
+        if not audios:
+            return None
+        import soundfile as sf
+
+        paths = []
+        for i, (arr, sr) in enumerate(audios):
+            path = tmp_dir / f"audio_{i}.wav"
+            sf.write(path, arr, sr)
+            paths.append(str(path))
+        return og.Audios.open(*paths)
+
+    def _handle_error(self, message: str, exc: Exception, default):
+        if self.fail_on_error:
+            raise RuntimeError(message) from exc
+        logger.exception("%s", message)
+        return default
+
+    # -------------------------------------------------------------------------
+    # Single-request inference primitives
+    # -------------------------------------------------------------------------
+    def _run_generation(
+        self, prompt: str, images, audios, max_new_tokens: int, stop_strings: list[str] | None = None
+    ) -> str:
+        params = og.GeneratorParams(self._model)
+        # `max_length` is total (prompt + completion). Image prompts can be huge
+        # (Phi-4-MM image embeds are 1000+ tokens), so default generously.
+        params.set_search_options(
+            max_length=self.max_length,
+            do_sample=False,
+        )
+        generator = og.Generator(self._model, params)
+
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_dir = Path(tmp)
+            og_images = self._build_og_images(images, tmp_dir)
+            og_audios = self._build_og_audios(audios, tmp_dir)
+
+            try:
+                inputs = self._processor(prompt, images=og_images, audios=og_audios)
+            except Exception as e:  # pragma: no cover
+                del generator
+                return self._handle_error("ORT-GenAI multimodal processor failed.", e, "")
+
+            try:
+                generator.set_inputs(inputs)
+            except RuntimeError as e:
+                del generator
+                return self._handle_error(
+                    "ORT-GenAI generator input setup failed. The prompt may exceed max_length.", e, ""
+                )
+
+            decoded = ""
+            stream = self._tokenizer.create_stream()
+            steps = 0
+            while not generator.is_done() and steps < max_new_tokens:
+                generator.generate_next_token()
+                tok = int(generator.get_next_tokens()[0])
+                if tok in self._eos_token_ids:
+                    break
+                decoded += stream.decode(tok)
+                if stop_strings:
+                    for s in stop_strings:
+                        if s in decoded:
+                            decoded = decoded.split(s, 1)[0]
+                            del generator
+                            return decoded
+                steps += 1
+
+        del generator
+        return decoded
+
+    def _score_continuation(self, prompt: str, continuation: str, images, audios) -> tuple[float, bool]:
+        # Tokenize prompt and prompt+continuation jointly, then slice to obtain
+        # the continuation token IDs as they would actually appear extending the
+        # prompt. Critical for sentencepiece/BPE tokenizers where ``encode("A")``
+        # differs from the suffix of ``encode("prompt A")`` (leading-space
+        # handling, BOS injection, etc.).
+        prompt_tokens = list(self._tokenizer.encode(prompt))
+        full_tokens = list(self._tokenizer.encode(prompt + continuation))
+        cont_tokens = full_tokens[len(prompt_tokens) :]
+        if len(cont_tokens) == 0:
+            return 0.0, True
+
+        params = og.GeneratorParams(self._model)
+        # `max_length` is total (prompt + completion) including image-embed tokens.
+        params.set_search_options(
+            max_length=self.max_length,
+            do_sample=False,
+        )
+        generator = og.Generator(self._model, params)
+
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_dir = Path(tmp)
+            og_images = self._build_og_images(images, tmp_dir)
+            og_audios = self._build_og_audios(audios, tmp_dir)
+            try:
+                inputs = self._processor(prompt, images=og_images, audios=og_audios)
+            except Exception as e:  # pragma: no cover
+                del generator
+                return self._handle_error("ORT-GenAI multimodal processor failed in loglikelihood.", e, (-1e9, False))
+
+            try:
+                generator.set_inputs(inputs)
+            except RuntimeError as e:
+                del generator
+                return self._handle_error("ORT-GenAI generator input setup failed in loglikelihood.", e, (-1e9, False))
+
+            # ORT-GenAI's ``set_inputs`` only loads the prompt + multimodal embeds;
+            # it does NOT run the decoder forward pass. ``get_logits()`` therefore
+            # returns an undefined buffer before any compute step. Trigger the
+            # prompt-fill forward pass with ``generate_next_token()`` (the sampled
+            # token is discarded via ``rewind_to`` after the first scoring
+            # iteration, before our chosen continuation token is appended).
+            token_count_after_prefill = generator.token_count()
+            generator.generate_next_token()
+
+            total_logprob = 0.0
+            all_greedy = True
+            for i, tok_id in enumerate(cont_tokens):
+                if generator.is_done():
+                    total_logprob += -50.0
+                    all_greedy = False
+                    continue
+                logits = np.asarray(generator.get_logits(), dtype=np.float64).reshape(-1)
+                if tok_id >= logits.shape[0]:
+                    del generator
+                    raise ValueError(f"Token id {tok_id} is outside logits vocabulary size {logits.shape[0]}.")
+                log_denom = np.logaddexp.reduce(logits)
+                total_logprob += float(logits[tok_id] - log_denom)
+                if int(np.argmax(logits)) != tok_id:
+                    all_greedy = False
+                if i == 0:
+                    # Drop the throwaway token sampled by ``generate_next_token``
+                    # above so ``append_tokens`` lands at end-of-prompt + cont[0],
+                    # not end-of-prompt + sampled + cont[0].
+                    generator.rewind_to(token_count_after_prefill)
+                generator.append_tokens(np.array([tok_id], dtype=np.int32))
+
+        del generator
+        return total_logprob, all_greedy
+
+    def _get_doc_and_visuals(self, doc_to_visual, doc_id, task, split):
+        """Return ``(doc, visuals_list)`` for a request; raise KeyError if the document is missing."""
+        try:
+            doc = self.task_dict[task][split][doc_id]
+        except (KeyError, IndexError, TypeError) as e:
+            message = f"Failed to find lmms-eval document task={task!r}, split={split!r}, doc_id={doc_id!r}"
+            raise KeyError(message) from e
+        visuals = doc_to_visual(doc) if doc_to_visual else None
+        if visuals is None:
+            return doc, []
+        # A tuple is a sequence of visuals, not a single visual to be wrapped.
+        if isinstance(visuals, (list, tuple)):
+            return doc, list(visuals)
+        return doc, [visuals]
+
+    # -------------------------------------------------------------------------
+    # lmms-eval Model interface
+    # -------------------------------------------------------------------------
+    def generate_until(self, requests: list[Instance], disable_tqdm: bool = False) -> list[str]:
+        """Generate one completion per request, honoring ``max_new_tokens`` and ``until`` gen kwargs."""
+        results = []
+        # The context manager closes the progress bar even when a request raises.
+        with tqdm(total=len(requests), desc="ortgenai_mm generate_until", disable=disable_tqdm) as pbar:
+            for req in requests:
+                contexts, gen_kwargs, doc_to_visual, doc_id, task, split = req.args
+                _, visuals = self._get_doc_and_visuals(doc_to_visual, doc_id, task, split)
+                images, audios = _partition_visuals(visuals)
+                gen_kwargs = gen_kwargs or {}
+                max_new = int(gen_kwargs.get("max_new_tokens", self.max_new_tokens))
+                stop = gen_kwargs.get("until", None)
+                if isinstance(stop, str):
+                    stop = [stop]
+
+                prompt = _build_prompt(
+                    self._model_type,
+                    len(images),
+                    len(audios),
+                    contexts,
+                    self.system_prompt,
+                    self.prompt_template,
+                    self.image_token_format,
+                    self.audio_token_format,
+                )
+                text = self._run_generation(prompt, images, audios, max_new, stop)
+                results.append(text)
+                self.cache_hook.add_partial("generate_until", (contexts, gen_kwargs), text)
+                pbar.update(1)
+        return results
+
+    def loglikelihood(self, requests: list[Instance], disable_tqdm: bool = False) -> list[tuple[float, bool]]:
+        """Score each request's target as ``(logprob, is_greedy)``; the progress bar closes even on error."""
+        results = []
+        with tqdm(total=len(requests), desc="ortgenai_mm loglikelihood", disable=disable_tqdm) as pbar:
+            for req in requests:
+                contexts, doc_to_target, doc_to_visual, doc_id, task, split = req.args
+                doc, visuals = self._get_doc_and_visuals(doc_to_visual, doc_id, task, split)
+                images, audios = _partition_visuals(visuals)
+                continuation = str(doc_to_target(doc))
+
+                prompt = _build_prompt(
+                    self._model_type,
+                    len(images),
+                    len(audios),
+                    contexts,
+                    self.system_prompt,
+                    self.prompt_template,
+                    self.image_token_format,
+                    self.audio_token_format,
+                )
+                logprob, is_greedy = self._score_continuation(prompt, continuation, images, audios)
+                results.append((logprob, is_greedy))
+                self.cache_hook.add_partial("loglikelihood", (contexts, continuation), (logprob, is_greedy))
+                pbar.update(1)
+        return results
+
+    def generate_until_multi_round(self, requests) -> list[str]:
+        raise NotImplementedError("ortgenai_mm does not support lmms-eval multi-round generation yet.")
+
+
+# -----------------------------------------------------------------------------
+# lmms-eval MODEL_REGISTRY_V2 entry-point factory.
+#
+# Exposed via setup.py entry_points["lmms_eval.models"], so a fresh install of
+# olive-ai makes ``--model ortgenai_mm`` discoverable from the lmms-eval CLI
+# (e.g. ``python -m lmms_eval --model ortgenai_mm ...``) without requiring the
+# caller to import this module first.
+#
+# lmms-eval's ``ModelRegistryV2.load_entrypoint_manifests`` accepts a
+# ``Callable`` payload, so we keep the import of ``ModelManifest`` lazy. That
+# way ``olive`` (and the rest of this module) stays importable when lmms-eval
+# is not installed.
+# -----------------------------------------------------------------------------
+def _model_manifest():
+    """Return the lmms-eval ModelManifest that registers ``ortgenai_mm``.
+
+    Entry-point payload for MODEL_REGISTRY_V2; lmms-eval is imported lazily here.
+    """
+    from lmms_eval.models.registry_v2 import ModelManifest
+
+    return ModelManifest(
+        model_id="ortgenai_mm",
+        simple_class_path="olive.evaluator.lmms_ort.LMMSORTGenAIEvaluator",
+    )
diff --git a/olive/evaluator/olive_evaluator.py b/olive/evaluator/olive_evaluator.py
index 96378de129..af73fabe64 100644
--- a/olive/evaluator/olive_evaluator.py
+++ b/olive/evaluator/olive_evaluator.py
@@ -2010,6 +2010,255 @@ def evaluate(
         return flatten_metric_result(metrics)
 
 
+@Registry.register("LMMSEvaluator")
+class LMMSEvaluator(OliveEvaluator):
+    """Evaluator for multimodal models using lmms-eval (EvolvingLMMs-Lab/lmms-eval).
+
+    Supports two model handler types:
+
+    1. :class:`ONNXModelHandler` whose path is an ORT-GenAI multimodal package
+       (directory containing ``genai_config.json`` plus quantized ONNX files,
+       typically produced by ``MobiusBuilder`` + ``OnnxKQuantQuantization``).
+       Dispatches to the ``ortgenai_mm`` adapter in :mod:`olive.evaluator.lmms_ort`.
+
+    2. :class:`HfModelHandler` for HuggingFace PyTorch multimodal models.
+       Dispatches to lmms-eval's native wrapper for the model architecture
+       (e.g. ``phi4_multimodal``, ``qwen2_5_vl``). The wrapper is auto-detected
+       from the HF ``model_type`` field; pass ``model_class`` in the recipe to
+       override.
+
+    Raw single-file ONNX models are intentionally not supported: multimodal
+    inference requires the vision/audio preprocessing pipeline that ORT-GenAI's
+    multimodal processor provides; a bare ``onnxruntime.InferenceSession``
+    cannot do image or audio tokenization on its own.
+
+    Example recipe config::
+
+        "evaluators": {
+            "evaluator": {
+                "type": "LMMSEvaluator",
+                "tasks": ["ai2d_lite", "ocrbench"],
+                "batch_size": 1,
+                "limit": 4
+            }
+        },
+        "evaluator": "evaluator"
+    """
+
+    # HuggingFace model_type -> lmms-eval model_class (canonical id).
+    # Covers the multimodal architectures most relevant to Olive sweeps; other
+    # architectures still work if ``model_class`` is set explicitly in the
+    # recipe. Verified against lmms-eval's AVAILABLE_SIMPLE_MODELS /
+    # AVAILABLE_CHAT_TEMPLATE_MODELS registries.
+    _HF_MODEL_TYPE_TO_LMMS_CLASS: ClassVar[dict[str, str]] = {
+        "phi4mm": "phi4_multimodal",
+        "phi3_v": "phi3v",
+        "qwen2_vl": "qwen2_vl",
+        "qwen2_5_vl": "qwen2_5_vl",
+        "qwen3_vl": "qwen3_vl",
+        "qwen2_audio": "qwen2_audio",
+        "qwen2_5_omni": "qwen2_5_omni",
+        "qwen3_omni": "qwen3_omni",
+        "whisper": "whisper",
+        "gemma3": "gemma3",
+        "minicpm_o": "minicpm_o",
+        "llava": "llava",
+        "llava_onevision": "llava_onevision",
+        "internvl_chat": "internvl",
+    }
+
+    def __init__(self, tasks: list[str], **kwargs):
+        super().__init__(**kwargs)
+        self.tasks = tasks
+        self.limit = kwargs.get("limit")
+        self.model_class = kwargs.get("model_class")
+        self.batch_size = kwargs.get("batch_size", 1)
+        self.max_new_tokens = kwargs.get("max_new_tokens", 256)
+        self.max_length = kwargs.get("max_length", 32768)
+        self.system_prompt = kwargs.get("system_prompt", "You are a helpful AI assistant.")
+        self.ep = kwargs.get("execution_provider")
+        self.ep_options = kwargs.get("provider_options")
+        self.log_samples = bool(kwargs.get("log_samples", False))
+        self.output_path = kwargs.get("output_path")
+        self.fail_on_error = bool(kwargs.get("fail_on_error", True))
+        self.prompt_template = kwargs.get("prompt_template")
+        self.image_token_format = kwargs.get("image_token_format", "<|image_{index}|>")
+        self.audio_token_format = kwargs.get("audio_token_format", "<|audio_{index}|>")
+        # HF-only knobs (forwarded to lmms-eval's native wrapper if present).
+        # ``trust_remote_code`` defaults to False to match the rest of Olive
+        # (e.g. olive/common/hf/utils.py, olive/data/component/load_dataset.py)
+        # and avoid silently executing arbitrary Hub code at load time. Users
+        # who need Phi-4-MM, MiniCPM-o, etc. opt in explicitly via the recipe.
+        self.dtype = kwargs.get("dtype", "auto")
+        self.trust_remote_code = bool(kwargs.get("trust_remote_code", False))
+        self.hf_model_kwargs = kwargs.get("hf_model_kwargs") or {}
+
+    @staticmethod
+    def _resolve_model_dir(model: ONNXModelHandler) -> Path:
+        model_path = Path(model.model_path)
+        return model_path if model_path.is_dir() else model_path.parent
+
+    @staticmethod
+    def _resolve_execution_provider(execution_providers: Optional[Union[str, list[str]]]):
+        """Return the first provider of a list, the value itself otherwise, or None if empty."""
+        if not execution_providers:
+            return None
+        is_list = isinstance(execution_providers, list)
+        return execution_providers[0] if is_list else execution_providers
+
+    @staticmethod
+    def _device_for_hf(device: Device) -> str:
+        # lmms-eval's HF wrappers accept "cuda", "cpu", or a torch.device.
+        return "cuda" if device == Device.GPU else "cpu"
+
+    def _build_ortgenai_mm_lm(
+        self,
+        model: ONNXModelHandler,
+        execution_providers: Optional[Union[str, list[str]]],
+    ):
+        """Instantiate the ``ortgenai_mm`` adapter for an ORT-GenAI package directory."""
+        from olive.evaluator.lmms_ort import LMMSORTGenAIEvaluator
+
+        package_dir = self._resolve_model_dir(model)
+        if not (package_dir / "genai_config.json").exists():
+            raise ValueError(
+                "LMMSEvaluator requires an ORT-GenAI package "
+                "(directory containing genai_config.json) for ONNXModelHandler input. "
+                f"Got ONNXModelHandler without genai_config at {model.model_path}. "
+                "Raw single-file ONNX is not supported for multimodal evaluation because "
+                "the vision/audio preprocessing pipeline lives in ORT-GenAI's multimodal "
+                "processor; use HfModelHandler or an ORT-GenAI package instead."
+            )
+
+        logger.info("Running lmms-eval (model_class=ortgenai_mm, model_dir=%s)", str(package_dir))
+        return LMMSORTGenAIEvaluator(
+            pretrained=str(package_dir),
+            batch_size=self.batch_size,
+            max_new_tokens=self.max_new_tokens,
+            max_length=self.max_length,
+            system_prompt=self.system_prompt,
+            execution_provider=self.ep or self._resolve_execution_provider(execution_providers),
+            provider_options=self.ep_options,
+            fail_on_error=self.fail_on_error,
+            prompt_template=self.prompt_template,
+            image_token_format=self.image_token_format,
+            audio_token_format=self.audio_token_format,
+        )
+
+    def _resolve_hf_model_class(self, model: HfModelHandler) -> str:
+        if self.model_class:
+            return self.model_class
+        hf_model_type = model.get_hf_model_type()
+        lmms_class = self._HF_MODEL_TYPE_TO_LMMS_CLASS.get(hf_model_type)
+        if not lmms_class:
+            raise ValueError(
+                f"Could not auto-detect lmms-eval model_class for HF model_type={hf_model_type!r}. "
+                f"Pass 'model_class' in the evaluator config (e.g. one of "
+                f"{sorted(self._HF_MODEL_TYPE_TO_LMMS_CLASS.values())}, or any other "
+                f"name registered with lmms-eval)."
+            )
+        return lmms_class
+
+    def _build_hf_lm(self, model: HfModelHandler, device: Device):
+        import inspect
+
+        from lmms_eval.models import get_model as lmms_get_model
+
+        model_class = self._resolve_hf_model_class(model)
+        lm_cls = lmms_get_model(model_class)
+
+        # lmms-eval wrappers have inconsistent constructor signatures: phi4_multimodal
+        # accepts dtype/trust_remote_code as named params, qwen2_5_vl asserts
+        # ``kwargs == {}`` at runtime even though it has ``**kwargs`` in its signature.
+        # Because the signature alone cannot tell "absorbs unknown kwargs" from
+        # "rejects unknown kwargs at runtime", we conservatively only forward
+        # ``device/dtype/trust_remote_code`` to wrappers that name them explicitly
+        # as parameters. For wrappers that take a different set of kwargs (e.g.
+        # backend-specific knobs), users pass them through ``hf_model_kwargs``,
+        # which is always forwarded unfiltered as an explicit user opt-in.
+        try:
+            accepted = set(inspect.signature(lm_cls.__init__).parameters)
+        except (TypeError, ValueError):
+            accepted = set()
+
+        optional_kwargs = {
+            "device": self._device_for_hf(device),
+            "dtype": self.dtype,
+            "trust_remote_code": self.trust_remote_code,
+        }
+        forwarded = {k: v for k, v in optional_kwargs.items() if k in accepted}
+
+        init_kwargs = {
+            "pretrained": str(model.model_name_or_path),
+            "batch_size": self.batch_size,
+            **forwarded,
+            **self.hf_model_kwargs,
+        }
+        logger.info(
+            "Running lmms-eval (model_class=%s, pretrained=%s, forwarded_kwargs=%s)",
+            model_class,
+            init_kwargs["pretrained"],
+            sorted(set(init_kwargs) - {"pretrained", "batch_size"}),
+        )
+        return lm_cls(**init_kwargs)
+
+    def evaluate(
+        self,
+        model: "OliveModelHandler",
+        metrics: list[Metric],
+        device: Device = Device.CPU,
+        execution_providers: Optional[Union[str, list[str]]] = None,
+    ) -> MetricResult:
+        from lmms_eval.evaluator import simple_evaluate
+
+        if isinstance(model, ONNXModelHandler):
+            lm = self._build_ortgenai_mm_lm(model, execution_providers)
+        elif isinstance(model, HfModelHandler):
+            lm = self._build_hf_lm(model, device)
+        else:
+            raise ValueError(
+                "LMMSEvaluator supports ONNXModelHandler (ORT-GenAI multimodal package) "
+                f"and HfModelHandler. Got {type(model).__name__}."
+            )
+
+        results = simple_evaluate(
+            model=lm,
+            tasks=self.tasks,
+            batch_size=self.batch_size,
+            limit=self.limit,
+            log_samples=self.log_samples,
+        )
+
+        if self.output_path:
+            import json as _json
+
+            out = Path(self.output_path)
+            out.parent.mkdir(parents=True, exist_ok=True)
+            compact = {
+                "results": results.get("results", {}),
+                "configs": {k: str(v) for k, v in results.get("configs", {}).items()},
+            }
+            out.write_text(_json.dumps(compact, indent=2, default=str), encoding="utf-8")
+            logger.info("Wrote lmms-eval results to %s", out)
+
+        # Convert lmms-eval results into Olive's MetricResult shape (mirrors LMEvaluator)
+        metrics_dict: dict[str, MetricResult] = {}
+        for task_name in sorted(results.get("results", {}).keys()):
+            task_results = results["results"][task_name]
+            task_metrics = {}
+            for mf, v in sorted(task_results.items()):
+                if mf == "alias" or not isinstance(v, (int, float)):
+                    continue
+                m, _, _ = mf.partition(",")
+                if m.endswith("_stderr"):
+                    continue
+                task_metrics[m] = SubMetricResult(value=float(v), priority=-1, higher_is_better=True)
+            if task_metrics:
+                metrics_dict[task_name] = MetricResult.model_validate(task_metrics)
+
+        return flatten_metric_result(metrics_dict)
+
+
 @Registry.register("MTEBEvaluator")
 class MTEBEvaluator(OliveEvaluator):
     """Evaluator for embedding models using the MTEB (Massive Text Embedding Benchmark) library.
diff --git a/setup.py b/setup.py
index b4aebf070a..798010301d 100644
--- a/setup.py
+++ b/setup.py
@@ -88,5 +88,8 @@ def get_extra_deps(rel_path):
     data_files=[],
     entry_points={
         "console_scripts": ["olive=olive.cli.launcher:main"],
+        "lmms_eval.models": [
+            "ortgenai_mm = olive.evaluator.lmms_ort:_model_manifest",
+        ],
     },
 )
diff --git a/test/evaluator/test_lmms_ort.py b/test/evaluator/test_lmms_ort.py
new file mode 100644
index 0000000000..408be890ad
--- /dev/null
+++ b/test/evaluator/test_lmms_ort.py
@@ -0,0 +1,837 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+# Tests intentionally exercise "protected" runtime methods (_score_continuation,
+# _run_generation) and configure fake collaborators by setting attributes
+# directly on the fake. Both are normal in unit tests, so suppress pylint's
+# protected-access / attribute-defined-outside-init warnings for this file.
+# pylint: disable=protected-access,attribute-defined-outside-init
+import sys
+from types import ModuleType, SimpleNamespace
+from typing import ClassVar
+from unittest.mock import MagicMock, patch
+
+import numpy as np
+import PIL.Image
+import pytest
+
+from olive.evaluator.lmms_ort import (
+    LMMSORTGenAIEvaluator,
+    _build_prompt,
+    _model_manifest,
+    _normalize_execution_provider,
+)
+from olive.evaluator.olive_evaluator import LMMSEvaluator
+from olive.model import ONNXModelHandler
+
+
+def test_build_prompt_uses_default_phi4mm_tokens():
+    prompt = _build_prompt("phi4mm", 1, 1, "What happened?", "System prompt.")
+
+    assert prompt == "<|system|>System prompt.<|end|><|user|><|image_1|><|audio_1|>What happened?<|end|><|assistant|>"
+
+
+def test_build_prompt_uses_custom_template_and_token_formats():
+    prompt = _build_prompt(
+        "custom",
+        2,
+        1,
+        "Question",
+        "System",
+        prompt_template="{system_prompt}\n{image_tokens}{audio_tokens}\n{text}",
+        image_token_format="<image:{zero_index}>",
+        audio_token_format="<audio:{index}>",
+    )
+
+    assert prompt == "System\n<image:0><image:1><audio:1>\nQuestion"
+
+
+@pytest.mark.parametrize(
+    ("execution_provider", "expected"),
+    [
+        ("CUDAExecutionProvider", "cuda"),
+        ("CPUExecutionProvider", "cpu"),
+        ("DmlExecutionProvider", "dml"),
+        ("gpu", "cuda"),
+        (None, "follow_config"),
+        (("CUDAExecutionProvider", {"device_id": "0"}), "cuda"),
+    ],
+)
+def test_normalize_execution_provider(execution_provider, expected):
+    assert _normalize_execution_provider(execution_provider) == expected
+
+
+def test_lmms_evaluator_converts_lmms_results(tmp_path):
+    model_dir = tmp_path / "model"
+    model_dir.mkdir()
+    model_path = model_dir / "text.onnx"
+    model_path.touch()
+    (model_dir / "genai_config.json").write_text('{"model": {"type": "phi4mm"}}', encoding="utf-8")
+
+    output_path = tmp_path / "results.json"
+    evaluator = LMMSEvaluator(
+        tasks=["ai2d_lite"],
+        batch_size=1,
+        limit=2,
+        output_path=str(output_path),
+        fail_on_error=False,
+        prompt_template="{user_content}",
+        image_token_format="<image>",
+    )
+    model = ONNXModelHandler(model_path=str(model_path))
+
+    simple_evaluate_result = {
+        "results": {
+            "ai2d_lite": {
+                "alias": "AI2D Lite",
+                "exact_match,none": 0.5,
+                "exact_match_stderr,none": 0.1,
+                "samples": [{"ignored": True}],
+            }
+        },
+        "configs": {"ai2d_lite": {"task": "ai2d_lite"}},
+    }
+
+    simple_evaluate_mock = MagicMock(return_value=simple_evaluate_result)
+    lmms_eval_module = ModuleType("lmms_eval")
+    lmms_eval_evaluator_module = ModuleType("lmms_eval.evaluator")
+    lmms_eval_evaluator_module.simple_evaluate = simple_evaluate_mock
+
+    with (
+        patch.dict(
+            sys.modules,
+            {"lmms_eval": lmms_eval_module, "lmms_eval.evaluator": lmms_eval_evaluator_module},
+        ),
+        patch("olive.evaluator.lmms_ort.LMMSORTGenAIEvaluator", return_value=SimpleNamespace()) as lm_mock,
+    ):
+        result = evaluator.evaluate(model, [], execution_providers=["CUDAExecutionProvider"])
+
+    lm_mock.assert_called_once_with(
+        pretrained=str(model_dir),
+        batch_size=1,
+        max_new_tokens=256,
+        max_length=32768,
+        system_prompt="You are a helpful AI assistant.",
+        execution_provider="CUDAExecutionProvider",
+        provider_options=None,
+        fail_on_error=False,
+        prompt_template="{user_content}",
+        image_token_format="<image>",
+        audio_token_format="<|audio_{index}|>",
+    )
+    simple_evaluate_mock.assert_called_once()
+    assert result.get_value("ai2d_lite", "exact_match") == 0.5
+    assert output_path.exists()
+
+
+def test_lmms_evaluator_requires_genai_config(tmp_path):
+    model_path = tmp_path / "text.onnx"
+    model_path.touch()
+    evaluator = LMMSEvaluator(tasks=["ai2d_lite"])
+    model = ONNXModelHandler(model_path=str(model_path))
+
+    with pytest.raises(ValueError, match="requires an ORT-GenAI package"):
+        evaluator.evaluate(model, [])
+
+
+# -----------------------------------------------------------------------------
+# HuggingFace dispatch
+# -----------------------------------------------------------------------------
+
+
+def _make_hf_model_handler_stub(model_name_or_path: str, hf_model_type: str):
+    """Return a MagicMock posing as an HfModelHandler with the given path and HF model type."""
+    stub = MagicMock(name="HfModelHandler", model_name_or_path=model_name_or_path)
+    stub.get_hf_model_type.return_value = hf_model_type
+    return stub
+
+
+def _patch_isinstance_for_hf(handler_stub, monkeypatch):
+    """Force isinstance(handler_stub, HfModelHandler) to True for the test path.
+
+    Avoids constructing a real HfModelHandler (which would require a real HF
+    model on disk) while still exercising the dispatch logic.
+    """
+    import olive.evaluator.olive_evaluator as oe
+
+    real_isinstance = isinstance
+
+    def _isinstance(obj, cls):
+        if obj is handler_stub and cls is oe.HfModelHandler:
+            return True
+        if obj is handler_stub and cls is oe.ONNXModelHandler:
+            return False
+        return real_isinstance(obj, cls)
+
+    # raising=False lets monkeypatch create the module-level shadow of the builtin; it is undone on teardown.
+    monkeypatch.setattr(oe, "isinstance", _isinstance, raising=False)
+
+
+class _FakePhi4Wrapper:
+    """Fake lmms-eval wrapper with a phi4_multimodal-style signature.
+
+    Accepts dtype + trust_remote_code (mirrors lmms-eval's phi4_multimodal class).
+    """
+
+    last_kwargs: ClassVar[dict] = {}
+
+    def __init__(self, pretrained, device="cuda", dtype="auto", batch_size=1, trust_remote_code=True, **kwargs):
+        type(self).last_kwargs = {
+            "pretrained": pretrained,
+            "device": device,
+            "dtype": dtype,
+            "batch_size": batch_size,
+            "trust_remote_code": trust_remote_code,
+            **kwargs,
+        }
+
+
+class _FakeQwenWrapper:
+    """Fake lmms-eval wrapper with a qwen2_5_vl-style signature.
+
+    Does NOT accept dtype or trust_remote_code (mirrors lmms-eval's qwen2_5_vl
+    class which asserts kwargs == {}).
+    """
+
+    last_kwargs: ClassVar[dict] = {}
+
+    def __init__(self, pretrained, device="cuda", device_map="auto", batch_size=1, **kwargs):
+        if kwargs:
+            raise AssertionError(f"Unexpected kwargs: {kwargs}")
+        type(self).last_kwargs = {
+            "pretrained": pretrained,
+            "device": device,
+            "device_map": device_map,
+            "batch_size": batch_size,
+        }
+
+
+class _FakeKwargsWrapper:
+    """Fake lmms-eval wrapper that absorbs ALL options via ``**kwargs``.
+
+    Mirrors lmms-eval wrappers (and HF model wrappers more generally) that take
+    only the required ``pretrained`` argument by name and pass everything else
+    through ``**kwargs`` to the underlying HF transformers model. Used to verify
+    LMMSEvaluator does NOT auto-forward optional kwargs (dtype, trust_remote_code,
+    device) that the wrapper does not declare as named parameters, while explicit
+    ``hf_model_kwargs`` still get through. ``last_kwargs`` records the last call.
+    """
+
+    last_kwargs: ClassVar[dict] = {}
+
+    def __init__(self, pretrained, **kwargs):
+        type(self).last_kwargs = {"pretrained": pretrained, **kwargs}
+
+
+def test_lmms_evaluator_auto_detects_hf_model_class_from_model_type(tmp_path, monkeypatch):
+    """When model_class is unset, auto-detect from HfModelHandler.get_hf_model_type()."""
+    handler_stub = _make_hf_model_handler_stub("/local/path/Phi-4-multimodal-instruct", "phi4mm")
+    _patch_isinstance_for_hf(handler_stub, monkeypatch)
+
+    output_path = tmp_path / "results.json"
+    evaluator = LMMSEvaluator(tasks=["ai2d_lite"], batch_size=2, limit=4, output_path=str(output_path))
+
+    simple_evaluate_mock = MagicMock(
+        return_value={"results": {"ai2d_lite": {"alias": "AI2D", "exact_match,none": 0.75}}, "configs": {}}
+    )
+    lmms_eval_module = ModuleType("lmms_eval")
+    lmms_eval_evaluator_module = ModuleType("lmms_eval.evaluator")
+    lmms_eval_evaluator_module.simple_evaluate = simple_evaluate_mock
+    lmms_eval_models_module = ModuleType("lmms_eval.models")
+    lmms_eval_models_module.get_model = MagicMock(return_value=_FakePhi4Wrapper)
+
+    with patch.dict(
+        sys.modules,
+        {
+            "lmms_eval": lmms_eval_module,
+            "lmms_eval.evaluator": lmms_eval_evaluator_module,
+            "lmms_eval.models": lmms_eval_models_module,
+        },
+    ):
+        result = evaluator.evaluate(handler_stub, [])
+
+    lmms_eval_models_module.get_model.assert_called_once_with("phi4_multimodal")
+    assert _FakePhi4Wrapper.last_kwargs["pretrained"] == "/local/path/Phi-4-multimodal-instruct"
+    assert _FakePhi4Wrapper.last_kwargs["batch_size"] == 2
+    # trust_remote_code defaults to False (see olive/evaluator/olive_evaluator.py
+    # LMMSEvaluator.__init__); users opt in explicitly in the recipe.
+    assert _FakePhi4Wrapper.last_kwargs["trust_remote_code"] is False
+    assert _FakePhi4Wrapper.last_kwargs["dtype"] == "auto"
+    simple_evaluate_mock.assert_called_once()
+    assert result.get_value("ai2d_lite", "exact_match") == 0.75
+
+
+def test_lmms_evaluator_filters_kwargs_for_qwen_style_wrapper(monkeypatch):
+    """Wrappers like qwen2_5_vl reject unknown kwargs.
+
+    LMMSEvaluator must inspect the wrapper signature and only forward kwargs
+    the wrapper actually declares as named parameters.
+    """
+    handler_stub = _make_hf_model_handler_stub("/p/Qwen2.5-VL-3B-Instruct", "qwen2_5_vl")
+    _patch_isinstance_for_hf(handler_stub, monkeypatch)
+
+    evaluator = LMMSEvaluator(tasks=["mmstar"], batch_size=1, dtype="bfloat16", trust_remote_code=True)
+
+    simple_evaluate_mock = MagicMock(return_value={"results": {}, "configs": {}})
+    lmms_eval_module = ModuleType("lmms_eval")
+    lmms_eval_evaluator_module = ModuleType("lmms_eval.evaluator")
+    lmms_eval_evaluator_module.simple_evaluate = simple_evaluate_mock
+    lmms_eval_models_module = ModuleType("lmms_eval.models")
+    lmms_eval_models_module.get_model = MagicMock(return_value=_FakeQwenWrapper)
+
+    with patch.dict(
+        sys.modules,
+        {
+            "lmms_eval": lmms_eval_module,
+            "lmms_eval.evaluator": lmms_eval_evaluator_module,
+            "lmms_eval.models": lmms_eval_models_module,
+        },
+    ):
+        evaluator.evaluate(handler_stub, [])
+
+    # dtype + trust_remote_code must NOT have been forwarded (Qwen wrapper would error)
+    assert "dtype" not in _FakeQwenWrapper.last_kwargs
+    assert "trust_remote_code" not in _FakeQwenWrapper.last_kwargs
+    # but pretrained, device, batch_size MUST have been forwarded
+    assert _FakeQwenWrapper.last_kwargs["pretrained"] == "/p/Qwen2.5-VL-3B-Instruct"
+    assert _FakeQwenWrapper.last_kwargs["device"] == "cpu"  # Device.CPU default
+    assert _FakeQwenWrapper.last_kwargs["batch_size"] == 1
+
+
+def test_lmms_evaluator_does_not_forward_to_pure_var_keyword_wrappers(monkeypatch):
+    """Verify ``device``/``dtype``/``trust_remote_code`` are NOT auto-forwarded to ``**kwargs`` wrappers.
+
+    Rationale: the signature alone cannot tell "absorbs unknowns" from "rejects
+    unknowns at runtime" (qwen2_5_vl has ``**kwargs`` and asserts ``kwargs ==
+    {}``). To stay safe, only kwargs named explicitly as parameters are
+    auto-forwarded. Users who want to forward additional kwargs to a pure
+    ``**kwargs`` wrapper must use ``hf_model_kwargs`` (explicit user opt-in).
+    """
+    handler_stub = _make_hf_model_handler_stub("/p/some-vlm", "qwen2_5_vl")
+    _patch_isinstance_for_hf(handler_stub, monkeypatch)
+
+    evaluator = LMMSEvaluator(
+        tasks=["mmstar"],
+        batch_size=1,
+        dtype="bfloat16",
+        trust_remote_code=True,
+        # The escape hatch for forwarding arbitrary kwargs to a wrapper that
+        # absorbs them via **kwargs:
+        hf_model_kwargs={"custom_backend_opt": "value"},
+    )
+
+    simple_evaluate_mock = MagicMock(return_value={"results": {}, "configs": {}})
+    lmms_eval_module = ModuleType("lmms_eval")
+    lmms_eval_evaluator_module = ModuleType("lmms_eval.evaluator")
+    lmms_eval_evaluator_module.simple_evaluate = simple_evaluate_mock
+    lmms_eval_models_module = ModuleType("lmms_eval.models")
+    lmms_eval_models_module.get_model = MagicMock(return_value=_FakeKwargsWrapper)
+
+    with patch.dict(
+        sys.modules,
+        {
+            "lmms_eval": lmms_eval_module,
+            "lmms_eval.evaluator": lmms_eval_evaluator_module,
+            "lmms_eval.models": lmms_eval_models_module,
+        },
+    ):
+        evaluator.evaluate(handler_stub, [])
+
+    # Required kwargs are always forwarded.
+    assert _FakeKwargsWrapper.last_kwargs["pretrained"] == "/p/some-vlm"
+    assert _FakeKwargsWrapper.last_kwargs["batch_size"] == 1
+    # Optional kwargs are NOT forwarded to pure **kwargs wrappers.
+    assert "dtype" not in _FakeKwargsWrapper.last_kwargs
+    assert "trust_remote_code" not in _FakeKwargsWrapper.last_kwargs
+    assert "device" not in _FakeKwargsWrapper.last_kwargs
+    # The explicit hf_model_kwargs escape hatch IS forwarded.
+    assert _FakeKwargsWrapper.last_kwargs["custom_backend_opt"] == "value"
+
+
+def test_lmms_evaluator_uses_explicit_model_class_when_set(monkeypatch):
+    """An explicit ``model_class`` in the recipe overrides auto-detection."""
+    handler_stub = _make_hf_model_handler_stub("/p/some-vlm", "some-unknown-vlm-type")
+    _patch_isinstance_for_hf(handler_stub, monkeypatch)
+
+    evaluator = LMMSEvaluator(tasks=["mmstar"], model_class="qwen2_5_vl", batch_size=1)
+
+    simple_evaluate_mock = MagicMock(return_value={"results": {}, "configs": {}})
+    lmms_eval_module = ModuleType("lmms_eval")
+    lmms_eval_evaluator_module = ModuleType("lmms_eval.evaluator")
+    lmms_eval_evaluator_module.simple_evaluate = simple_evaluate_mock
+    lmms_eval_models_module = ModuleType("lmms_eval.models")
+    lmms_eval_models_module.get_model = MagicMock(return_value=_FakeQwenWrapper)
+
+    with patch.dict(
+        sys.modules,
+        {
+            "lmms_eval": lmms_eval_module,
+            "lmms_eval.evaluator": lmms_eval_evaluator_module,
+            "lmms_eval.models": lmms_eval_models_module,
+        },
+    ):
+        evaluator.evaluate(handler_stub, [])
+
+    lmms_eval_models_module.get_model.assert_called_once_with("qwen2_5_vl")
+    handler_stub.get_hf_model_type.assert_not_called()
+
+
+def test_lmms_evaluator_raises_when_hf_model_type_is_unmapped(monkeypatch):
+    """If we can't auto-detect and the user didn't set model_class, fail loudly."""
+    handler_stub = _make_hf_model_handler_stub("/p/exotic-model", "some-exotic-vlm")
+    _patch_isinstance_for_hf(handler_stub, monkeypatch)
+
+    evaluator = LMMSEvaluator(tasks=["mmstar"])
+
+    # Even with lmms_eval modules mocked, the error fires before reaching them.
+    lmms_eval_module = ModuleType("lmms_eval")
+    lmms_eval_evaluator_module = ModuleType("lmms_eval.evaluator")
+    lmms_eval_evaluator_module.simple_evaluate = MagicMock()
+    lmms_eval_models_module = ModuleType("lmms_eval.models")
+    lmms_eval_models_module.get_model = MagicMock()
+    with (
+        patch.dict(
+            sys.modules,
+            {
+                "lmms_eval": lmms_eval_module,
+                "lmms_eval.evaluator": lmms_eval_evaluator_module,
+                "lmms_eval.models": lmms_eval_models_module,
+            },
+        ),
+        pytest.raises(ValueError, match=r"Could not auto-detect lmms-eval model_class"),
+    ):
+        evaluator.evaluate(handler_stub, [])
+
+
+def test_lmms_evaluator_rejects_unsupported_handler_type():
+    """LMMSEvaluator only supports HfModelHandler and ONNXModelHandler-as-ortgenai."""
+    evaluator = LMMSEvaluator(tasks=["mmstar"])
+    bogus = SimpleNamespace()  # neither HfModelHandler nor ONNXModelHandler
+
+    lmms_eval_module = ModuleType("lmms_eval")
+    lmms_eval_evaluator_module = ModuleType("lmms_eval.evaluator")
+    lmms_eval_evaluator_module.simple_evaluate = MagicMock()
+    with (
+        patch.dict(sys.modules, {"lmms_eval": lmms_eval_module, "lmms_eval.evaluator": lmms_eval_evaluator_module}),
+        pytest.raises(ValueError, match=r"ONNXModelHandler.*HfModelHandler"),
+    ):
+        evaluator.evaluate(bogus, [])
+
+
+# -----------------------------------------------------------------------------
+# lmms-eval MODEL_REGISTRY_V2 entry-point integration
+# -----------------------------------------------------------------------------
+
+
+def test_lmms_ort_genai_evaluator_is_simple_flag_matches_registration():
+    """Verify is_simple matches the lmms-eval registration type.
+
+    MODEL_REGISTRY_V2._validate_model_class requires the class' ``is_simple``
+    flag to match the registered model_type (``simple`` vs ``chat``). Our
+    adapter is registered with ``simple_class_path``, so ``is_simple`` must
+    be ``True``.
+    """
+    assert LMMSORTGenAIEvaluator.is_simple is True
+
+
+def test_model_manifest_factory_returns_expected_manifest():
+    """Verify the entry-point payload points at LMMSORTGenAIEvaluator."""
+    pytest.importorskip("lmms_eval")
+
+    manifest = _model_manifest()
+
+    assert manifest.model_id == "ortgenai_mm"
+    assert manifest.simple_class_path == "olive.evaluator.lmms_ort.LMMSORTGenAIEvaluator"
+    assert manifest.chat_class_path is None
+
+
+def test_olive_ai_registers_ortgenai_mm_entry_point():
+    """Verify olive-ai exposes ortgenai_mm via the lmms_eval.models entry-point group."""
+    from importlib.metadata import entry_points
+
+    eps = {ep.name: ep.value for ep in entry_points(group="lmms_eval.models")}
+    assert eps.get("ortgenai_mm") == "olive.evaluator.lmms_ort:_model_manifest"
+
+
+def test_model_registry_v2_resolves_ortgenai_mm():
+    """Verify lmms-eval's MODEL_REGISTRY_V2 resolves ortgenai_mm via the entry point."""
+    pytest.importorskip("lmms_eval")
+    from lmms_eval.models import MODEL_REGISTRY_V2
+
+    resolved = MODEL_REGISTRY_V2.resolve("ortgenai_mm")
+    assert resolved.model_id == "ortgenai_mm"
+    assert resolved.model_type == "simple"
+    assert resolved.class_path == "olive.evaluator.lmms_ort.LMMSORTGenAIEvaluator"
+
+    cls = MODEL_REGISTRY_V2.get_model_class("ortgenai_mm")
+    assert cls is LMMSORTGenAIEvaluator
+
+
+# -----------------------------------------------------------------------------
+# Visual partitioning
+# -----------------------------------------------------------------------------
+
+
+def test_partition_visuals_separates_images_and_audios_and_skips_nones():
+    from olive.evaluator.lmms_ort import _partition_visuals
+
+    img1 = PIL.Image.new("RGB", (4, 4))
+    img2_dict = {"bytes": _png_bytes(PIL.Image.new("RGB", (2, 2)))}
+    audio_dict = {"array": np.zeros(16, dtype=np.float32), "sampling_rate": 16000}
+
+    images, audios = _partition_visuals([img1, None, img2_dict, audio_dict, None])
+
+    assert len(images) == 2
+    assert all(isinstance(img, PIL.Image.Image) for img in images)
+    assert len(audios) == 1
+    arr, sr = audios[0]
+    assert arr.shape == (16,)
+    assert sr == 16000
+
+
+def test_partition_visuals_handles_none_input():
+    from olive.evaluator.lmms_ort import _partition_visuals
+
+    assert _partition_visuals(None) == ([], [])
+    assert _partition_visuals([]) == ([], [])
+
+
+def _png_bytes(image):
+    """Encode ``image`` as PNG into an in-memory buffer and return the raw bytes."""
+    from io import BytesIO
+    stream = BytesIO()
+    image.save(stream, format="PNG")
+    return stream.getvalue()
+
+
+# -----------------------------------------------------------------------------
+# Runtime: _score_continuation and _run_generation
+#
+# These tests construct a LMMSORTGenAIEvaluator with the onnxruntime_genai
+# module wholesale-mocked, so the generation/scoring flows are exercised
+# end-to-end without needing an actual ORT-GenAI model on disk.
+# -----------------------------------------------------------------------------
+
+
+class _FakeTokenStream:
+    def decode(self, tok):
+        return f"<t{tok}>"
+
+
+class _FakeTokenizer:
+    """Minimal stub for og.Tokenizer used by _score_continuation/_run_generation.
+
+    ``tokens_for`` is a dict mapping the exact input string to the list of
+    token ids encode() should return; this lets each test inject specific
+    prompt + (prompt+continuation) tokenizations to drive the slicing logic.
+    """
+
+    def __init__(self, tokens_for, eos_token_ids=(99,)):
+        self._tokens_for = tokens_for
+        self.eos_token_ids = list(eos_token_ids)
+
+    def encode(self, text):
+        if text not in self._tokens_for:
+            raise KeyError(f"_FakeTokenizer: no canned tokenization for text={text!r}")
+        return list(self._tokens_for[text])
+
+    def create_stream(self):
+        return _FakeTokenStream()
+
+
+class _FakeGenerator:
+    """Records call order so tests can assert the score/generation protocol.
+
+    ``logits_queue`` is a list of numpy arrays consumed in order, one per forward
+    pass: ``generate_next_token`` and ``append_tokens`` each consume one entry
+    into ``_current_logits``, which ``get_logits()`` returns.
+    """
+
+    instances: ClassVar[list] = []
+
+    def __init__(self, model, params):
+        type(self).instances.append(self)
+        self._model = model
+        self._params = params
+        self._logits_queue = list(model._next_logits_queue)
+        self._sampled_queue = list(model._next_sampled_queue)
+        self._current_logits = None
+        self._token_count = 0
+        self._done = False
+        self._last_sampled = -1
+        self.calls = []
+
+    def _consume_forward_pass(self):
+        if not self._logits_queue:
+            raise RuntimeError("_FakeGenerator: forward pass exhausted (logits_queue empty)")
+        self._current_logits = self._logits_queue.pop(0)
+
+    def set_inputs(self, inputs):
+        # set_inputs only loads inputs; it does NOT trigger a forward pass and
+        # therefore does NOT populate _current_logits. This is the behavior the
+        # production code at lmms_ort.py:_score_continuation has to compensate
+        # for by calling generate_next_token() to force the prompt-fill compute.
+        self.calls.append(("set_inputs", inputs))
+
+    def generate_next_token(self):
+        self._consume_forward_pass()
+        sampled = self._sampled_queue.pop(0) if self._sampled_queue else -1
+        self._last_sampled = sampled
+        self._token_count += 1
+        self.calls.append(("generate_next_token", sampled))
+
+    def get_logits(self):
+        if self._current_logits is None:
+            raise RuntimeError("_FakeGenerator: get_logits called before any forward pass")
+        return np.asarray(self._current_logits, dtype=np.float32)
+
+    def get_next_tokens(self):
+        return np.array([self._last_sampled], dtype=np.int32)
+
+    def append_tokens(self, tok_array):
+        toks = [int(t) for t in np.asarray(tok_array).reshape(-1)]
+        self.calls.append(("append_tokens", toks))
+        self._token_count += len(toks)
+        # Each appended token batch is one forward pass (computes new last-position logits).
+        self._consume_forward_pass()
+
+    def token_count(self):
+        return self._token_count
+
+    def is_done(self):
+        return self._done
+
+    def rewind_to(self, n):
+        self.calls.append(("rewind_to", n))
+        self._token_count = n
+
+
+class _FakeGeneratorParams:
+    def __init__(self, model):
+        self._model = model
+
+    def set_search_options(self, **kwargs):
+        pass
+
+
+class _FakeProcessor:
+    def __call__(self, prompt, images=None, audios=None):
+        return SimpleNamespace(_prompt=prompt, _images=images, _audios=audios)
+
+
+class _FakeOgModel:
+    def __init__(self, tokenizer=None, next_logits_queue=None, next_sampled_queue=None):
+        self.tokenizer = tokenizer
+        self._next_logits_queue = list(next_logits_queue or [])
+        self._next_sampled_queue = list(next_sampled_queue or [])
+
+    def create_multimodal_processor(self):
+        return _FakeProcessor()
+
+
+def _make_fake_og(model):
+    """Build a SimpleNamespace mimicking the onnxruntime_genai module surface.
+
+    Covers the API surface used by LMMSORTGenAIEvaluator's __init__ + runtime paths.
+    """
+    return SimpleNamespace(
+        Model=lambda *a, **kw: model,
+        Tokenizer=lambda m: model.tokenizer,
+        Generator=_FakeGenerator,
+        GeneratorParams=_FakeGeneratorParams,
+        Images=SimpleNamespace(open=lambda *paths: ("IMG", list(paths))),
+        Audios=SimpleNamespace(open=lambda *paths: ("AUDIO", list(paths))),
+        Config=lambda *a, **kw: SimpleNamespace(
+            clear_providers=lambda: None,
+            append_provider=lambda *a, **kw: None,
+            set_provider_option=lambda *a, **kw: None,
+        ),
+    )
+
+
+def _build_lmms_ortgenai_evaluator(tmp_path, fake_model):
+    """Construct an LMMSORTGenAIEvaluator wired to a fake onnxruntime_genai.
+
+    Returns ``(evaluator, og_patcher)``. The patcher is a context manager that
+    swaps in the fake ``og`` module; tests should call evaluator methods inside
+    ``with og_patcher: ...`` so runtime paths (``_score_continuation``,
+    ``_run_generation``) also use the fake instead of the real ORT-GenAI.
+    """
+    from olive.evaluator import lmms_ort as lmms_ort_mod
+
+    model_dir = tmp_path / "model"
+    model_dir.mkdir()
+    (model_dir / "genai_config.json").write_text('{"model": {"type": "phi4mm"}}', encoding="utf-8")
+
+    fake_og = _make_fake_og(fake_model)
+    _FakeGenerator.instances = []
+    og_patcher = patch.object(lmms_ort_mod, "og", fake_og)
+    with og_patcher:
+        evaluator = LMMSORTGenAIEvaluator(
+            pretrained=str(model_dir),
+            batch_size=1,
+            max_new_tokens=8,
+            max_length=64,
+            execution_provider="cpu",
+            fail_on_error=True,
+        )
+    # Return a fresh patcher for the test to use during runtime calls.
+    return evaluator, patch.object(lmms_ort_mod, "og", fake_og)
+
+
+def test_score_continuation_uses_joint_tokenization_to_slice_continuation(tmp_path):
+    """Encoding ``continuation`` standalone is wrong for sentencepiece/BPE tokenizers.
+
+    The adapter must encode ``prompt + continuation`` jointly and slice off the
+    prompt-aligned prefix so the scored tokens are the ones the model would
+    actually produce extending the prompt. Verify by giving the tokenizer
+    DIFFERENT tokens for ``continuation`` vs the prompt-suffix of
+    ``prompt + continuation``: only the latter should land in append_tokens.
+    """
+    prompt = "<|user|>What is in the image?<|end|><|assistant|>"
+    continuation = "A"
+
+    fake_tokenizer = _FakeTokenizer(
+        {
+            prompt: [1, 2, 3, 4],
+            prompt + continuation: [1, 2, 3, 4, 17, 18],  # cont tokens = [17, 18]
+            continuation: [99999],  # standalone-encoded - MUST NOT be used
+        }
+    )
+
+    # Three forward passes: one prompt-fill (generate_next_token) + two cont tokens.
+    vocab_size = 50
+    logits_prompt_end = np.full(vocab_size, -10.0, dtype=np.float32)
+    logits_prompt_end[17] = 5.0  # greedy = 17
+    logits_after_17 = np.full(vocab_size, -10.0, dtype=np.float32)
+    logits_after_17[18] = 5.0  # greedy = 18
+    logits_after_18 = np.full(vocab_size, -10.0, dtype=np.float32)
+
+    fake_model = _FakeOgModel(
+        tokenizer=fake_tokenizer,
+        next_logits_queue=[logits_prompt_end, logits_after_17, logits_after_18],
+        next_sampled_queue=[42],
+    )
+
+    evaluator, og_patcher = _build_lmms_ortgenai_evaluator(tmp_path, fake_model)
+
+    with og_patcher:
+        logprob, is_greedy = evaluator._score_continuation(prompt, continuation, images=[], audios=[])
+
+    gen = _FakeGenerator.instances[-1]
+    set_inputs_idx = next(i for i, c in enumerate(gen.calls) if c[0] == "set_inputs")
+    next_token_idx = next(i for i, c in enumerate(gen.calls) if c[0] == "generate_next_token")
+    append_calls = [c for c in gen.calls if c[0] == "append_tokens"]
+
+    # set_inputs runs BEFORE the prompt-fill forward pass.
+    assert set_inputs_idx < next_token_idx
+    # The throwaway sample is rewound after the first iteration, before the first
+    # real continuation token is appended.
+    rewind_calls = [c for c in gen.calls if c[0] == "rewind_to"]
+    assert len(rewind_calls) == 1
+    # cont_tokens were correctly sliced from prompt+continuation, NOT taken from
+    # encode(continuation) standalone (which would have been [99999]).
+    assert [tok for _, tok in append_calls] == [[17], [18]]
+    # Both predicted tokens were greedy (== argmax of their position's logits).
+    assert is_greedy is True
+    assert logprob < 0.0  # softmax(logits)[tok] is a probability in (0, 1) -> log negative
+
+
+def test_score_continuation_triggers_forward_pass_before_first_get_logits(tmp_path):
+    """Trigger compute after ``set_inputs`` before reading ``get_logits()``.
+
+    ``set_inputs()`` does not run the decoder forward pass. The adapter must
+    explicitly trigger it (``generate_next_token``) before the first
+    ``get_logits()`` call, or that read returns undefined data.
+    """
+    prompt = "<|user|>x<|end|><|assistant|>"
+    continuation = "y"
+    fake_tokenizer = _FakeTokenizer({prompt: [1, 2], prompt + continuation: [1, 2, 5], continuation: [777]})
+    logits = np.zeros(10, dtype=np.float32)
+    logits[5] = 1.0
+    fake_model = _FakeOgModel(
+        tokenizer=fake_tokenizer,
+        next_logits_queue=[logits, np.zeros(10, dtype=np.float32)],
+        next_sampled_queue=[0],
+    )
+
+    evaluator, og_patcher = _build_lmms_ortgenai_evaluator(tmp_path, fake_model)
+    with og_patcher:
+        evaluator._score_continuation(prompt, continuation, images=[], audios=[])
+
+    gen = _FakeGenerator.instances[-1]
+    # The order must be: set_inputs, generate_next_token (prompt-fill compute),
+    # then the loop's append_tokens. get_logits is read implicitly between
+    # generate_next_token and the first append_tokens.
+    op_names = [c[0] for c in gen.calls]
+    set_inputs_idx = op_names.index("set_inputs")
+    next_token_idx = op_names.index("generate_next_token")
+    first_append_idx = op_names.index("append_tokens")
+    assert set_inputs_idx < next_token_idx < first_append_idx
+
+
+def test_score_continuation_returns_zero_when_continuation_tokenizes_to_empty_suffix(tmp_path):
+    """Short-circuit cleanly when continuation contributes no tokens.
+
+    Happens when prompt+cont tokenizes to the same length as prompt (e.g. cont
+    is just whitespace absorbed by tokenizer normalization). The adapter must
+    short-circuit instead of feeding an empty cont_tokens list to the loop.
+    """
+    prompt = "<|user|>x<|end|><|assistant|>"
+    continuation = ""
+    fake_tokenizer = _FakeTokenizer({prompt: [1, 2, 3], prompt + continuation: [1, 2, 3]})
+    fake_model = _FakeOgModel(tokenizer=fake_tokenizer)
+
+    evaluator, og_patcher = _build_lmms_ortgenai_evaluator(tmp_path, fake_model)
+    with og_patcher:
+        logprob, is_greedy = evaluator._score_continuation(prompt, continuation, images=[], audios=[])
+
+    assert (logprob, is_greedy) == (0.0, True)
+    # No generator should have been constructed for an empty continuation.
+    assert not _FakeGenerator.instances
+
+
+def test_run_generation_stops_on_eos_token(tmp_path):
+    """Verify ``_run_generation`` stops at EOS tokens.
+
+    ``_run_generation`` must respect ``_eos_token_ids`` and stop emitting text
+    once the model samples an EOS token, even before max_new_tokens is reached.
+    """
+    fake_tokenizer = _FakeTokenizer({}, eos_token_ids=[99])
+    logits = np.zeros(100, dtype=np.float32)
+    fake_model = _FakeOgModel(
+        tokenizer=fake_tokenizer,
+        next_logits_queue=[logits] * 5,
+        next_sampled_queue=[7, 8, 99, 10, 11],  # third sample is EOS
+    )
+
+    evaluator, og_patcher = _build_lmms_ortgenai_evaluator(tmp_path, fake_model)
+    with og_patcher:
+        out = evaluator._run_generation("p", images=[], audios=[], max_new_tokens=5)
+
+    # Stream produces "<t{tok}>" per non-EOS token; EOS stops generation.
+    assert out == "<t7><t8>"
+    gen = _FakeGenerator.instances[-1]
+    op_names = [c[0] for c in gen.calls]
+    # Exactly 3 generate_next_token calls happened (7, 8, then EOS stops loop).
+    assert op_names.count("generate_next_token") == 3
+
+
+def test_run_generation_stops_on_explicit_stop_string(tmp_path):
+    """``stop_strings`` should truncate output as soon as a stop sequence appears."""
+    fake_tokenizer = _FakeTokenizer({}, eos_token_ids=[])
+    logits = np.zeros(10, dtype=np.float32)
+    fake_model = _FakeOgModel(
+        tokenizer=fake_tokenizer,
+        next_logits_queue=[logits] * 4,
+        next_sampled_queue=[1, 2, 3, 4],
+    )
+
+    evaluator, og_patcher = _build_lmms_ortgenai_evaluator(tmp_path, fake_model)
+    # _FakeTokenStream emits "<t1><t2><t3>..."; "<t2>" appears after the 2nd token.
+    with og_patcher:
+        out = evaluator._run_generation("p", images=[], audios=[], max_new_tokens=10, stop_strings=["<t2>"])
+
+    # Output is truncated at (but not including) the stop string.
+    assert out == "<t1>"

From ba5d1604c9c8123ca869ee11560eb9d30f8406b6 Mon Sep 17 00:00:00 2001
From: Delwin Kim <139003345+DelwinKim@users.noreply.github.com>
Date: Mon, 15 Jun 2026 20:09:01 +0000
Subject: [PATCH 2/6] evaluator: extend LMMSEvaluator HF dispatch + add
 CompositeToOnnxPackage

Build on top of the LMMSEvaluator + ORT-GenAI multimodal adapter foundation:

- LMMSEvaluator now dispatches HfModelHandler inputs to lmms-eval's native
  per-architecture wrappers (phi4_multimodal, qwen2_5_vl, whisper, ...),
  with auto-detection from HF model_type and a forwarded-kwargs filter
  that only passes args the target wrapper actually declares (handles
  wrappers like qwen2_5_vl which assert kwargs == {}). Enables
  FP-vs-quantized comparison in a single recipe via evaluate_input_model.

- lmms_ort.py adapter: tolerant audio/image disambiguation (audio dicts
  with "path" no longer get mis-routed to PIL.Image.open), Whisper-specific
  prompt + EOS-collision handling so ASR works end-to-end through
  ortgenai_mm without the Phi-4-MM chat-template scaffolding interfering.

- New CompositeToOnnxPackage pass: flattens nested CompositeModel ORT-GenAI
  packages (subdir-per-component or root-level) into the flat layout
  LMMSEvaluator expects. Tolerates extensionless component filenames
  produced by some upstream quant passes.

- Tests: 32 in test_lmms_ort.py (entry-point/registry, HF dispatch,
  kwargs filter, prompt builder, score_continuation, partition_visuals,
  run_generation), 9 in test_composite_to_onnx_package.py (flatten +
  external-data rewrites + fallback entry-point).

Validated end-to-end:
- whisper-large-v3 via HfModel -> ModelBuilder fp16 -> KQuant int8 ->
  CompositeToOnnxPackage -> ortgenai_mm eval on LibriSpeech.
  FP HF WER 1.52/2.26 (clean/other), INT8 ONNX WER 1.68/2.36.
---
 olive/evaluator/lmms_ort.py                   | 172 ++++++---
 olive/evaluator/olive_evaluator.py            |   7 +
 olive/olive_config.json                       |   8 +
 .../passes/onnx/composite_to_onnx_package.py  | 309 +++++++++++++++++
 test/evaluator/test_lmms_ort.py               |  84 +++++
 .../onnx/test_composite_to_onnx_package.py    | 325 ++++++++++++++++++
 6 files changed, 863 insertions(+), 42 deletions(-)
 create mode 100644 olive/passes/onnx/composite_to_onnx_package.py
 create mode 100644 test/passes/onnx/test_composite_to_onnx_package.py

diff --git a/olive/evaluator/lmms_ort.py b/olive/evaluator/lmms_ort.py
index 63fc67fdb8..635f20cf70 100644
--- a/olive/evaluator/lmms_ort.py
+++ b/olive/evaluator/lmms_ort.py
@@ -75,52 +75,71 @@ def decorator(cls):
 # -----------------------------------------------------------------------------
 
 
+_IMAGE_SUFFIXES = {".jpg", ".jpeg", ".png", ".webp", ".bmp", ".gif"}
+_AUDIO_SUFFIXES = {".wav", ".mp3", ".flac", ".ogg", ".m4a"}
+
+
 def _normalize_image(visual) -> PIL.Image.Image | None:
     if isinstance(visual, PIL.Image.Image):
         return visual.convert("RGB")
     if isinstance(visual, (str, Path)):
         p = Path(visual)
-        if p.suffix.lower() in {".jpg", ".jpeg", ".png", ".webp", ".bmp", ".gif"}:
+        if p.suffix.lower() in _IMAGE_SUFFIXES:
             return PIL.Image.open(p).convert("RGB")
         return None
     if isinstance(visual, dict):
+        # Audio dicts typically include "sampling_rate" or "array"; skip those.
+        if "sampling_rate" in visual or "array" in visual:
+            return None
         if "bytes" in visual:
             return PIL.Image.open(io.BytesIO(visual["bytes"])).convert("RGB")
         if "path" in visual:
-            return PIL.Image.open(visual["path"]).convert("RGB")
+            p = Path(visual["path"]) if visual["path"] else None
+            if p is not None and p.suffix.lower() in _IMAGE_SUFFIXES:
+                return PIL.Image.open(p).convert("RGB")
+            return None
     if isinstance(visual, np.ndarray):
         return PIL.Image.fromarray(np.uint8(visual)).convert("RGB")
     return None
 
 
 def _normalize_audio(visual) -> tuple[np.ndarray, int] | None:
-    if isinstance(visual, dict) and "array" in visual and "sampling_rate" in visual:
-        return np.asarray(visual["array"], dtype=np.float32), int(visual["sampling_rate"])
+    if isinstance(visual, dict):
+        if "array" in visual and "sampling_rate" in visual:
+            return np.asarray(visual["array"], dtype=np.float32), int(visual["sampling_rate"])
+        if visual.get("path"):
+            return _load_audio_file(Path(visual["path"]))
     if isinstance(visual, (str, Path)):
-        p = Path(visual)
-        if p.suffix.lower() in {".wav", ".mp3", ".flac", ".ogg", ".m4a"}:
-            try:
-                import librosa
-            except ImportError:
-                logger.warning("Audio file %s encountered but librosa not installed.", p)
-                return None
-            arr, sr = librosa.load(str(p), sr=None, mono=True)
-            return arr.astype(np.float32), int(sr)
+        return _load_audio_file(Path(visual))
     return None
 
 
+def _load_audio_file(p: Path) -> tuple[np.ndarray, int] | None:
+    if p.suffix.lower() not in _AUDIO_SUFFIXES or not p.exists():
+        return None
+    try:
+        import librosa
+    except ImportError:
+        logger.warning("Audio file %s encountered but librosa not installed.", p)
+        return None
+    arr, sr = librosa.load(str(p), sr=None, mono=True)
+    return arr.astype(np.float32), int(sr)
+
+
 def _partition_visuals(visuals):
     images, audios = [], []
     for v in visuals or []:
         if v is None:
             continue
-        img = _normalize_image(v)
-        if img is not None:
-            images.append(img)
-            continue
+        # Try audio first since its signature ("array"+"sampling_rate") is more
+        # distinctive than the image path/bytes/PIL signatures.
         au = _normalize_audio(v)
         if au is not None:
             audios.append(au)
+            continue
+        img = _normalize_image(v)
+        if img is not None:
+            images.append(img)
     return images, audios
 
 
@@ -138,12 +157,23 @@ def _build_prompt(
     image_token_format: str = "<|image_{index}|>",
     audio_token_format: str = "<|audio_{index}|>",
 ) -> str:
-    """Build a Phi-4-multimodal-style chat prompt.
+    """Build a chat-style prompt for the model.
+
+    Defaults to Phi-4-multimodal's chat template. For Whisper (pure ASR; no
+    chat template), ``user_text`` is ignored and a fixed decoder-start token
+    sequence (``<|startoftranscript|><|en|><|transcribe|><|notimestamps|>``)
+    is returned to condition the decoder on language and task.
 
     Other multimodal architectures use different placeholder tags. Users can
-    override the media token formats and the full prompt template from the Olive
-    evaluator config without changing this adapter.
+    override the media token formats and the full prompt template from the
+    Olive evaluator config without changing this adapter.
     """
+    if model_type == "whisper":
+        # Whisper has no chat template; the "prompt" is just the decoder-start
+        # token sequence that conditions the model on language + task. This
+        # matches ORT-GenAI's benchmark_multimodal.py reference.
+        # Source: microsoft/onnxruntime-genai benchmark/python/benchmark_multimodal.py
+        return "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"
     image_tokens = "".join(_format_media_tokens(num_images, image_token_format))
     audio_tokens = "".join(_format_media_tokens(num_audios, audio_token_format))
     parts = [image_tokens, audio_tokens, user_text]
@@ -258,6 +288,18 @@ def __init__(
         self._tokenizer = og.Tokenizer(self._model)
         self._processor = self._model.create_multimodal_processor()
 
+        # Default prompt-builder path: og.Tokenizer.apply_chat_template (matches
+        # PR #2488's OnnxEvaluator._inference_vision_genai and the olive-recipes
+        # eval scripts for Qwen2.5-VL, Qwen3-VL, and google-gemma-4). Older
+        # onnxruntime-genai versions don't expose this method, in which case we
+        # fall back to the legacy format-string path (_build_prompt, via _build_prompt_for_request).
+        self._has_chat_template = hasattr(self._tokenizer, "apply_chat_template")
+        if not self._has_chat_template:
+            logger.warning(
+                "ORT-GenAI tokenizer does not expose apply_chat_template; falling back to "
+                "legacy format-string prompt building. Consider upgrading onnxruntime-genai."
+            )
+
         eos_ids = self._tokenizer.eos_token_ids
         self._eos_token_ids = {int(t) for t in (eos_ids if eos_ids is not None else [])}
 
@@ -338,7 +380,10 @@ def _run_generation(
             og_audios = self._build_og_audios(audios, tmp_dir)
 
             try:
-                inputs = self._processor(prompt, images=og_images, audios=og_audios)
+                # ORT-GenAI processors accept either a bare string or a list of
+                # strings depending on backend; benchmark_multimodal.py wraps in
+                # a list, which matches both the whisper and phi4mm paths.
+                inputs = self._processor([prompt], images=og_images, audios=og_audios)
             except Exception as e:  # pragma: no cover
                 del generator
                 return self._handle_error("ORT-GenAI multimodal processor failed.", e, "")
@@ -351,14 +396,23 @@ def _run_generation(
                     "ORT-GenAI generator input setup failed. The prompt may exceed max_length.", e, ""
                 )
 
+            # Whisper's BOS == EOS (token 50257 = <|startoftranscript|> = <|endoftext|>),
+            # so the very first generated token can collide with EOS. Ignore EOS (for any
+            # model type) until at least one non-EOS token has been emitted.
             decoded = ""
             stream = self._tokenizer.create_stream()
             steps = 0
+            generated_any = False
             while not generator.is_done() and steps < max_new_tokens:
                 generator.generate_next_token()
                 tok = int(generator.get_next_tokens()[0])
                 if tok in self._eos_token_ids:
-                    break
+                    if generated_any:
+                        break
+                    # First-step EOS collision with BOS; skip and keep generating.
+                    steps += 1
+                    continue
+                generated_any = True
                 decoded += stream.decode(tok)
                 if stop_strings:
                     for s in stop_strings:
@@ -441,6 +495,58 @@ def _score_continuation(self, prompt: str, continuation: str, images, audios) ->
         del generator
         return total_logprob, all_greedy
 
+    def _build_prompt_for_request(self, user_text: str, num_images: int, num_audios: int) -> str:
+        """Build the final prompt string fed to ``og.MultiModalProcessor``.
+
+        Default path: pre-render image/audio markers into the user content
+        string using ``image_token_format`` / ``audio_token_format``, then call
+        ``og.Tokenizer.apply_chat_template`` to add the model-specific chat
+        scaffolding (system/user/assistant turn markers).
+
+        Pure content-parts (``{"type": "image"}``) is what PR #2488 and the
+        olive-recipes Qwen2.5-VL eval scripts do, and it works for chat
+        templates that understand structured content (Qwen2.5-VL, Qwen3-VL,
+        Gemma-4). However, Phi-4-MM's chat template stringifies content lists
+        as Python repr (verified: produces
+        ``<|user|>[{'type': 'image'}, ...]<|end|>`` instead of injecting
+        ``<|image_1|>``). Pre-rendering the markers ourselves before
+        ``apply_chat_template`` works for both conventions, since templates
+        that just pass through user content render identically either way.
+
+        Fallback path: ``_build_prompt`` legacy format-string. Used when the
+        user has explicitly set ``prompt_template`` in the evaluator config
+        (to override per-benchmark) or when the underlying onnxruntime-genai
+        version predates ``apply_chat_template`` on ``og.Tokenizer``.
+        """
+        if self._model_type == "whisper":
+            # Whisper has no chat template; the "prompt" is just the decoder-start
+            # token sequence that conditions on language + task. user_text from
+            # lmms-eval tasks (e.g. "Please recognize the speech...") is ignored.
+            return _build_prompt(self._model_type, num_images, num_audios, user_text)
+
+        if self.prompt_template or not self._has_chat_template:
+            return _build_prompt(
+                self._model_type,
+                num_images,
+                num_audios,
+                user_text,
+                self.system_prompt,
+                self.prompt_template,
+                self.image_token_format,
+                self.audio_token_format,
+            )
+
+        image_markers = "".join(_format_media_tokens(num_images, self.image_token_format))
+        audio_markers = "".join(_format_media_tokens(num_audios, self.audio_token_format))
+        user_content = f"{image_markers}{audio_markers}{user_text}"
+
+        messages: list[dict[str, Any]] = []
+        if self.system_prompt:
+            messages.append({"role": "system", "content": self.system_prompt})
+        messages.append({"role": "user", "content": user_content})
+
+        return self._tokenizer.apply_chat_template(json.dumps(messages), add_generation_prompt=True)
+
     def _get_doc_and_visuals(self, doc_to_visual, doc_id, task, split):
         try:
             doc = self.task_dict[task][split][doc_id]
@@ -473,16 +579,7 @@ def generate_until(self, requests: list[Instance], disable_tqdm: bool = False) -
             if isinstance(stop, str):
                 stop = [stop]
 
-            prompt = _build_prompt(
-                self._model_type,
-                len(images),
-                len(audios),
-                contexts,
-                self.system_prompt,
-                self.prompt_template,
-                self.image_token_format,
-                self.audio_token_format,
-            )
+            prompt = self._build_prompt_for_request(contexts, len(images), len(audios))
             text = self._run_generation(prompt, images, audios, max_new, stop)
             results.append(text)
             self.cache_hook.add_partial("generate_until", (contexts, gen_kwargs), text)
@@ -499,16 +596,7 @@ def loglikelihood(self, requests: list[Instance], disable_tqdm: bool = False) ->
             images, audios = _partition_visuals(visuals)
             continuation = str(doc_to_target(doc))
 
-            prompt = _build_prompt(
-                self._model_type,
-                len(images),
-                len(audios),
-                contexts,
-                self.system_prompt,
-                self.prompt_template,
-                self.image_token_format,
-                self.audio_token_format,
-            )
+            prompt = self._build_prompt_for_request(contexts, len(images), len(audios))
             logprob, is_greedy = self._score_continuation(prompt, continuation, images, audios)
             results.append((logprob, is_greedy))
             self.cache_hook.add_partial("loglikelihood", (contexts, continuation), (logprob, is_greedy))
diff --git a/olive/evaluator/olive_evaluator.py b/olive/evaluator/olive_evaluator.py
index af73fabe64..ee7d63286e 100644
--- a/olive/evaluator/olive_evaluator.py
+++ b/olive/evaluator/olive_evaluator.py
@@ -2084,6 +2084,13 @@ def __init__(self, tasks: list[str], **kwargs):
         self.prompt_template = kwargs.get("prompt_template")
         self.image_token_format = kwargs.get("image_token_format", "<|image_{index}|>")
         self.audio_token_format = kwargs.get("audio_token_format", "<|audio_{index}|>")
+        # NOTE: ``prompt_template`` is a legacy format-string knob and should rarely be
+        # needed. By default the ortgenai_mm adapter calls ``og.Tokenizer.apply_chat_template``
+        # (same path used by PR #2488 and olive-recipes eval scripts), which reads the
+        # package's ``chat_template.jinja`` and produces the model's chat format
+        # automatically. ``image_token_format`` / ``audio_token_format`` are still used on
+        # that default path to pre-render media markers into the user message. Setting
+        # ``prompt_template`` forces the adapter into the legacy hand-templated path.
         # HF-only knobs (forwarded to lmms-eval's native wrapper if present).
         # ``trust_remote_code`` defaults to False to match the rest of Olive
         # (e.g. olive/common/hf/utils.py, olive/data/component/load_dataset.py)
diff --git a/olive/olive_config.json b/olive/olive_config.json
index 1978e61dcb..bac23f921c 100644
--- a/olive/olive_config.json
+++ b/olive/olive_config.json
@@ -8,6 +8,14 @@
             "supported_algorithms": [  ],
             "supported_quantization_encodings": [  ]
         },
+        "CompositeToOnnxPackage": {
+            "module_path": "olive.passes.onnx.composite_to_onnx_package.CompositeToOnnxPackage",
+            "supported_providers": [ "*" ],
+            "supported_accelerators": [ "*" ],
+            "supported_precisions": [ "*" ],
+            "supported_algorithms": [  ],
+            "supported_quantization_encodings": [  ]
+        },
         "AimetQuantization": {
             "module_path": "olive.passes.onnx.aimet_quantization.AimetQuantization",
             "supported_providers": [ "*" ],
diff --git a/olive/passes/onnx/composite_to_onnx_package.py b/olive/passes/onnx/composite_to_onnx_package.py
new file mode 100644
index 0000000000..5f0814164f
--- /dev/null
+++ b/olive/passes/onnx/composite_to_onnx_package.py
@@ -0,0 +1,309 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+"""Convert a multi-component CompositeModel ORT-GenAI package into a flat ONNX package."""
+
+from __future__ import annotations
+
+import json
+import logging
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+import onnx
+
+from olive.common.utils import hardlink_copy_file
+from olive.model import ONNXModelHandler
+from olive.model.handler.composite import CompositeModelHandler
+from olive.passes import Pass
+from olive.passes.pass_config import BasePassConfig, PassConfigParam
+
+if TYPE_CHECKING:
+    from olive.hardware.accelerator import AcceleratorSpec
+
+logger = logging.getLogger(__name__)
+
+
+class CompositeToOnnxPackage(Pass):
+    """Flatten a CompositeModel ORT-GenAI package into a single ONNXModel handler.
+
+    MobiusBuilder and similar passes emit multi-component ORT-GenAI packages as a
+    :class:`CompositeModelHandler` whose components live in subdirectories::
+
+        output_dir/
+          genai_config.json
+          tokenizer.json
+          decoder/model.onnx
+          vision_encoder/model.onnx
+          audio_encoder/model.onnx
+          embedding/model.onnx
+
+    Olive's evaluators (e.g. ``OnnxEvaluator._inference_vision_genai`` and the
+    ``LMMSEvaluator``) detect ORT-GenAI packages by looking for ``genai_config.json``
+    next to an ONNX file referenced by an :class:`ONNXModelHandler`. The nested
+    subdirectory layout above defeats that detection because the entry-point ONNX
+    file's parent (e.g. ``output_dir/decoder/``) does not contain
+    ``genai_config.json``.
+
+    This pass produces an equivalent flat layout::
+
+        output_dir/
+          genai_config.json
+          tokenizer.json
+          decoder.onnx
+          vision_encoder.onnx
+          audio_encoder.onnx
+          embedding.onnx
+
+    by placing each component at the package root (hardlinked if self-contained,
+    otherwise re-saved with a renamed ``<name>.onnx.data`` sidecar) and rewriting
+    ``genai_config.json`` to use the flat filenames. The returned
+    :class:`ONNXModelHandler` points at the entry-point component (default ``decoder``),
+    so evaluators can detect ``Path(model.model_path).parent / "genai_config.json"``.
+    """
+
+    _accepts_composite_model = True
+
+    @classmethod
+    def _default_config(cls, accelerator_spec: AcceleratorSpec) -> dict[str, PassConfigParam]:
+        return {
+            "entry_point_component": PassConfigParam(
+                type_=str,
+                default_value="decoder",
+                description=(
+                    "Name of the genai_config 'model' subsection (e.g. 'decoder', 'text') "
+                    "whose ONNX file the returned ONNXModelHandler will point at. If the "
+                    "name is not found, falls back to the first component with a 'filename' field."
+                ),
+            ),
+        }
+
+    @classmethod
+    def is_accelerator_agnostic(cls, accelerator_spec: AcceleratorSpec) -> bool:
+        # Pure file-system / config rewrite — no EP-specific behavior.
+        return True
+
+    def _run_for_config(
+        self,
+        model: CompositeModelHandler,
+        config: type[BasePassConfig],
+        output_model_path: str,
+    ) -> ONNXModelHandler:
+        if not isinstance(model, CompositeModelHandler):
+            raise ValueError(
+                f"CompositeToOnnxPackage expects a CompositeModelHandler input, got {type(model).__name__}."
+            )
+
+        src_dir = Path(model.model_path).resolve()
+        if not src_dir.is_dir():
+            raise ValueError(f"CompositeModel model_path is not a directory: {src_dir}")
+
+        src_genai_config = src_dir / "genai_config.json"
+        if not src_genai_config.is_file():
+            raise ValueError(
+                f"CompositeToOnnxPackage requires genai_config.json at the package root: {src_genai_config} not found."
+            )
+
+        dst_dir = self._resolve_output_dir(output_model_path)
+        dst_dir.mkdir(parents=True, exist_ok=True)
+
+        genai_config = json.loads(src_genai_config.read_text(encoding="utf-8"))
+        model_section = genai_config.get("model")
+        if not isinstance(model_section, dict):
+            raise ValueError(f"Invalid genai_config.json at {src_genai_config}: missing 'model' section.")
+
+        rewrite_map = self._build_rewrite_map(model_section)
+        if not rewrite_map:
+            raise ValueError(
+                f"No component subsections with 'filename' found in genai_config.json at {src_genai_config}."
+            )
+
+        # Copy each component ONNX into the flat layout, rewriting external-data
+        # references so each initializer points at the renamed sidecar.
+        #
+        # We can't just hardlink the .onnx file and its .data sidecar to the new
+        # names, because each ONNX file embeds the external-data filename
+        # ("location" entry in the proto). After renaming, those embedded
+        # pointers still reference the old name (e.g. "model.onnx.data") and
+        # ONNX Runtime fails at load with "External data path does not exist".
+        # ``onnx.save_model(..., save_as_external_data=True, location=...)``
+        # serializes a new ONNX file whose embedded location matches the new
+        # filename, and writes the corresponding .data file alongside.
+        for old_rel, new_name in rewrite_map.items():
+            src_file = self._resolve_component_source(src_dir, old_rel)
+            if src_file is None:
+                raise ValueError(f"Component file referenced by genai_config not found: {src_dir / old_rel}")
+
+            src_data = self._resolve_component_data(src_file)
+            dst_file = dst_dir / new_name
+            dst_data_name = f"{new_name}.data"
+            dst_data_file = dst_dir / dst_data_name
+
+            if src_data is not None:
+                # Load model and resolve external initializer tensors so we can
+                # re-serialize them under the new filename. ``load_external_data
+                # =True`` (the default) materializes initializer bytes into the
+                # in-memory proto via the source directory layout, after which
+                # we can write them back out with a new ``location``.
+                onnx_model = onnx.load(str(src_file), load_external_data=True)
+                # Remove any pre-existing destination files to avoid onnx
+                # appending to a stale .data sidecar on rerun.
+                if dst_file.exists():
+                    dst_file.unlink()
+                if dst_data_file.exists():
+                    dst_data_file.unlink()
+                onnx.save_model(
+                    onnx_model,
+                    str(dst_file),
+                    save_as_external_data=True,
+                    all_tensors_to_one_file=True,
+                    location=dst_data_name,
+                )
+            else:
+                # No external data sidecar — model is self-contained, plain copy.
+                hardlink_copy_file(src_file, dst_file)
+
+        # Copy every top-level shared sidecar (tokenizer, processor configs, chat template, etc.).
+        # genai_config.json is rewritten below, so skip it here.
+        for src_file in src_dir.iterdir():
+            if not src_file.is_file() or src_file.name == "genai_config.json":
+                continue
+            dst_file = dst_dir / src_file.name
+            if not dst_file.exists():
+                hardlink_copy_file(src_file, dst_file)
+
+        # Update filename references and write the rewritten config.
+        for component_cfg in model_section.values():
+            if isinstance(component_cfg, dict):
+                old_name = component_cfg.get("filename")
+                if isinstance(old_name, str) and old_name in rewrite_map:
+                    component_cfg["filename"] = rewrite_map[old_name]
+
+        (dst_dir / "genai_config.json").write_text(
+            json.dumps(genai_config, indent=2),
+            encoding="utf-8",
+        )
+
+        entry_filename = self._select_entry_filename(model_section, config.entry_point_component)
+        if entry_filename is None:
+            raise ValueError(
+                "Failed to determine an entry-point component for CompositeToOnnxPackage. "
+                f"Requested '{config.entry_point_component}', no component matched and no fallback available."
+            )
+
+        logger.info(
+            "CompositeToOnnxPackage: flattened %d components into '%s' (entry_point=%s)",
+            len(rewrite_map),
+            dst_dir,
+            entry_filename,
+        )
+
+        return ONNXModelHandler(
+            model_path=str(dst_dir),
+            onnx_file_name=entry_filename,
+            model_attributes={
+                "ort_genai_package": True,
+                "entry_point_component": config.entry_point_component,
+                "flattened_from_composite": True,
+                **(model.model_attributes or {}),
+            },
+        )
+
+    @staticmethod
+    def _resolve_output_dir(output_model_path: str) -> Path:
+        """Olive sometimes passes a `.onnx` file path; in that case use its stem as the directory."""
+        output_path = Path(output_model_path)
+        if output_path.suffix == ".onnx":
+            return output_path.parent / output_path.stem
+        return output_path
+
+    @staticmethod
+    def _resolve_component_source(src_dir: Path, old_rel: str) -> Path | None:
+        """Resolve the on-disk source file for a component referenced by genai_config.
+
+        Some upstream Olive passes (notably ``OnnxKQuantQuantization`` when given a
+        component already named ``decoder.onnx``) save the quantized model with
+        the ``.onnx`` extension stripped — producing ``decoder``/``encoder`` files
+        next to ``decoder.data``/``encoder.data`` while ``genai_config.json`` still
+        references the original ``decoder.onnx``/``encoder.onnx``. Accept the
+        extensionless variant so we can still flatten such packages without
+        requiring an upstream fix.
+        """
+        candidate = src_dir / old_rel
+        if candidate.is_file():
+            return candidate
+        stripped = src_dir / Path(old_rel).stem
+        if stripped.is_file():
+            return stripped
+        return None
+
+    @staticmethod
+    def _resolve_component_data(src_file: Path) -> Path | None:
+        """Resolve the external-data sidecar for a component source file.
+
+        Tries ``<src>.data`` first (matches ONNX's default ``<filename>.data``
+        sidecar). For extensionless source files emitted by buggy upstream
+        passes, also accepts ``<src_stem>.data`` (e.g. ``decoder`` + ``decoder.data``).
+        """
+        primary = src_file.with_name(src_file.name + ".data")
+        if primary.is_file():
+            return primary
+        if src_file.suffix == "":
+            alt = src_file.with_name(src_file.stem + ".data")
+            if alt.is_file():
+                return alt
+        return None
+
+    @staticmethod
+    def _build_rewrite_map(model_section: dict) -> dict[str, str]:
+        """Map each old relative filename to a unique flat root-level filename.
+
+        Uses the immediate parent directory name when the component lives in a
+        subdirectory (``decoder/model.onnx`` -> ``decoder.onnx``). Falls back to
+        the genai_config key when the file is already flat. If the resulting name
+        is already taken, appends ``_<n>`` (smallest free n >= 1) to make it unique.
+        """
+        used_names: set[str] = set()
+        rewrite_map: dict[str, str] = {}
+
+        for component_key, component_cfg in model_section.items():
+            if not isinstance(component_cfg, dict):
+                continue
+            old_path = component_cfg.get("filename")
+            if not isinstance(old_path, str) or not old_path:
+                continue
+            if old_path in rewrite_map:
+                continue
+
+            old_path_obj = Path(old_path)
+            parent_name = old_path_obj.parent.name
+            candidate_base = parent_name or component_key
+            candidate = f"{candidate_base}.onnx"
+
+            if candidate in used_names:
+                suffix = 1
+                while f"{candidate_base}_{suffix}.onnx" in used_names:
+                    suffix += 1
+                candidate = f"{candidate_base}_{suffix}.onnx"
+
+            used_names.add(candidate)
+            rewrite_map[old_path] = candidate
+
+        return rewrite_map
+
+    @staticmethod
+    def _select_entry_filename(model_section: dict, entry_point_component: str) -> str | None:
+        """Pick the flat filename for the entry-point component, falling back if missing."""
+        preferred = model_section.get(entry_point_component)
+        if isinstance(preferred, dict):
+            filename = preferred.get("filename")
+            if isinstance(filename, str):
+                return filename
+
+        for component_cfg in model_section.values():
+            if isinstance(component_cfg, dict):
+                filename = component_cfg.get("filename")
+                if isinstance(filename, str):
+                    return filename
+        return None
diff --git a/test/evaluator/test_lmms_ort.py b/test/evaluator/test_lmms_ort.py
index 408be890ad..595b158224 100644
--- a/test/evaluator/test_lmms_ort.py
+++ b/test/evaluator/test_lmms_ort.py
@@ -62,6 +62,90 @@ def test_normalize_execution_provider(execution_provider, expected):
     assert _normalize_execution_provider(execution_provider) == expected
 
 
+def _make_evaluator_for_prompt_tests():
+    """Construct an LMMSORTGenAIEvaluator with __init__ skipped.
+
+    Lets us unit-test the prompt-building path without needing a real ORT-GenAI
+    model on disk.
+    """
+    inst = LMMSORTGenAIEvaluator.__new__(LMMSORTGenAIEvaluator)
+    inst._tokenizer = MagicMock(name="og.Tokenizer")
+    inst._model_type = "test_model"
+    inst.system_prompt = "You are helpful."
+    inst.prompt_template = None
+    inst.image_token_format = "<|image_{index}|>"
+    inst.audio_token_format = "<|audio_{index}|>"
+    inst._has_chat_template = True
+    return inst
+
+
+def test_build_prompt_for_request_uses_apply_chat_template_by_default():
+    inst = _make_evaluator_for_prompt_tests()
+    inst._tokenizer.apply_chat_template.return_value = "<rendered chat prompt>"
+
+    out = inst._build_prompt_for_request("What is in the image?", num_images=1, num_audios=0)
+
+    assert out == "<rendered chat prompt>"
+    inst._tokenizer.apply_chat_template.assert_called_once()
+    messages_json_arg = inst._tokenizer.apply_chat_template.call_args.args[0]
+    assert inst._tokenizer.apply_chat_template.call_args.kwargs.get("add_generation_prompt") is True
+
+    import json as _json
+
+    messages = _json.loads(messages_json_arg)
+    assert messages[0] == {"role": "system", "content": "You are helpful."}
+    assert messages[1] == {"role": "user", "content": "<|image_1|>What is in the image?"}
+
+
+def test_build_prompt_for_request_skips_system_when_empty():
+    inst = _make_evaluator_for_prompt_tests()
+    inst.system_prompt = ""
+    inst._tokenizer.apply_chat_template.return_value = "out"
+
+    inst._build_prompt_for_request("Q", num_images=0, num_audios=0)
+
+    import json as _json
+
+    messages = _json.loads(inst._tokenizer.apply_chat_template.call_args.args[0])
+    assert all(m["role"] != "system" for m in messages)
+    assert messages[-1] == {"role": "user", "content": "Q"}
+
+
+def test_build_prompt_for_request_includes_audio_markers():
+    inst = _make_evaluator_for_prompt_tests()
+    inst.system_prompt = ""
+    inst._tokenizer.apply_chat_template.return_value = "out"
+
+    inst._build_prompt_for_request("Q", num_images=2, num_audios=1)
+
+    import json as _json
+
+    messages = _json.loads(inst._tokenizer.apply_chat_template.call_args.args[0])
+    assert messages[0]["content"] == "<|image_1|><|image_2|><|audio_1|>Q"
+
+
+def test_build_prompt_for_request_falls_back_to_legacy_when_prompt_template_set():
+    inst = _make_evaluator_for_prompt_tests()
+    inst.prompt_template = "{system_prompt}|{user_content}"
+
+    out = inst._build_prompt_for_request("Q", num_images=1, num_audios=0)
+
+    assert out == "You are helpful.|<|image_1|>Q"
+    inst._tokenizer.apply_chat_template.assert_not_called()
+
+
+def test_build_prompt_for_request_falls_back_when_chat_template_unavailable():
+    inst = _make_evaluator_for_prompt_tests()
+    inst._has_chat_template = False  # simulate older onnxruntime-genai
+
+    out = inst._build_prompt_for_request("Q", num_images=1, num_audios=0)
+
+    # Default legacy template wraps with Phi-4-MM-style tokens.
+    assert "<|image_1|>" in out
+    assert "Q" in out
+    inst._tokenizer.apply_chat_template.assert_not_called()
+
+
 def test_lmms_evaluator_converts_lmms_results(tmp_path):
     model_dir = tmp_path / "model"
     model_dir.mkdir()
diff --git a/test/passes/onnx/test_composite_to_onnx_package.py b/test/passes/onnx/test_composite_to_onnx_package.py
new file mode 100644
index 0000000000..8a6ea02b96
--- /dev/null
+++ b/test/passes/onnx/test_composite_to_onnx_package.py
@@ -0,0 +1,325 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+"""Tests for the CompositeToOnnxPackage pass."""
+
+import json
+from pathlib import Path
+
+import numpy as np
+import onnx
+import pytest
+from onnx import TensorProto, helper, numpy_helper
+
+from olive.model import ONNXModelHandler
+from olive.model.handler.composite import CompositeModelHandler
+from olive.passes.olive_pass import create_pass_from_dict
+from olive.passes.onnx.composite_to_onnx_package import CompositeToOnnxPackage
+
+
+def _write_tiny_onnx_with_external_data(onnx_path: Path, data_filename: str = "model.onnx.data") -> None:
+    """Write a minimal valid ONNX model whose single initializer lives in an external data sidecar.
+
+    The initializer is sized above onnx's default external-data size threshold
+    (1024 bytes) so the .data sidecar actually gets written. The model itself
+    stays tiny (one Identity node) so the test fixture remains cheap.
+    """
+    # 1024 floats = 4096 bytes, well above the default 1024-byte threshold for
+    # promoting an initializer to external storage.
+    data = np.arange(1024, dtype=np.float32)
+    init_tensor = numpy_helper.from_array(data, name="weight")
+    output = helper.make_tensor_value_info("y", TensorProto.FLOAT, [1024])
+    node = helper.make_node("Identity", inputs=["weight"], outputs=["y"])
+    graph = helper.make_graph([node], "g", inputs=[], outputs=[output], initializer=[init_tensor])
+    model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 17)])
+
+    onnx_path.parent.mkdir(parents=True, exist_ok=True)
+    onnx.save_model(
+        model,
+        str(onnx_path),
+        save_as_external_data=True,
+        all_tensors_to_one_file=True,
+        location=data_filename,
+    )
+
+
+def _write_tiny_inline_onnx(onnx_path: Path) -> None:
+    """Write a minimal self-contained (no external data) ONNX model."""
+    init_tensor = numpy_helper.from_array(np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float32), name="weight")
+    output = helper.make_tensor_value_info("y", TensorProto.FLOAT, [4])
+    node = helper.make_node("Identity", inputs=["weight"], outputs=["y"])
+    graph = helper.make_graph([node], "g", inputs=[], outputs=[output], initializer=[init_tensor])
+    model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 17)])
+    onnx_path.parent.mkdir(parents=True, exist_ok=True)
+    onnx.save_model(model, str(onnx_path))
+
+
+def _make_nested_genai_package(
+    root: Path,
+    components: dict[str, str],
+    *,
+    with_external_data: bool = True,
+) -> Path:
+    """Build a fake nested ORT-GenAI package at ``root``.
+
+    ``components`` maps genai_config component keys (e.g. ``decoder``) to the
+    relative ONNX filename under ``root`` (e.g. ``decoder/model.onnx``). Each
+    component file is a real (tiny) ONNX model so the pass exercises real
+    external-data rewriting rather than file rename only.
+    """
+    root.mkdir(parents=True, exist_ok=True)
+
+    model_section: dict[str, dict[str, str]] = {}
+    for key, rel_path in components.items():
+        component_file = root / rel_path
+        if with_external_data:
+            _write_tiny_onnx_with_external_data(component_file)
+        else:
+            _write_tiny_inline_onnx(component_file)
+        model_section[key] = {"filename": rel_path}
+
+    # Shared root-level sidecars.
+    (root / "tokenizer.json").write_text("{}", encoding="utf-8")
+    (root / "chat_template.jinja").write_text("template", encoding="utf-8")
+
+    (root / "genai_config.json").write_text(
+        json.dumps({"model": {"type": "gemma4", **model_section}}, indent=2),
+        encoding="utf-8",
+    )
+    return root
+
+
+def _make_composite_handler(root: Path, components: dict[str, str]) -> CompositeModelHandler:
+    component_handlers = [ONNXModelHandler(model_path=str(root / rel_path)) for rel_path in components.values()]
+    return CompositeModelHandler(
+        model_components=component_handlers,
+        model_component_names=list(components.keys()),
+        model_path=str(root),
+    )
+
+
+class TestCompositeToOnnxPackage:
+    """Run CompositeToOnnxPackage on tiny on-disk packages and inspect the flattened output directory."""
+
+    def test_flattens_nested_package_to_root_level_filenames(self, tmp_path):
+        src_root = _make_nested_genai_package(
+            tmp_path / "src",
+            {
+                "decoder": "decoder/model.onnx",
+                "vision": "vision_encoder/model.onnx",
+                "audio": "audio_encoder/model.onnx",
+                "embedding": "embedding/model.onnx",
+            },
+        )
+        composite = _make_composite_handler(
+            src_root,
+            {
+                "decoder": "decoder/model.onnx",
+                "vision_encoder": "vision_encoder/model.onnx",
+                "audio_encoder": "audio_encoder/model.onnx",
+                "embedding": "embedding/model.onnx",
+            },
+        )
+
+        p = create_pass_from_dict(CompositeToOnnxPackage, {}, disable_search=True)
+        out = p.run(composite, str(tmp_path / "out"))
+
+        out_dir = Path(out.model_path).parent
+        assert (out_dir / "decoder.onnx").is_file()
+        assert (out_dir / "vision_encoder.onnx").is_file()
+        assert (out_dir / "audio_encoder.onnx").is_file()
+        assert (out_dir / "embedding.onnx").is_file()
+        assert (out_dir / "genai_config.json").is_file()
+        assert (out_dir / "tokenizer.json").is_file()
+        assert (out_dir / "chat_template.jinja").is_file()
+
+    def test_rewrites_genai_config_filenames(self, tmp_path):
+        src_root = _make_nested_genai_package(
+            tmp_path / "src",
+            {"decoder": "decoder/model.onnx", "vision": "vision_encoder/model.onnx"},
+        )
+        composite = _make_composite_handler(
+            src_root,
+            {
+                "decoder": "decoder/model.onnx",
+                "vision_encoder": "vision_encoder/model.onnx",
+            },
+        )
+
+        p = create_pass_from_dict(CompositeToOnnxPackage, {}, disable_search=True)
+        out = p.run(composite, str(tmp_path / "out"))
+
+        rewritten = json.loads((Path(out.model_path).parent / "genai_config.json").read_text(encoding="utf-8"))
+        assert rewritten["model"]["decoder"]["filename"] == "decoder.onnx"
+        assert rewritten["model"]["vision"]["filename"] == "vision_encoder.onnx"
+
+    def test_returns_onnx_handler_with_entry_point_next_to_genai_config(self, tmp_path):
+        src_root = _make_nested_genai_package(
+            tmp_path / "src",
+            {"decoder": "decoder/model.onnx", "vision": "vision_encoder/model.onnx"},
+        )
+        composite = _make_composite_handler(
+            src_root,
+            {
+                "decoder": "decoder/model.onnx",
+                "vision_encoder": "vision_encoder/model.onnx",
+            },
+        )
+
+        p = create_pass_from_dict(CompositeToOnnxPackage, {}, disable_search=True)
+        out = p.run(composite, str(tmp_path / "out"))
+
+        assert isinstance(out, ONNXModelHandler)
+        # Evaluator-style auto-detection: parent of model_path must contain genai_config.json
+        parent = Path(out.model_path).parent
+        assert (parent / "genai_config.json").is_file()
+        assert Path(out.model_path).name == "decoder.onnx"
+
+    def test_rewrites_external_data_location_to_new_filename(self, tmp_path):
+        """External-data references inside each component ONNX must point at the renamed sidecar.
+
+        Regression test for a real-world failure: hardlinking a .onnx file +
+        its .data sidecar to new names left the embedded "location" pointer
+        inside the proto pointing at the old name (e.g. "model.onnx.data"),
+        causing ONNX Runtime to fail at load with "External data path does not
+        exist". The pass must rewrite each component model's external-data
+        location to match its new flat filename, and produce a real .data file
+        with the new name alongside it.
+        """
+        src_root = _make_nested_genai_package(tmp_path / "src", {"decoder": "decoder/model.onnx"})
+        composite = _make_composite_handler(src_root, {"decoder": "decoder/model.onnx"})
+
+        p = create_pass_from_dict(CompositeToOnnxPackage, {}, disable_search=True)
+        out = p.run(composite, str(tmp_path / "out"))
+
+        out_dir = Path(out.model_path).parent
+        # Both the flat ONNX file and the matching renamed sidecar must exist.
+        assert (out_dir / "decoder.onnx").is_file()
+        assert (out_dir / "decoder.onnx.data").is_file()
+
+        # The location embedded in the rewritten ONNX file must name the new sidecar, not the
+        # source layout's "model.onnx.data". Load with load_external_data=False so the
+        # initializer keeps its ``external_data`` pointer rather than getting the bytes
+        # inlined as ``raw_data``.
+        proto_only = onnx.load(str(out_dir / "decoder.onnx"), load_external_data=False)
+        weight = next(t for t in proto_only.graph.initializer if t.name == "weight")
+        location_entries = [entry.value for entry in weight.external_data if entry.key == "location"]
+        assert location_entries == ["decoder.onnx.data"], (
+            f"expected location='decoder.onnx.data', got external_data={list(weight.external_data)}"
+        )
+
+        # The bytes must also load through the new pointer; this catches a sidecar that was
+        # written under the right name but corrupted, or vice versa.
+        materialized = onnx.load(str(out_dir / "decoder.onnx"), load_external_data=True)
+        loaded_weight = next(t for t in materialized.graph.initializer if t.name == "weight")
+        loaded_array = numpy_helper.to_array(loaded_weight)
+        assert loaded_array.shape == (1024,)
+        assert loaded_array[0] == 0.0
+        assert loaded_array[-1] == 1023.0
+
+    def test_handles_inline_onnx_without_external_data(self, tmp_path):
+        """Self-contained ONNX models (no .data sidecar) should still flatten correctly."""
+        src_root = _make_nested_genai_package(
+            tmp_path / "src",
+            {"decoder": "decoder/model.onnx"},
+            with_external_data=False,
+        )
+        composite = _make_composite_handler(src_root, {"decoder": "decoder/model.onnx"})
+
+        p = create_pass_from_dict(CompositeToOnnxPackage, {}, disable_search=True)
+        out = p.run(composite, str(tmp_path / "out"))
+
+        out_dir = Path(out.model_path).parent
+        assert (out_dir / "decoder.onnx").is_file()
+        # No external-data sidecar should be present since the source had none.
+        assert not (out_dir / "decoder.onnx.data").exists()
+
+    def test_uses_fallback_entry_point_when_requested_one_missing(self, tmp_path):
+        src_root = _make_nested_genai_package(
+            tmp_path / "src",
+            {"vision": "vision_encoder/model.onnx", "embedding": "embedding/model.onnx"},
+        )
+        composite = _make_composite_handler(
+            src_root,
+            {
+                "vision_encoder": "vision_encoder/model.onnx",
+                "embedding": "embedding/model.onnx",
+            },
+        )
+
+        # The default entry_point_component ("decoder") is absent here, so the pass must fall back.
+        p = create_pass_from_dict(CompositeToOnnxPackage, {}, disable_search=True)
+        out = p.run(composite, str(tmp_path / "out"))
+
+        assert Path(out.model_path).name in {"vision_encoder.onnx", "embedding.onnx"}
+
+    def test_honors_explicit_entry_point_component(self, tmp_path):
+        src_root = _make_nested_genai_package(
+            tmp_path / "src",
+            {"decoder": "decoder/model.onnx", "embedding": "embedding/model.onnx"},
+        )
+        composite = _make_composite_handler(
+            src_root,
+            {
+                "decoder": "decoder/model.onnx",
+                "embedding": "embedding/model.onnx",
+            },
+        )
+
+        p = create_pass_from_dict(
+            CompositeToOnnxPackage,
+            {"entry_point_component": "embedding"},
+            disable_search=True,
+        )
+        out = p.run(composite, str(tmp_path / "out"))
+
+        assert Path(out.model_path).name == "embedding.onnx"
+
+    def test_rejects_package_without_genai_config(self, tmp_path):
+        src_root = tmp_path / "src"
+        src_root.mkdir()
+        _write_tiny_inline_onnx(src_root / "decoder" / "model.onnx")
+        composite = _make_composite_handler(src_root, {"decoder": "decoder/model.onnx"})
+
+        p = create_pass_from_dict(CompositeToOnnxPackage, {}, disable_search=True)
+        with pytest.raises(ValueError, match=r"genai_config\.json"):
+            p.run(composite, str(tmp_path / "out"))
+
+    def test_handles_unique_collision_in_subdir_names(self, tmp_path):
+        # Both components use the basename model.onnx; their flat names (checked below) must stay distinct.
+        src_root = tmp_path / "src"
+        src_root.mkdir()
+        _write_tiny_inline_onnx(src_root / "model_a" / "model.onnx")
+        _write_tiny_inline_onnx(src_root / "model_b" / "model.onnx")
+        (src_root / "genai_config.json").write_text(
+            json.dumps(
+                {
+                    "model": {
+                        "first": {"filename": "model_a/model.onnx"},
+                        "second": {"filename": "model_b/model.onnx"},
+                    }
+                }
+            ),
+            encoding="utf-8",
+        )
+
+        composite = CompositeModelHandler(
+            model_components=[
+                ONNXModelHandler(model_path=str(src_root / "model_a" / "model.onnx")),
+                ONNXModelHandler(model_path=str(src_root / "model_b" / "model.onnx")),
+            ],
+            model_component_names=["first", "second"],
+            model_path=str(src_root),
+        )
+
+        p = create_pass_from_dict(
+            CompositeToOnnxPackage,
+            {"entry_point_component": "first"},
+            disable_search=True,
+        )
+        out = p.run(composite, str(tmp_path / "out"))
+
+        out_dir = Path(out.model_path).parent
+        assert (out_dir / "model_a.onnx").is_file()
+        assert (out_dir / "model_b.onnx").is_file()

From c958c04ad97fc2d8cc1ea0cbfd536775ad088ff3 Mon Sep 17 00:00:00 2001
From: Delwin Kim <139003345+DelwinKim@users.noreply.github.com>
Date: Tue, 16 Jun 2026 08:40:38 +0000
Subject: [PATCH 3/6] mobiusbuilder: add EP override; lmms_ort: AudioDecoder +
 per-model processor args

- MobiusBuilder: add `mobius_ep_override` config knob. Lets a workflow force
  the mobius execution_provider (e.g. "default") independent of the Olive
  accelerator EP. Needed because mobius's cuda-EP attention fusions
  (PackedMultiHeadAttention for Qwen2.5-VL vision, GQA for Gemma-4 decoder)
  produce graphs the ORT-GenAI fused-attention kernels reject. "default" EP
  skips those fusions; the resulting INT4 graph is numerically equivalent.
- lmms_ort: support torchcodec.AudioDecoder visuals (HF datasets 5.x audio
  feature) in _normalize_audio via duck-typed get_all_samples().
- lmms_ort: branch processor-arg shape on model type - Phi-4-MM needs a bare
  string, Whisper needs [prompt]. Passing a list to Phi-4-MM raised
  "Number of image tokens does not match the number of images".

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 olive/evaluator/lmms_ort.py               | 31 ++++++++++++++++++++---
 olive/passes/onnx/mobius_model_builder.py | 30 ++++++++++++++++++++++
 2 files changed, 57 insertions(+), 4 deletions(-)

diff --git a/olive/evaluator/lmms_ort.py b/olive/evaluator/lmms_ort.py
index 635f20cf70..b4ef57f473 100644
--- a/olive/evaluator/lmms_ort.py
+++ b/olive/evaluator/lmms_ort.py
@@ -111,6 +111,22 @@ def _normalize_audio(visual) -> tuple[np.ndarray, int] | None:
             return _load_audio_file(Path(visual["path"]))
     if isinstance(visual, (str, Path)):
         return _load_audio_file(Path(visual))
+    # torchcodec.decoders.AudioDecoder — HF datasets 5.x returns this for the
+    # "audio" feature instead of the legacy {"array", "sampling_rate"} dict.
+    # Detect by duck-typing the get_all_samples() method to avoid a hard
+    # torchcodec import (it's an optional install).
+    if hasattr(visual, "get_all_samples"):
+        try:
+            samples = visual.get_all_samples()
+            # samples.data is a torch.Tensor of shape [channels, num_samples].
+            # ORT-GenAI's processor wants mono float32; downmix if multichannel.
+            arr = samples.data.detach().cpu().numpy().astype(np.float32)
+            if arr.ndim == 2:
+                arr = arr.mean(axis=0)
+            return arr, int(samples.sample_rate)
+        except Exception as e:  # pragma: no cover
+            logger.warning("Failed to decode AudioDecoder visual: %s", e)
+            return None
     return None
 
 
@@ -380,10 +396,17 @@ def _run_generation(
             og_audios = self._build_og_audios(audios, tmp_dir)
 
             try:
-                # ORT-GenAI processors accept either a bare string or a list of
-                # strings depending on backend; benchmark_multimodal.py wraps in
-                # a list, which matches both the whisper and phi4mm paths.
-                inputs = self._processor([prompt], images=og_images, audios=og_audios)
+                # ORT-GenAI multimodal processors disagree on argument shape:
+                #   - Phi-4-MM expects a bare string. Passing [prompt] raises
+                #     "Number of image tokens does not match the number of images"
+                #     because the processor interprets the list as one prompt per
+                #     image (verified against pre-built phi4mm INT4 package).
+                #   - Whisper's processor (per ORT-GenAI's reference
+                #     benchmark_multimodal.py) is exercised with a list of
+                #     prompts.
+                # Branch on model type rather than guess.
+                processor_input = [prompt] if self._model_type == "whisper" else prompt
+                inputs = self._processor(processor_input, images=og_images, audios=og_audios)
             except Exception as e:  # pragma: no cover
                 del generator
                 return self._handle_error("ORT-GenAI multimodal processor failed.", e, "")
diff --git a/olive/passes/onnx/mobius_model_builder.py b/olive/passes/onnx/mobius_model_builder.py
index e2d10fc0c4..df11cc4ca4 100644
--- a/olive/passes/onnx/mobius_model_builder.py
+++ b/olive/passes/onnx/mobius_model_builder.py
@@ -91,6 +91,25 @@ def _default_config(cls, accelerator_spec: AcceleratorSpec) -> dict[str, PassCon
                     "quantization pass (e.g. OnnxMatMulNBits) after this pass."
                 ),
             ),
+            "mobius_ep_override": PassConfigParam(
+                type_=str,
+                required=False,
+                default_value=None,
+                description=(
+                    "Override the mobius execution_provider regardless of the "
+                    "Olive accelerator EP. Useful as a workaround when mobius's "
+                    "EP-specific attention fusions produce graphs that the "
+                    "ORT-GenAI fused-attention kernels reject at runtime. For "
+                    "example, the mobius cuda EP fuses PackedMultiHeadAttention "
+                    "in the Qwen2.5-VL vision encoder (whose internally-computed "
+                    "cumulative_sequence_length does not satisfy the kernel's "
+                    "expected shape) and GroupQueryAttention in the Gemma-4 "
+                    "decoder (where the bidirectional vision-block mask makes GQA "
+                    "invalid). Set to 'default' to skip all EP-specific fusions. "
+                    "The resulting INT4 graph is numerically equivalent; only "
+                    "perf-related fusions are dropped."
+                ),
+            ),
         }
 
     def _run_for_config(
@@ -120,6 +139,17 @@ def _run_for_config(
                 self.accelerator_spec.accelerator_type,
             )
 
+        # Honor the explicit override. Logged at WARNING so the workaround is
+        # visible in run output.
+        if config.mobius_ep_override is not None:
+            logger.warning(
+                "MobiusBuilder: mobius_ep_override set; overriding mobius EP "
+                "from '%s' to '%s' (auto-derived EP discarded).",
+                ep_str,
+                config.mobius_ep_override,
+            )
+            ep_str = config.mobius_ep_override
+
         dtype_str: str = _PRECISION_TO_DTYPE.get(config.precision, "f32")
         model_id: str = model.model_name_or_path
 

From ff38a07da735e128430b75c1da66f398ec1b8efe Mon Sep 17 00:00:00 2001
From: Delwin Kim <139003345+DelwinKim@users.noreply.github.com>
Date: Thu, 18 Jun 2026 19:20:01 +0000
Subject: [PATCH 4/6] remove mobius_ep_override (obsolete)

---
 olive/passes/onnx/mobius_model_builder.py | 30 -----------------------
 1 file changed, 30 deletions(-)

diff --git a/olive/passes/onnx/mobius_model_builder.py b/olive/passes/onnx/mobius_model_builder.py
index df11cc4ca4..e2d10fc0c4 100644
--- a/olive/passes/onnx/mobius_model_builder.py
+++ b/olive/passes/onnx/mobius_model_builder.py
@@ -91,25 +91,6 @@ def _default_config(cls, accelerator_spec: AcceleratorSpec) -> dict[str, PassCon
                     "quantization pass (e.g. OnnxMatMulNBits) after this pass."
                 ),
             ),
-            "mobius_ep_override": PassConfigParam(
-                type_=str,
-                required=False,
-                default_value=None,
-                description=(
-                    "Override the mobius execution_provider regardless of the "
-                    "Olive accelerator EP. Useful as a workaround when mobius's "
-                    "EP-specific attention fusions produce graphs that the "
-                    "ORT-GenAI fused-attention kernels reject at runtime. For "
-                    "example, the mobius cuda EP fuses PackedMultiHeadAttention "
-                    "in the Qwen2.5-VL vision encoder (whose internally-computed "
-                    "cumulative_sequence_length does not satisfy the kernel's "
-                    "expected shape) and GroupQueryAttention in the Gemma-4 "
-                    "decoder (where the bidirectional vision-block mask makes GQA "
-                    "invalid). Set to 'default' to skip all EP-specific fusions. "
-                    "The resulting INT4 graph is numerically equivalent; only "
-                    "perf-related fusions are dropped."
-                ),
-            ),
         }
 
     def _run_for_config(
@@ -139,17 +120,6 @@ def _run_for_config(
                 self.accelerator_spec.accelerator_type,
             )
 
-        # Honor the explicit override. Logged at WARNING so the workaround is
-        # visible in run output.
-        if config.mobius_ep_override is not None:
-            logger.warning(
-                "MobiusBuilder: mobius_ep_override set; overriding mobius EP "
-                "from '%s' to '%s' (auto-derived EP discarded).",
-                ep_str,
-                config.mobius_ep_override,
-            )
-            ep_str = config.mobius_ep_override
-
         dtype_str: str = _PRECISION_TO_DTYPE.get(config.precision, "f32")
         model_id: str = model.model_name_or_path
 

From c13a68340aa1d1983888d314769648d98dfb36df Mon Sep 17 00:00:00 2001
From: Delwin Kim <139003345+DelwinKim@users.noreply.github.com>
Date: Fri, 19 Jun 2026 17:59:19 +0000
Subject: [PATCH 5/6] evaluator: drop lmms-eval entry point to minimize
 packaging surface

Remove the setup.py lmms_eval.models entry point, the _model_manifest
factory, and its registration tests. The Olive LMMSEvaluator path imports
LMMSORTGenAIEvaluator directly, so the entry point only affected the
standalone lmms-eval CLI; dropping it keeps setup.py out of this change.
---
 olive/evaluator/lmms_ort.py     | 26 -------------------------
 setup.py                        |  3 ---
 test/evaluator/test_lmms_ort.py | 34 ---------------------------------
 3 files changed, 63 deletions(-)

diff --git a/olive/evaluator/lmms_ort.py b/olive/evaluator/lmms_ort.py
index b4ef57f473..3ee00fd685 100644
--- a/olive/evaluator/lmms_ort.py
+++ b/olive/evaluator/lmms_ort.py
@@ -629,29 +629,3 @@ def loglikelihood(self, requests: list[Instance], disable_tqdm: bool = False) ->
 
     def generate_until_multi_round(self, requests) -> list[str]:
         raise NotImplementedError("ortgenai_mm does not support lmms-eval multi-round generation yet.")
-
-
-# -----------------------------------------------------------------------------
-# lmms-eval MODEL_REGISTRY_V2 entry-point factory.
-#
-# Exposed via setup.py entry_points["lmms_eval.models"], so a fresh install of
-# olive-ai makes ``--model ortgenai_mm`` discoverable from the lmms-eval CLI
-# (e.g. ``python -m lmms_eval --model ortgenai_mm ...``) without requiring the
-# caller to import this module first.
-#
-# lmms-eval's ``ModelRegistryV2.load_entrypoint_manifests`` accepts a
-# ``Callable`` payload, so we keep the import of ``ModelManifest`` lazy. That
-# way ``olive`` (and the rest of this module) stays importable when lmms-eval
-# is not installed.
-# -----------------------------------------------------------------------------
-def _model_manifest():
-    """Return the lmms-eval ModelManifest for ``ortgenai_mm``.
-
-    Used as an entry-point payload for lmms-eval's MODEL_REGISTRY_V2.
-    """
-    from lmms_eval.models.registry_v2 import ModelManifest
-
-    return ModelManifest(
-        model_id="ortgenai_mm",
-        simple_class_path="olive.evaluator.lmms_ort.LMMSORTGenAIEvaluator",
-    )
diff --git a/setup.py b/setup.py
index 798010301d..b4aebf070a 100644
--- a/setup.py
+++ b/setup.py
@@ -88,8 +88,5 @@ def get_extra_deps(rel_path):
     data_files=[],
     entry_points={
         "console_scripts": ["olive=olive.cli.launcher:main"],
-        "lmms_eval.models": [
-            "ortgenai_mm = olive.evaluator.lmms_ort:_model_manifest",
-        ],
     },
 )
diff --git a/test/evaluator/test_lmms_ort.py b/test/evaluator/test_lmms_ort.py
index 595b158224..630b444cba 100644
--- a/test/evaluator/test_lmms_ort.py
+++ b/test/evaluator/test_lmms_ort.py
@@ -19,7 +19,6 @@
 from olive.evaluator.lmms_ort import (
     LMMSORTGenAIEvaluator,
     _build_prompt,
-    _model_manifest,
     _normalize_execution_provider,
 )
 from olive.evaluator.olive_evaluator import LMMSEvaluator
@@ -519,39 +518,6 @@ def test_lmms_ort_genai_evaluator_is_simple_flag_matches_registration():
     assert LMMSORTGenAIEvaluator.is_simple is True
 
 
-def test_model_manifest_factory_returns_expected_manifest():
-    """Verify the entry-point payload points at LMMSORTGenAIEvaluator."""
-    pytest.importorskip("lmms_eval")
-
-    manifest = _model_manifest()
-
-    assert manifest.model_id == "ortgenai_mm"
-    assert manifest.simple_class_path == "olive.evaluator.lmms_ort.LMMSORTGenAIEvaluator"
-    assert manifest.chat_class_path is None
-
-
-def test_olive_ai_registers_ortgenai_mm_entry_point():
-    """Verify olive-ai exposes ortgenai_mm via the lmms_eval.models entry-point group."""
-    from importlib.metadata import entry_points
-
-    eps = {ep.name: ep.value for ep in entry_points(group="lmms_eval.models")}
-    assert eps.get("ortgenai_mm") == "olive.evaluator.lmms_ort:_model_manifest"
-
-
-def test_model_registry_v2_resolves_ortgenai_mm():
-    """Verify lmms-eval's MODEL_REGISTRY_V2 resolves ortgenai_mm via the entry point."""
-    pytest.importorskip("lmms_eval")
-    from lmms_eval.models import MODEL_REGISTRY_V2
-
-    resolved = MODEL_REGISTRY_V2.resolve("ortgenai_mm")
-    assert resolved.model_id == "ortgenai_mm"
-    assert resolved.model_type == "simple"
-    assert resolved.class_path == "olive.evaluator.lmms_ort.LMMSORTGenAIEvaluator"
-
-    cls = MODEL_REGISTRY_V2.get_model_class("ortgenai_mm")
-    assert cls is LMMSORTGenAIEvaluator
-
-
 # -----------------------------------------------------------------------------
 # Visual partitioning
 # -----------------------------------------------------------------------------

From 10eee66f6a21de242f255ff370e41951b742a8d9 Mon Sep 17 00:00:00 2001
From: Delwin Kim <139003345+DelwinKim@users.noreply.github.com>
Date: Tue, 23 Jun 2026 17:25:57 +0000
Subject: [PATCH 6/6] evaluator: auto-detect structured-content chat template
 support via probe

---
 olive/evaluator/lmms_ort.py        | 113 ++++++++++++++++++++++-------
 olive/evaluator/olive_evaluator.py |  17 ++---
 test/evaluator/test_lmms_ort.py    |  87 +++++++++++++++++++++-
 3 files changed, 182 insertions(+), 35 deletions(-)

diff --git a/olive/evaluator/lmms_ort.py b/olive/evaluator/lmms_ort.py
index 3ee00fd685..03a0e0bbae 100644
--- a/olive/evaluator/lmms_ort.py
+++ b/olive/evaluator/lmms_ort.py
@@ -247,8 +247,8 @@ def __init__(
         provider_options: dict | None = None,
         fail_on_error: bool = True,
         prompt_template: str | None = None,
-        image_token_format: str = "<|image_{index}|>",
-        audio_token_format: str = "<|audio_{index}|>",
+        image_token_format: str | None = None,
+        audio_token_format: str | None = None,
         **kwargs,
     ) -> None:
         if _LMMS_EVAL_IMPORT_ERROR is not None:
@@ -325,6 +325,14 @@ def __init__(
         except json.JSONDecodeError as e:
             raise ValueError(f"Invalid genai_config.json in {self.model_dir}") from e
 
+        # Probe (once) whether this model's chat template injects media tokens
+        # from structured content parts (``{"type": "image"}``). Well-behaved
+        # templates (Gemma-4, Qwen2.5-VL, Qwen3-VL) do; Phi-4-MM's template
+        # stringifies the content list as Python repr instead. When supported,
+        # the adapter lets the template emit the correct per-model media tokens
+        # automatically, so the user does not need to set ``image_token_format``.
+        self._supports_structured_content = self._probe_structured_content_support()
+
         self._rank = 0
         self._world_size = 1
         logger.info("Model loaded. Model type: %s", self._model_type)
@@ -518,28 +526,72 @@ def _score_continuation(self, prompt: str, continuation: str, images, audios) ->
         del generator
         return total_logprob, all_greedy
 
+    _DEFAULT_IMAGE_TOKEN_FORMAT = "<|image_{index}|>"
+    _DEFAULT_AUDIO_TOKEN_FORMAT = "<|audio_{index}|>"
+
+    def _probe_structured_content_support(self) -> bool:
+        """Report whether the chat template turns typed content parts into media tokens.
+
+        A single user message whose content is the list
+        ``[{"type": "image"}, {"type": "text", "text": "x"}]`` is rendered via
+        ``apply_chat_template`` and the output is inspected:
+
+        - Templates that understand typed parts (Gemma-4, Qwen2.5-VL, Qwen3-VL)
+          emit their own media token, so no dict text survives in the output.
+        - Templates that do not (Phi-4-MM) stringify the list, which leaves
+          ``{'type'`` or ``"type"`` in the output.
+
+        Run once from ``__init__``. Returns False when there is no chat template
+        or when rendering the probe raises.
+        """
+        if not self._has_chat_template:
+            return False
+        probe_msgs = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "x"}]}]
+        try:
+            rendered = self._tokenizer.apply_chat_template(json.dumps(probe_msgs), add_generation_prompt=True)
+        except Exception as exc:  # pragma: no cover - defensive
+            logger.debug("Structured-content probe failed; falling back to pre-render: %s", exc)
+            return False
+        # Any surviving dict text means the template stringified the parts.
+        leak_markers = ("{'type'", '"type"')
+        return not any(marker in rendered for marker in leak_markers)
+
+    def _build_structured_chat_prompt(self, user_text: str, num_images: int, num_audios: int) -> str:
+        """Render the chat prompt from typed content parts rather than a flat string.
+
+        Parts are ordered images, then audios, then the text; the template
+        supplies the media tokens. Callers use this only when the probe
+        reported support and no media token format was set by the user.
+        """
+        parts: list[dict[str, Any]] = [
+            *({"type": "image"} for _ in range(num_images)),
+            *({"type": "audio"} for _ in range(num_audios)),
+            {"type": "text", "text": user_text},
+        ]
+        user_turn: dict[str, Any] = {"role": "user", "content": parts}
+        system_turn = [{"role": "system", "content": self.system_prompt}] if self.system_prompt else []
+        payload = json.dumps([*system_turn, user_turn])
+        return self._tokenizer.apply_chat_template(payload, add_generation_prompt=True)
+
     def _build_prompt_for_request(self, user_text: str, num_images: int, num_audios: int) -> str:
         """Build the final prompt string fed to ``og.MultiModalProcessor``.
 
-        Default path: pre-render image/audio markers into the user content
-        string using ``image_token_format`` / ``audio_token_format``, then call
-        ``og.Tokenizer.apply_chat_template`` to add the model-specific chat
-        scaffolding (system/user/assistant turn markers).
-
-        Pure content-parts (``{"type": "image"}``) is what PR #2488 and the
-        olive-recipes Qwen2.5-VL eval scripts do, and it works for chat
-        templates that understand structured content (Qwen2.5-VL, Qwen3-VL,
-        Gemma-4). However, Phi-4-MM's chat template stringifies content lists
-        as Python repr (verified: produces
-        ``<|user|>[{'type': 'image'}, ...]<|end|>`` instead of injecting
-        ``<|image_1|>``). Pre-rendering the markers ourselves before
-        ``apply_chat_template`` works for both conventions, since templates
-        that just pass through user content render identically either way.
-
-        Fallback path: ``_build_prompt`` legacy format-string. Used when the
-        user has explicitly set ``prompt_template`` in the evaluator config
-        (to override per-benchmark) or when the underlying onnxruntime-genai
-        version predates ``apply_chat_template`` on ``og.Tokenizer``.
+        Path selection:
+
+        1. **Whisper**: no chat template; return the decoder-start token sequence.
+        2. **Explicit override / no chat template**: legacy ``_build_prompt``
+           format-string path (when the user set ``prompt_template`` or the
+           onnxruntime-genai version predates ``apply_chat_template``).
+        3. **Structured content** (preferred): when the model's chat template
+           injects media tokens from structured content parts (auto-detected by
+           :meth:`_probe_structured_content_support`) and the user did not set
+           ``image_token_format`` / ``audio_token_format``. The template emits
+           the correct per-model media tokens (e.g. ``<|image|>`` for Gemma-4,
+           ``<|vision_start|>...`` for Qwen2.5-VL) — no per-model config needed.
+        4. **Pre-render** (fallback): pre-render media markers into a flat user
+           string, then ``apply_chat_template``. Used for templates that
+           stringify structured content (Phi-4-MM) or when the user explicitly
+           supplies a media token format.
         """
         if self._model_type == "whisper":
             # Whisper has no chat template; the "prompt" is just the decoder-start
@@ -555,12 +607,23 @@ def _build_prompt_for_request(self, user_text: str, num_images: int, num_audios:
                 user_text,
                 self.system_prompt,
                 self.prompt_template,
-                self.image_token_format,
-                self.audio_token_format,
+                self.image_token_format or self._DEFAULT_IMAGE_TOKEN_FORMAT,
+                self.audio_token_format or self._DEFAULT_AUDIO_TOKEN_FORMAT,
             )
 
-        image_markers = "".join(_format_media_tokens(num_images, self.image_token_format))
-        audio_markers = "".join(_format_media_tokens(num_audios, self.audio_token_format))
+        # Prefer structured content when the template supports it AND the user
+        # did not pin a specific media token format. This lets well-behaved
+        # templates inject their own correct tokens without per-model config.
+        user_pinned_tokens = self.image_token_format is not None or self.audio_token_format is not None
+        if self._supports_structured_content and not user_pinned_tokens:
+            return self._build_structured_chat_prompt(user_text, num_images, num_audios)
+
+        image_markers = "".join(
+            _format_media_tokens(num_images, self.image_token_format or self._DEFAULT_IMAGE_TOKEN_FORMAT)
+        )
+        audio_markers = "".join(
+            _format_media_tokens(num_audios, self.audio_token_format or self._DEFAULT_AUDIO_TOKEN_FORMAT)
+        )
         user_content = f"{image_markers}{audio_markers}{user_text}"
 
         messages: list[dict[str, Any]] = []
diff --git a/olive/evaluator/olive_evaluator.py b/olive/evaluator/olive_evaluator.py
index ee7d63286e..d2f27c07d6 100644
--- a/olive/evaluator/olive_evaluator.py
+++ b/olive/evaluator/olive_evaluator.py
@@ -2082,15 +2082,14 @@ def __init__(self, tasks: list[str], **kwargs):
         self.output_path = kwargs.get("output_path")
         self.fail_on_error = bool(kwargs.get("fail_on_error", True))
         self.prompt_template = kwargs.get("prompt_template")
-        self.image_token_format = kwargs.get("image_token_format", "<|image_{index}|>")
-        self.audio_token_format = kwargs.get("audio_token_format", "<|audio_{index}|>")
-        # NOTE: ``prompt_template`` / ``image_token_format`` / ``audio_token_format``
-        # are legacy format-string knobs and should rarely be needed. By default
-        # the ortgenai_mm adapter calls ``og.Tokenizer.apply_chat_template`` (same
-        # path used by PR #2488 and olive-recipes eval scripts), which reads the
-        # package's ``chat_template.jinja`` and produces the correct chat format
-        # for every supported model automatically. Setting ``prompt_template``
-        # forces the adapter into the legacy hand-templated path.
+        # Default to None (auto): the ortgenai_mm adapter probes the model's chat
+        # template once at load and, when it injects media tokens from structured
+        # content parts (Gemma-4, Qwen2.5-VL, Qwen3-VL), emits the correct
+        # per-model token automatically — no override needed. Set these only to
+        # force a specific media token format (e.g. for a template that
+        # stringifies structured content, like Phi-4-MM).
+        self.image_token_format = kwargs.get("image_token_format")
+        self.audio_token_format = kwargs.get("audio_token_format")
         # HF-only knobs (forwarded to lmms-eval's native wrapper if present).
         # ``trust_remote_code`` defaults to False to match the rest of Olive
         # (e.g. olive/common/hf/utils.py, olive/data/component/load_dataset.py)
diff --git a/test/evaluator/test_lmms_ort.py b/test/evaluator/test_lmms_ort.py
index 630b444cba..3bd52104d6 100644
--- a/test/evaluator/test_lmms_ort.py
+++ b/test/evaluator/test_lmms_ort.py
@@ -75,6 +75,7 @@ def _make_evaluator_for_prompt_tests():
     inst.image_token_format = "<|image_{index}|>"
     inst.audio_token_format = "<|audio_{index}|>"
     inst._has_chat_template = True
+    inst._supports_structured_content = False
     return inst
 
 
@@ -145,6 +146,90 @@ def test_build_prompt_for_request_falls_back_when_chat_template_unavailable():
     inst._tokenizer.apply_chat_template.assert_not_called()
 
 
+def test_build_prompt_for_request_uses_structured_content_when_supported_and_not_pinned():
+    """Use structured content parts when supported and the user did not pin a token format."""
+    inst = _make_evaluator_for_prompt_tests()
+    inst._supports_structured_content = True
+    inst.image_token_format = None  # auto
+    inst.audio_token_format = None  # auto
+    inst._tokenizer.apply_chat_template.return_value = "<rendered>"
+
+    prompt = inst._build_prompt_for_request("What is this?", num_images=1, num_audios=1)
+
+    import json as _json
+
+    messages = _json.loads(inst._tokenizer.apply_chat_template.call_args.args[0])
+    assert prompt == "<rendered>"  # the template output is returned as the final prompt
+    assert messages[-1]["role"] == "user"
+    # Content is a list of typed parts, NOT a pre-rendered string.
+    assert messages[-1]["content"] == [
+        {"type": "image"},
+        {"type": "audio"},
+        {"type": "text", "text": "What is this?"},
+    ]
+
+
+def test_build_prompt_for_request_pre_renders_when_token_format_pinned():
+    """A user-pinned image_token_format selects pre-rendering even if the template handles parts."""
+    import json as _json
+
+    evaluator = _make_evaluator_for_prompt_tests()
+    evaluator._supports_structured_content = True
+    evaluator.image_token_format = "<|vision_start|>"  # pinned by the user
+    evaluator.audio_token_format = None
+    evaluator._tokenizer.apply_chat_template.return_value = "<rendered>"
+
+    evaluator._build_prompt_for_request("Q", num_images=1, num_audios=0)
+
+    sent_messages = _json.loads(evaluator._tokenizer.apply_chat_template.call_args.args[0])
+    # The user turn is one flat string with the pinned marker in front of the text.
+    assert sent_messages[-1]["content"] == "<|vision_start|>Q"
+
+
+def test_build_prompt_for_request_pre_renders_when_structured_unsupported():
+    """Pre-render markers when the template cannot handle structured content (Phi-4-MM)."""
+    import json as _json
+
+    evaluator = _make_evaluator_for_prompt_tests()
+    evaluator._supports_structured_content = False
+    evaluator.image_token_format = None  # auto
+    evaluator.audio_token_format = None
+    evaluator._tokenizer.apply_chat_template.return_value = "<rendered>"
+
+    evaluator._build_prompt_for_request("Q", num_images=1, num_audios=0)
+
+    sent_messages = _json.loads(evaluator._tokenizer.apply_chat_template.call_args.args[0])
+    # With no format pinned, the default ``<|image_{index}|>`` marker is used.
+    assert sent_messages[-1]["content"] == "<|image_1|>Q"
+
+
+def test_probe_structured_content_support_detects_injection():
+    """The probe reports support when the rendered text carries a media token and no dict repr."""
+    tokenizer = MagicMock()
+    tokenizer.apply_chat_template.return_value = "<|im_start|>user\n<|image|>x<|im_end|>"
+    evaluator = LMMSORTGenAIEvaluator.__new__(LMMSORTGenAIEvaluator)
+    evaluator._has_chat_template = True
+    evaluator._tokenizer = tokenizer
+    assert evaluator._probe_structured_content_support() is True
+
+
+def test_probe_structured_content_support_detects_stringified_repr():
+    """The probe reports no support when the rendered text contains the Python dict repr."""
+    tokenizer = MagicMock()
+    tokenizer.apply_chat_template.return_value = "<|user|>[{'type': 'image'}, ...]<|end|>"
+    evaluator = LMMSORTGenAIEvaluator.__new__(LMMSORTGenAIEvaluator)
+    evaluator._has_chat_template = True
+    evaluator._tokenizer = tokenizer
+    assert evaluator._probe_structured_content_support() is False
+
+
+def test_probe_structured_content_support_false_when_no_chat_template():
+    """Without a chat template the probe returns False before touching the tokenizer."""
+    inst = LMMSORTGenAIEvaluator.__new__(LMMSORTGenAIEvaluator)
+    inst._has_chat_template = False
+    assert inst._probe_structured_content_support() is False
+
+
 def test_lmms_evaluator_converts_lmms_results(tmp_path):
     model_dir = tmp_path / "model"
     model_dir.mkdir()
@@ -201,7 +286,7 @@ def test_lmms_evaluator_converts_lmms_results(tmp_path):
         fail_on_error=False,
         prompt_template="{user_content}",
         image_token_format="<image>",
-        audio_token_format="<|audio_{index}|>",
+        audio_token_format=None,
     )
     simple_evaluate_mock.assert_called_once()
     assert result.get_value("ai2d_lite", "exact_match") == 0.5