
Commit 589fe36

isjinghao, Copilot, and mzr1996 authored
[Benchmark] Add support for MMOral-OPG-Open benchmark (#1484)
* [Benchmark] Add support for MMOral-OPG-Closed
* Prepare MMOral OPG utils and dataset wiring
* [Benchmark] Add support for MMOral-OPG-OPEN benchmark
* fix
* Potential fix for pull request finding
  Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
* Fix pre-commit/flake8 formatting
* Fix flake8 issues in mmoral_opg dataset utils
* Fix flake8/isort for MMOral-OPG open dataset

---------

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Ma Zerun <mzr1996@163.com>
1 parent 401916c commit 589fe36

3 files changed: 241 additions & 1 deletion

File tree

vlmeval/dataset/__init__.py
vlmeval/dataset/mmoral_opg_open.py
vlmeval/dataset/utils/mmoral_opg.py

vlmeval/dataset/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -83,6 +83,7 @@
 from .mmifeval import MMIFEval
 from .mmlongbench import MMLongBench
 from .mmmath import MMMath
+from .mmoral_opg_open import MMOral_OPG_OPEN
 from .mmsafetybench import MMSafetyBenchDataset
 from .mmsibench import MMSIBench, MMSIVideoBench
 from .moat import MOAT
@@ -287,7 +288,7 @@ def evaluate(self, eval_file, **judge_kwargs):
     UniSVG, SArena, VLMsAreBiased, MMESCIDataset, CoreCognition, GroundingME,
     FoxBench, VTCBench, Asclepius, PlotQA, ChartX, ChartBench, ChartCapDataset, WorldVQA, PuzzleVQA, VisualPuzzles,
     MMSafetyBenchDataset, MSSBenchDataset, SIUODataset, SIUOGenDataset, SIUOMCQDataset, M3oralBenchDataset,
-    Design2Code, VLADBench, SSIBenchDataset, NPMM, SGI_Bench_Experimental_Reasoning
+    Design2Code, VLADBench, SSIBenchDataset, NPMM, SGI_Bench_Experimental_Reasoning, MMOral_OPG_OPEN
 ]

 # add by EASI team
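With the import and registry entry above, the new class is usable like any other image benchmark in the suite. A minimal sketch of exercising it (not part of this diff; the positional `dataset` argument follows ImageBaseDataset's usual constructor convention and is an assumption here):

# Illustrative sketch only. Assumes ImageBaseDataset's usual constructor,
# which downloads DATASET_URL['MMOral_OPG_OPEN'] and verifies it against
# DATASET_MD5 on first use.
from vlmeval.dataset import MMOral_OPG_OPEN

dataset = MMOral_OPG_OPEN('MMOral_OPG_OPEN')  # name must match the DATASET_URL key
line = dataset.data.iloc[0]                   # one TSV row: index, image, question, answer, category
msgs = dataset.build_prompt(line)             # [{'type': 'image', ...}, {'type': 'text', ...}]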

vlmeval/dataset/mmoral_opg_open.py

Lines changed: 117 additions & 0 deletions
@@ -0,0 +1,117 @@
import os
import os.path as osp

from ..smp import decode_base64_to_image_file, dump, load, read_ok
from ..utils import track_progress_rich
from .image_base import ImageBaseDataset
from .utils import DEBUG_MESSAGE, build_judge


class MMOralBase(ImageBaseDataset):
    """Shared image-dumping logic for MMOral-OPG benchmarks."""

    def dump_image(self, line):
        os.makedirs(self.img_root, exist_ok=True)

        tgt_path_z = []
        if isinstance(line['image'], list):
            for i in range(len(line['image'])):
                tgt_path = osp.join(self.img_root, f"{line['index']}--{i + 1}.jpg")
                if not read_ok(tgt_path):
                    decode_base64_to_image_file(line['image'][i], tgt_path)
                tgt_path_z.append(tgt_path)
        else:
            tgt_path = osp.join(self.img_root, f"{line['index']}.jpg")
            if not read_ok(tgt_path):
                decode_base64_to_image_file(line['image'], tgt_path)
            tgt_path_z.append(tgt_path)
        return tgt_path_z


class MMOral_OPG_OPEN(MMOralBase):
    """Open-ended MMOral-OPG benchmark (VQA)."""

    TYPE = 'VQA'

    DATASET_URL = {
        'MMOral_OPG_OPEN': 'https://huggingface.co/datasets/OralGPT/MMOral-OPG-Bench/resolve/main/MMOral-OPG-Bench-Open-Ended.tsv'  # noqa: E501
    }

    DATASET_MD5 = {
        'MMOral_OPG_OPEN': 'd328b1b527ef7467b328d8b35d5f8155'
    }

    def build_prompt(self, line):
        if isinstance(line, int):
            line = self.data.iloc[line]

        tgt_path = self.dump_image(line)
        question = line['question']
        prompt = (
            f'Question: {question}\n'
            'Please provide a detailed and accurate answer to the question.'
        )

        msgs = []
        if isinstance(tgt_path, list):
            msgs.extend([dict(type='image', value=p) for p in tgt_path])
        else:
            msgs = [dict(type='image', value=tgt_path)]
        msgs.append(dict(type='text', value=prompt))

        return msgs

    @classmethod
    def evaluate(cls, eval_file, **judge_kwargs):
        """Evaluation with LLM-as-a-judge for open-ended answers."""
        from .utils.mmoral_opg import MMOral_opg_acc, MMOral_opg_auxeval

        suffix = eval_file.split('.')[-1]
        # Some call sites may not explicitly set `judge_kwargs['model']`,
        # so we fall back to a default name for the judge model.
        judge_model_name = judge_kwargs.pop('model', 'mmoral-opg-judge')
        storage = eval_file.replace(f'.{suffix}', f'_{judge_model_name}.xlsx')
        tmp_file = eval_file.replace(f'.{suffix}', f'_{judge_model_name}.pkl')
        nproc = judge_kwargs.pop('nproc', 4)

        if not osp.exists(storage):
            data = load(eval_file)
            model = build_judge(model=judge_model_name, max_tokens=16384, **judge_kwargs)
            assert model.working(), (
                'MMOral-Open-ended evaluation requires a working OPENAI API\n'
                + DEBUG_MESSAGE
            )

            lt = len(data)
            lines = [data.iloc[i] for i in range(lt)]
            tups = [(model, line) for line in lines]
            indices = [line['index'] for line in lines]

            ans = load(tmp_file) if osp.exists(tmp_file) else {}
            tups = [x for x, i in zip(tups, indices) if i not in ans]
            indices = [i for i in indices if i not in ans]

            if len(indices):
                new_results = track_progress_rich(
                    MMOral_opg_auxeval,
                    tups,
                    nproc=nproc,
                    chunksize=nproc,
                    keys=indices,
                    save=tmp_file,
                )
                ans = load(tmp_file)
                for k, v in zip(indices, new_results):
                    assert k in ans
                    assert ans[k]['log'] == v['log'] and ans[k]['score'] == v['score']

            data['score'] = [ans[idx]['score'] for idx in data['index']]
            data['log'] = [ans[idx]['log'] for idx in data['index']]
            dump(data, storage)

        score, score_fine = MMOral_opg_acc(storage)
        score_pth = storage.replace('.xlsx', '_score.csv')
        score_fine_pth = storage.replace('.xlsx', '_score_fine.csv')
        dump(score, score_pth)
        dump(score_fine, score_fine_pth)
        return score
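evaluate() is resumable: per-sample judge outputs are cached in the _{model}.pkl file and the merged scores land in the _{model}.xlsx, so an interrupted run picks up where it left off. A hedged sketch of invoking it directly (the prediction path and the judge model name are illustrative; in practice the VLMEvalKit runner supplies judge_kwargs):

# Illustrative only: requires a working OpenAI-compatible judge endpoint,
# as asserted via build_judge(...).working().
scores = MMOral_OPG_OPEN.evaluate(
    'outputs/MyVLM/MyVLM_MMOral_OPG_OPEN.xlsx',  # hypothetical prediction file
    model='gpt-4o',                              # judge model name forwarded to build_judge
    nproc=4,                                     # parallel judge workers
)
print(scores)  # coarse per-capability table; *_score.csv and *_score_fine.csv are written alongside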
vlmeval/dataset/utils/mmoral_opg.py

Lines changed: 122 additions & 0 deletions
@@ -0,0 +1,122 @@
from collections import defaultdict

import pandas as pd

from ...smp.file import load

# for MMOral-OPG-Bench


def build_mmoral_opg_gpt4_prompt(line):
    question = line['question']
    gt = str(line['answer'])
    prediction = str(line['prediction'])
    # Keep this prompt readable and flake8-friendly (avoid overly long lines).
    prompt = """
Given the question, compare the ground truth and prediction from AI
models, to generate a correctness score for the prediction.
The correctness score is 0.0 (totally wrong), 0.1, 0.2, 0.3, 0.4, 0.5,
0.6, 0.7, 0.8, 0.9, or 1.0 (totally right).
Just complete the last space of the correctness score.

Question | Ground truth | Prediction | Correctness
--- | --- | --- | ---
How many teeth are visualized in the radiograph? | 30 teeth are visualized with clear anatomical
definition. | 30 | 1.0
How many teeth are visualized in the radiograph? | 30 teeth are visualized with clear anatomical
definition. | 29 teeth are visualized with clear anatomical definition. | 0.0
What is the status of the wisdom teeth in the radiograph? | Three
wisdom teeth are detected, all of which are impacted: #18, #28, and #48.
| #18: impacted, #28: impacted, #48: erupted | 0.7
What is the condition of the teeth #26 and #14? | Teeth #26 and #14
show signs of periapical abscesses. | Teeth #26 and #23 show signs
of periapical abscesses. | 0.5
What is the condition of the bone architecture and visible structures in
the jaw? | No apparent bone loss is observed. Bilateral mandibular
canals and maxillary sinuses are clearly visible. | Bilateral
mandibular canals and maxillary sinuses are clearly visible. | 0.5
What is the clinical priority concern regarding the periapical lesions?
| Periapical cysts at #11 and #12, and granuloma at #46 require
endodontic evaluation. | Periapical lesions at #11, #12, and #46
require endodontic evaluation. | 0.8
What radiographic features are visible in tooth #31 on the panoramic X-ray? | [\n
{\"Teeth position\": {\"point_2d\": [1242, 726]}},\n
{\"Crown\": {\"box_2d\": [1220, 637, 1266, 741]}}\n
] | Crown | 0.8
What radiographic features are visible in tooth #31 on the panoramic X-ray? | [\n
{\"Teeth position\": {\"point_2d\": [1242, 726]}},\n
{\"Crown\": {\"box_2d\": [1220, 637, 1266, 741]}}\n
] | Crown at position: [1230, 627, 1276, 750] | 0.9
What radiographic features are visible in tooth #31 on the panoramic X-ray? | [\n
{\"Teeth position\": {\"point_2d\": [1242, 726]}},\n
{\"Crown\": {\"box_2d\": [1220, 637, 1266, 741]}}\n
] | Teeth at position: {\"point_2d\": [1242, 726]}},\n
{Crown at position: {\"box_2d\": [1230, 627, 1276, 750]}} | 1.0
"""
    gpt4_prompt = prompt + '\n' + ' | '.join(
        [question, gt.replace('<AND>', ' <AND> ').replace('<OR>', ' <OR> '), prediction, ''])
    return gpt4_prompt


def MMOral_opg_auxeval(model, line):
    def float_cvt(s):
        try:
            return float(s)
        except ValueError:
            return None

    prompt = build_mmoral_opg_gpt4_prompt(line)
    log = ''
    retry = 5
    for i in range(retry):
        output = model.generate(prompt, temperature=i * 0.5)
        score = float_cvt(output)
        if score is None:
            log += f'Try {i}: output is {output}, failed to parse.\n'
        elif score < 0 or score > 1:
            log += f'Try {i}: output is {output}, invalid score: {score}.\n'
        else:
            log += 'Succeed'
            return dict(log=log, score=score)
    log += 'All 5 retries failed.\n'
    return dict(log=log, score=0.0)


def MMOral_opg_acc(result_file):
    data = load(result_file)
    tot = defaultdict(lambda: 0)
    score = defaultdict(lambda: 0)
    lt = len(data)
    # Coarse capability buckets, matched by substring against each row's category.
    cate_list = ['Teeth', 'Patho', 'HisT', 'Jaw', 'SumRec', 'Report']
    cate2_list = []
    for i in range(lt):
        item = data.iloc[i]
        cate = item['category']
        cate2 = cate.replace(',', '_')
        if cate2 not in cate2_list:
            cate2_list.append(cate2)
        grade = float(item['score'])
        for capa in cate_list:
            if capa in cate:
                tot[capa] += 1
                score[capa] += grade
        tot['Overall'] += 1
        tot[cate2] += 1
        score['Overall'] += grade
        score[cate2] += grade

    res = defaultdict(list)
    res2 = defaultdict(list)
    cate_list.append('Overall')
    cate2_list.append('Overall')
    for k in cate_list:
        res['Category'].append(k)
        res['tot'].append(tot[k])
        # Guard against buckets that never appear in the result file.
        res['acc'].append(score[k] / tot[k] * 100 if tot[k] else 0.0)
    for v in cate2_list:
        res2['Category'].append(v)
        res2['tot'].append(tot[v])
        res2['acc'].append(score[v] / tot[v] * 100 if tot[v] else 0.0)
    res = pd.DataFrame(res)
    res2 = pd.DataFrame(res2)
    return res, res2
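The three utilities compose as prompt -> judged score -> aggregation. A self-contained sketch with a stub judge (the StubJudge class and the sample row are invented for illustration; only the mmoral_opg functions come from this commit):

import pandas as pd

class StubJudge:
    """Stand-in for a build_judge model; always answers '0.8'."""

    def generate(self, prompt, temperature=0.0):
        return '0.8'

row = pd.Series({
    'question': 'How many teeth are visualized in the radiograph?',
    'answer': '30 teeth are visualized with clear anatomical definition.',
    'prediction': '30',
})
# The assembled prompt ends with "<question> | <ground truth> | <prediction> | "
# so the judge only has to fill in the final Correctness cell.
print(build_mmoral_opg_gpt4_prompt(row).splitlines()[-1])
print(MMOral_opg_auxeval(StubJudge(), row))  # {'log': 'Succeed', 'score': 0.8}

MMOral_opg_acc then reads the judged .xlsx and returns the coarse table (six capability buckets plus Overall) and the fine-grained per-category table that evaluate() dumps to *_score.csv and *_score_fine.csv.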
