Commit a3656d4

[Benchmark] Add support for MMOral-OPG-Closed benchmark (#1483)

* [Benchmark] Add support for MMOral-OPG-Closed
* [Lint] Fix flake8 issues in the MMOral-OPG-Closed integration: use explicit imports and formatting adjustments so lint checks pass without changing benchmark behavior.
* [Lint] Align the MMOral-OPG files with the isort hook: apply deterministic import ordering and spacing updates so the pre-commit isort check passes in CI.

1 parent 589fe36 commit a3656d4

3 files changed: 222 additions & 9 deletions

File tree

vlmeval/dataset/__init__.py
vlmeval/dataset/mmoral_opg_closed.py
vlmeval/dataset/utils/mmoral_opg.py

vlmeval/dataset/__init__.py

Lines changed: 15 additions & 9 deletions

@@ -83,6 +83,7 @@
 from .mmifeval import MMIFEval
 from .mmlongbench import MMLongBench
 from .mmmath import MMMath
+from .mmoral_opg_closed import MMOral_OPG_CLOSED
 from .mmoral_opg_open import MMOral_OPG_OPEN
 from .mmsafetybench import MMSafetyBenchDataset
 from .mmsibench import MMSIBench, MMSIVideoBench
@@ -266,11 +267,11 @@ def evaluate(self, eval_file, **judge_kwargs):
 # Add new supported dataset class here
 IMAGE_DATASET = [
     ImageCaptionDataset, ImageYORNDataset, ImageMCQDataset, ImageVQADataset,
-    MathVision, LENS, MMMUDataset, OCRBench, MathVista, LLaVABench, LLaVABench_KO, VGRPBench, MMVet,
+    MathVision, LENS, MMMUDataset, OCRBench, MathVista, LLaVABench, LLaVABench_KO, VGRPBench, MMVet,  # noqa: E501
     MTVQADataset, TableVQABench, MMLongBench, VCRDataset, MMDUDataset, DUDE,
     SlideVQA, MUIRDataset, CCOCRDataset, GMAIMMBenchDataset, MMERealWorld,
     HRBenchDataset, CRPE, MathVerse, NaturalBenchDataset, MIABench,
-    OlympiadBench, SeePhys, WildVision, MMMath, QSpatial, Dynamath, GSM8KVDataset, MMGenBench, VizWiz,
+    OlympiadBench, SeePhys, WildVision, MMMath, QSpatial, Dynamath, GSM8KVDataset, MMGenBench, VizWiz,  # noqa: E501
     MMNIAH, CMMMU, VLRewardBench, WeMath, LogicVista, MMMUProDataset,
     CreationMMBenchDataset, ImageShortQADataset, MMAlignBench, OmniDocBench,
     VLM2Bench, VMCBenchDataset, EMMADataset, MME_CoT, MOAT, MedXpertQA_MM_test,
@@ -282,19 +283,19 @@ def evaluate(self, eval_file, **judge_kwargs):
     MMEReasoning, GOBenchDataset, SFE, ChartMimic, MMVMBench, XLRSBench,
     OmniEarthMCQBench, VisFactor, OSTDataset, OCRBench_v2, TreeBench, CVQA, M4Bench,
     AyaVisionBench, TopViewRS, VLMBias, MMHELIX, MedqbenchMCQDataset, MathCanvas, MMReason,
-    MedqbenchPairedDescriptionDataset, MedqbenchCaptionDataset, ChartMuseum, ChartQAPro, ReasonMap_Plus,
+    MedqbenchPairedDescriptionDataset, MedqbenchCaptionDataset, ChartMuseum, ChartQAPro, ReasonMap_Plus,  # noqa: E501
     olmOCRBench, OceanOCRBench, MATBench, VLRMBench, RefCOCODataset, RefSpatialDataset,
     ERQADataset, SimpleVQA, HiPhODataset, MaCBench,
     UniSVG, SArena, VLMsAreBiased, MMESCIDataset, CoreCognition, GroundingME,
-    FoxBench, VTCBench, Asclepius, PlotQA, ChartX, ChartBench, ChartCapDataset, WorldVQA, PuzzleVQA, VisualPuzzles,
-    MMSafetyBenchDataset, MSSBenchDataset, SIUODataset, SIUOGenDataset, SIUOMCQDataset, M3oralBenchDataset,
-    Design2Code, VLADBench, SSIBenchDataset, NPMM, SGI_Bench_Experimental_Reasoning, MMOral_OPG_OPEN
+    FoxBench, VTCBench, Asclepius, PlotQA, ChartX, ChartBench, ChartCapDataset, WorldVQA, PuzzleVQA, VisualPuzzles,  # noqa: E501
+    MMSafetyBenchDataset, MSSBenchDataset, SIUODataset, SIUOGenDataset, SIUOMCQDataset, M3oralBenchDataset,  # noqa: E501
+    Design2Code, VLADBench, SSIBenchDataset, NPMM, SGI_Bench_Experimental_Reasoning, MMOral_OPG_OPEN, MMOral_OPG_CLOSED  # noqa: E501
 ]

 # add by EASI team
 IMAGE_DATASET += [
     MindCubeBench, EmbSpatialBench, ViewSpatialBench, MMSIBench, SiteBenchImage,
-    SparBench, SpatialVizBench, StareBench, OmniSpatialBench, ERQABench, RoboSpatialBench, RefSpatialBench,
+    SparBench, SpatialVizBench, StareBench, OmniSpatialBench, ERQABench, RoboSpatialBench, RefSpatialBench,  # noqa: E501
     SPBench, ERIQBench, DA2K
 ]

@@ -311,7 +312,7 @@ def evaluate(self, eval_file, **judge_kwargs):
 ]

 # add by EASI team
-VIDEO_DATASET += [SiteBenchVideo, VsiBench, VsiSuperRecall, VsiSuperCount, MMSIVideoBench, STIBench, DSRBench]
+VIDEO_DATASET += [SiteBenchVideo, VsiBench, VsiSuperRecall, VsiSuperCount, MMSIVideoBench, STIBench, DSRBench]  # noqa: E501

 TEXT_DATASET = [
     TextMCQDataset, SGI_Bench_Wet_Experiment, SGI_Bench_Dry_Experiment,
@@ -410,6 +411,11 @@ def infer_dataset_basename(dataset_name):


 __all__ = [
-    'build_dataset', 'img_root_map', 'build_judge', 'extract_answer_from_item', 'prefetch_answer', 'DEBUG_MESSAGE'
+    'build_dataset',
+    'img_root_map',
+    'build_judge',
+    'extract_answer_from_item',
+    'prefetch_answer',
+    'DEBUG_MESSAGE',
 ]
 __all__.extend([cls.__name__ for cls in DATASET_CLASSES])
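
Registering MMOral_OPG_CLOSED in both the import block and IMAGE_DATASET is what makes the benchmark constructible by name through the build_dataset factory exported in __all__. A minimal sketch of that path, assuming a working VLMEvalKit install; the variable names are illustrative and not part of the commit:

    # Minimal sketch, not part of the commit: build the newly registered
    # benchmark by name via the factory exported from vlmeval.dataset.
    from vlmeval.dataset import build_dataset

    dataset = build_dataset('MMOral_OPG_CLOSED')  # fetches/verifies the TSV on first use
    msgs = dataset.build_prompt(0)                # interleaved image/text message list
    print(msgs[-1]['value'])                      # the MCQ text prompt built below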
vlmeval/dataset/mmoral_opg_closed.py

Lines changed: 163 additions & 0 deletions

@@ -0,0 +1,163 @@
+import os
+import os.path as osp
+from collections import defaultdict
+
+import pandas as pd
+from tqdm import tqdm
+
+from ..smp import decode_base64_to_image_file, dump, load, read_ok
+from .image_base import ImageBaseDataset
+
+
+class MMOralBase(ImageBaseDataset):
+    """Shared image-dumping logic for MMOral-OPG benchmarks."""
+
+    def dump_image(self, line):
+        os.makedirs(self.img_root, exist_ok=True)
+
+        tgt_path_z = []
+        if isinstance(line['image'], list):
+            for i in range(len(line['image'])):
+                tgt_path = osp.join(self.img_root, f"{line['index']}--{i + 1}.jpg")
+                if not read_ok(tgt_path):
+                    decode_base64_to_image_file(line['image'][i], tgt_path)
+                tgt_path_z.append(tgt_path)
+        else:
+            tgt_path = osp.join(self.img_root, f"{line['index']}.jpg")
+            if not read_ok(tgt_path):
+                decode_base64_to_image_file(line['image'], tgt_path)
+            tgt_path_z.append(tgt_path)
+        return tgt_path_z
+
+
+class MMOral_OPG_CLOSED(MMOralBase):
+    """Closed-ended MMOral-OPG benchmark (4-option MCQ)."""
+
+    TYPE = 'MCQ'
+
+    DATASET_URL = {
+        'MMOral_OPG_CLOSED': 'https://huggingface.co/datasets/OralGPT/MMOral-OPG-Bench/resolve/main/MMOral-OPG-Bench-Closed-Ended.tsv'  # noqa: E501
+    }
+
+    DATASET_MD5 = {
+        'MMOral_OPG_CLOSED': 'b13cff13ffce25225d5de0efed8e53fa'
+    }
+
+    def build_prompt(self, line):
+        if isinstance(line, int):
+            line = self.data.iloc[line]
+
+        tgt_path = self.dump_image(line)
+        question = line['question']
+
+        options_prompt = 'Options:\n'
+        for i in [['A', '1'], ['B', '2'], ['C', '3'], ['D', '4']]:
+            option_value = str(line[f'option{i[1]}'])
+            options_prompt += f"{i[0]}. {option_value}\n"
+
+        prompt = (
+            f'Question: {question}\n'
+            + options_prompt
+            + 'Please answer the above multiple-choice question by selecting the single correct option (A, B, C, or D). '  # noqa: E501
+            + 'If the provided information is insufficient to determine a clear answer, please choose the most likely '  # noqa: E501
+            + 'correct option based on the available data and your judgment.'
+        )
+
+        msgs = []
+        if isinstance(tgt_path, list):
+            msgs.extend([dict(type='image', value=p) for p in tgt_path])
+        else:
+            msgs = [dict(type='image', value=tgt_path)]
+        msgs.append(dict(type='text', value=prompt))
+
+        return msgs
+
+    @classmethod
+    def evaluate(cls, eval_file, **judge_kwargs):
+        """Direct accuracy evaluation on single-choice predictions."""
+        from .utils.mmoral_opg import get_single_choice_prediction
+
+        suffix = eval_file.split('.')[-1]
+        result_file = eval_file.replace(f'.{suffix}', '_acc.csv')
+        detail_result_file = eval_file.replace(f'.{suffix}', '_detailed_acc.csv')
+
+        if not osp.exists(result_file) or not osp.exists(detail_result_file):
+            data = load(eval_file)
+            assert 'answer' in data and 'prediction' in data
+            data['prediction'] = [str(x) for x in data['prediction']]
+            data['answer'] = [str(x) for x in data['answer']]
+
+            tot = defaultdict(lambda: 0)
+            score = defaultdict(lambda: 0)
+
+            main_category_list = ['Teeth', 'Patho', 'HisT', 'Jaw', 'SumRec']
+            categories = set()
+            subcategories = set()
+
+            for _, line in data.iterrows():
+                category = line.get('category', 'unknown')
+                categories.add(category)
+                subcategory = category.replace(',', '_')
+                subcategories.add(subcategory)
+
+                for main_cat in main_category_list:
+                    if main_cat in category:
+                        tot[main_cat] += 1
+
+                tot[category] += 1
+                tot[subcategory] += 1
+                tot['Overall'] += 1
+
+            for _, line in tqdm(data.iterrows()):
+                category = line.get('category', 'unknown')
+                subcategory = category.replace(',', '_')
+
+                index2ans = {
+                    'A': line['option1'],
+                    'B': line['option2'],
+                    'C': line['option3'],
+                    'D': line['option4'],
+                }
+
+                fact_option = get_single_choice_prediction(
+                    line['prediction'], ['A', 'B', 'C', 'D'], index2ans
+                )
+
+                if fact_option == line['answer']:
+                    for main_cat in main_category_list:
+                        if main_cat in category:
+                            score[main_cat] += 1
+
+                    score[category] += 1
+                    score[subcategory] += 1
+                    score['Overall'] += 1
+
+            main_result = defaultdict(list)
+            main_category_list.append('Overall')
+            for cat in main_category_list:
+                main_result['Category'].append(cat)
+                main_result['tot'].append(tot[cat])
+                main_result['acc'].append(
+                    score[cat] / tot[cat] * 100 if tot[cat] > 0 else 0
+                )
+
+            detailed_categories = list(categories) + ['Overall']
+            detailed_result = defaultdict(list)
+            for cat in detailed_categories:
+                detailed_result['Category'].append(cat)
+                detailed_result['tot'].append(tot[cat])
+                detailed_result['acc'].append(
+                    score[cat] / tot[cat] * 100 if tot[cat] > 0 else 0
+                )
+
+            main_df = pd.DataFrame(main_result)
+            detailed_df = pd.DataFrame(detailed_result)
+
+            main_df = main_df.sort_values('Category')
+            detailed_df = detailed_df.sort_values('Category')
+
+            dump(main_df, result_file)
+            dump(detailed_df, detail_result_file)

+        result = pd.read_csv(result_file)
+        return result
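
For orientation, a hedged sketch of scoring a finished prediction file with the classmethod above. The file path below is hypothetical; the file must carry the columns the method asserts on and reads (prediction, answer, option1..option4, category):

    # Illustrative only: the .xlsx path is hypothetical. evaluate() writes
    # '<name>_acc.csv' (main categories plus Overall) and
    # '<name>_detailed_acc.csv' (every raw category) next to the input,
    # then returns the main table.
    from vlmeval.dataset import MMOral_OPG_CLOSED

    acc = MMOral_OPG_CLOSED.evaluate('MMOral_OPG_CLOSED_gpt-4o_predictions.xlsx')
    print(acc)  # columns: Category, tot, acc (accuracy in percent)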

vlmeval/dataset/utils/mmoral_opg.py

Lines changed: 44 additions & 0 deletions

@@ -1,3 +1,4 @@
+import random
 from collections import defaultdict

 import pandas as pd
@@ -120,3 +121,46 @@ def MMOral_opg_acc(result_file):
     res = pd.DataFrame(res)
     res2 = pd.DataFrame(res2)
     return res, res2
+
+
+def get_single_choice_prediction(response, all_choices, index2ans):
+    for char in [',', '.', '!', '?', ';', ':', "'"]:
+        response = response.strip(char)
+    response = ' ' + response + ' '  # add space to avoid partial match
+
+    candidates = []
+
+    for choice in all_choices:
+        if f'({choice})' in response:
+            candidates.append(choice)
+
+    if len(candidates) == 0:
+        for choice in all_choices:
+            if f' {choice} ' in response:
+                candidates.append(choice)
+            elif f' {choice}.' in response:
+                candidates.append(choice)
+            elif f' {choice},' in response:
+                candidates.append(choice)
+
+    if len(candidates) == 0:
+        for index, ans in index2ans.items():
+            ans_str = str(ans)
+            if ans_str in response:
+                candidates.append(index)
+
+    if len(candidates) > 0:
+        positions = {}
+        for c in candidates:
+            pos = response.find(f' {c} ')
+            if pos == -1:
+                pos = response.find(f'({c})')
+            if pos == -1:
+                pos = response.find(str(index2ans[c]))
+            if pos != -1:
+                positions[c] = pos
+
+        if positions:
+            return min(positions.items(), key=lambda x: x[1])[0]
+
+    return random.choice(all_choices)
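
To make the matching precedence concrete (a parenthesized letter first, then a delimited bare letter, then the option text, with the earliest occurrence winning when several candidates match, and a random fallback otherwise), a small hedged walkthrough; the option strings are invented for illustration:

    # Illustrative only: invented options exercising get_single_choice_prediction.
    index2ans = {'A': 'impacted molar', 'B': 'radicular cyst',
                 'C': 'dental caries', 'D': 'no abnormality'}
    choices = ['A', 'B', 'C', 'D']

    get_single_choice_prediction('The answer is (B)', choices, index2ans)
    # -> 'B'  (parenthesized letter matched first)

    get_single_choice_prediction('I would pick C here', choices, index2ans)
    # -> 'C'  (bare letter delimited by spaces)

    get_single_choice_prediction('This looks like a radicular cyst', choices, index2ans)
    # -> 'B'  (no letter found; fell back to matching the option text)

    get_single_choice_prediction('A and B both fit, leaning A', choices, index2ans)
    # -> 'A'  (multiple candidates: earliest occurrence wins)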
