
Commit 589fe36

isjinghao, Copilot, and mzr1996 authored
[Benchmark] Add support for MMOral-OPG-Open benchmark (#1484)
* [Benchmark] Add support for MMOral-OPG-Closed
* Prepare MMOral OPG utils and dataset wiring
* [Benchmark] Add support for MMOral-OPG-OPEN benchmark
* fix
* Potential fix for pull request finding
  Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
* Fix pre-commit/flake8 formatting
* Fix flake8 issues in mmoral_opg dataset utils
* Fix flake8/isort for MMOral-OPG open dataset

---------

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Ma Zerun <mzr1996@163.com>
1 parent 401916c commit 589fe36

3 files changed: 241 additions & 1 deletion

File tree

vlmeval/dataset/__init__.py
vlmeval/dataset/mmoral_opg_open.py
vlmeval/dataset/utils/mmoral_opg.py

vlmeval/dataset/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -83,6 +83,7 @@
 from .mmifeval import MMIFEval
 from .mmlongbench import MMLongBench
 from .mmmath import MMMath
+from .mmoral_opg_open import MMOral_OPG_OPEN
 from .mmsafetybench import MMSafetyBenchDataset
 from .mmsibench import MMSIBench, MMSIVideoBench
 from .moat import MOAT
@@ -287,7 +288,7 @@ def evaluate(self, eval_file, **judge_kwargs):
     UniSVG, SArena, VLMsAreBiased, MMESCIDataset, CoreCognition, GroundingME,
     FoxBench, VTCBench, Asclepius, PlotQA, ChartX, ChartBench, ChartCapDataset, WorldVQA, PuzzleVQA, VisualPuzzles,
     MMSafetyBenchDataset, MSSBenchDataset, SIUODataset, SIUOGenDataset, SIUOMCQDataset, M3oralBenchDataset,
-    Design2Code, VLADBench, SSIBenchDataset, NPMM, SGI_Bench_Experimental_Reasoning
+    Design2Code, VLADBench, SSIBenchDataset, NPMM, SGI_Bench_Experimental_Reasoning, MMOral_OPG_OPEN
 ]

 # add by EASI team
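With the import and registry entry above, the new class is usable like any other image benchmark in the suite. A minimal sketch of exercising it (not part of this diff; the positional `dataset` argument follows ImageBaseDataset's usual constructor convention and is an assumption here):

# Illustrative sketch only. Assumes ImageBaseDataset's usual constructor,
# which downloads DATASET_URL['MMOral_OPG_OPEN'] and verifies it against
# DATASET_MD5 on first use.
from vlmeval.dataset import MMOral_OPG_OPEN

dataset = MMOral_OPG_OPEN('MMOral_OPG_OPEN')  # name must match the DATASET_URL key
line = dataset.data.iloc[0]                   # one TSV row: index, image, question, answer, category
msgs = dataset.build_prompt(line)             # [{'type': 'image', ...}, {'type': 'text', ...}]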

vlmeval/dataset/mmoral_opg_open.py

Lines changed: 117 additions & 0 deletions
@@ -0,0 +1,117 @@
import os
import os.path as osp

from ..smp import decode_base64_to_image_file, dump, load, read_ok
from ..utils import track_progress_rich
from .image_base import ImageBaseDataset
from .utils import DEBUG_MESSAGE, build_judge


class MMOralBase(ImageBaseDataset):
    """Shared image-dumping logic for MMOral-OPG benchmarks."""

    def dump_image(self, line):
        os.makedirs(self.img_root, exist_ok=True)

        tgt_path_z = []
        if isinstance(line['image'], list):
            for i in range(len(line['image'])):
                tgt_path = osp.join(self.img_root, f"{line['index']}--{i + 1}.jpg")
                if not read_ok(tgt_path):
                    decode_base64_to_image_file(line['image'][i], tgt_path)
                tgt_path_z.append(tgt_path)
        else:
            tgt_path = osp.join(self.img_root, f"{line['index']}.jpg")
            if not read_ok(tgt_path):
                decode_base64_to_image_file(line['image'], tgt_path)
            tgt_path_z.append(tgt_path)
        return tgt_path_z


class MMOral_OPG_OPEN(MMOralBase):
    """Open-ended MMOral-OPG benchmark (VQA)."""

    TYPE = 'VQA'

    DATASET_URL = {
        'MMOral_OPG_OPEN': 'https://huggingface.co/datasets/OralGPT/MMOral-OPG-Bench/resolve/main/MMOral-OPG-Bench-Open-Ended.tsv'  # noqa: E501
    }

    DATASET_MD5 = {
        'MMOral_OPG_OPEN': 'd328b1b527ef7467b328d8b35d5f8155'
    }

    def build_prompt(self, line):
        if isinstance(line, int):
            line = self.data.iloc[line]

        tgt_path = self.dump_image(line)
        question = line['question']
        prompt = (
            f'Question: {question}\n'
            'Please provide a detailed and accurate answer to the question.'
        )

        msgs = []
        if isinstance(tgt_path, list):
            msgs.extend([dict(type='image', value=p) for p in tgt_path])
        else:
            msgs = [dict(type='image', value=tgt_path)]
        msgs.append(dict(type='text', value=prompt))

        return msgs

    @classmethod
    def evaluate(cls, eval_file, **judge_kwargs):
        """Evaluation with LLM-as-a-judge for open-ended answers."""
        from .utils.mmoral_opg import MMOral_opg_acc, MMOral_opg_auxeval

        suffix = eval_file.split('.')[-1]
        # Some call sites may not explicitly set `judge_kwargs['model']`,
        # so we fall back to a default name for the judge model.
        judge_model_name = judge_kwargs.pop('model', 'mmoral-opg-judge')
        storage = eval_file.replace(f'.{suffix}', f'_{judge_model_name}.xlsx')
        tmp_file = eval_file.replace(f'.{suffix}', f'_{judge_model_name}.pkl')
        nproc = judge_kwargs.pop('nproc', 4)

        if not osp.exists(storage):
            data = load(eval_file)
            model = build_judge(model=judge_model_name, max_tokens=16384, **judge_kwargs)
            assert model.working(), (
                'MMOral-Open-ended evaluation requires a working OPENAI API\n'
                + DEBUG_MESSAGE
            )

            lt = len(data)
            lines = [data.iloc[i] for i in range(lt)]
            tups = [(model, line) for line in lines]
            indices = [line['index'] for line in lines]

            ans = load(tmp_file) if osp.exists(tmp_file) else {}
            tups = [x for x, i in zip(tups, indices) if i not in ans]
            indices = [i for i in indices if i not in ans]

            if len(indices):
                new_results = track_progress_rich(
                    MMOral_opg_auxeval,
                    tups,
                    nproc=nproc,
                    chunksize=nproc,
                    keys=indices,
                    save=tmp_file,
                )
                ans = load(tmp_file)
                for k, v in zip(indices, new_results):
                    assert k in ans
                    assert ans[k]['log'] == v['log'] and ans[k]['score'] == v['score']

            data['score'] = [ans[idx]['score'] for idx in data['index']]
            data['log'] = [ans[idx]['log'] for idx in data['index']]
            dump(data, storage)

        score, score_fine = MMOral_opg_acc(storage)
        score_pth = storage.replace('.xlsx', '_score.csv')
        score_fine_pth = storage.replace('.xlsx', '_score_fine.csv')
        dump(score, score_pth)
        dump(score_fine, score_fine_pth)
        return score
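evaluate() is resumable: per-sample judge outputs are cached in the _{model}.pkl file and the merged scores land in the _{model}.xlsx, so an interrupted run picks up where it left off. A hedged sketch of invoking it directly (the prediction path and the judge model name are illustrative; in practice the VLMEvalKit runner supplies judge_kwargs):

# Illustrative only: requires a working OpenAI-compatible judge endpoint,
# as asserted via build_judge(...).working().
scores = MMOral_OPG_OPEN.evaluate(
    'outputs/MyVLM/MyVLM_MMOral_OPG_OPEN.xlsx',  # hypothetical prediction file
    model='gpt-4o',                              # judge model name forwarded to build_judge
    nproc=4,                                     # parallel judge workers
)
print(scores)  # coarse per-capability table; *_score.csv and *_score_fine.csv are written alongside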
vlmeval/dataset/utils/mmoral_opg.py

Lines changed: 122 additions & 0 deletions
@@ -0,0 +1,122 @@
from collections import defaultdict

import pandas as pd

from ...smp.file import load

# for MMOral-OPG-Bench


def build_mmoral_opg_gpt4_prompt(line):
    question = line['question']
    gt = str(line['answer'])
    prediction = str(line['prediction'])
    # Keep this prompt readable and flake8-friendly (avoid overly long lines).
    prompt = """
Given the question, compare the ground truth and prediction from AI
models, to generate a correctness score for the prediction.
The correctness score is 0.0 (totally wrong), 0.1, 0.2, 0.3, 0.4, 0.5,
0.6, 0.7, 0.8, 0.9, or 1.0 (totally right).
Just complete the last space of the correctness score.

Question | Ground truth | Prediction | Correctness
--- | --- | --- | ---
How many teeth are visualized in the radiograph? | 30 teeth are visualized with clear anatomical
definition. | 30 | 1.0
How many teeth are visualized in the radiograph? | 30 teeth are visualized with clear anatomical
definition. | 29 teeth are visualized with clear anatomical definition. | 0.0
What is the status of the wisdom teeth in the radiograph? | Three
wisdom teeth are detected, all of which are impacted: #18, #28, and #48.
| #18: impacted, #28: impacted, #48: erupted | 0.7
What is the condition of the teeth #26 and #14? | Teeth #26 and #14
show signs of periapical abscesses. | Teeth #26 and #23 show signs
of periapical abscesses. | 0.5
What is the condition of the bone architecture and visible structures in
the jaw? | No apparent bone loss is observed. Bilateral mandibular
canals and maxillary sinuses are clearly visible. | Bilateral
mandibular canals and maxillary sinuses are clearly visible. | 0.5
What is the clinical priority concern regarding the periapical lesions?
| Periapical cysts at #11 and #12, and granuloma at #46 require
endodontic evaluation. | Periapical lesions at #11, #12, and #46
require endodontic evaluation. | 0.8
What radiographic features are visible in tooth #31 on the panoramic X-ray? | [\n
{\"Teeth position\": {\"point_2d\": [1242, 726]}},\n
{\"Crown\": {\"box_2d\": [1220, 637, 1266, 741]}}\n
] | Crown | 0.8
What radiographic features are visible in tooth #31 on the panoramic X-ray? | [\n
{\"Teeth position\": {\"point_2d\": [1242, 726]}},\n
{\"Crown\": {\"box_2d\": [1220, 637, 1266, 741]}}\n
] | Crown at position: [1230, 627, 1276, 750] | 0.9
What radiographic features are visible in tooth #31 on the panoramic X-ray? | [\n
{\"Teeth position\": {\"point_2d\": [1242, 726]}},\n
{\"Crown\": {\"box_2d\": [1220, 637, 1266, 741]}}\n
] | Teeth at position: {\"point_2d\": [1242, 726]}},\n
{Crown at position: {\"box_2d\": [1230, 627, 1276, 750]}} | 1.0
"""
    gpt4_prompt = prompt + '\n' + ' | '.join(
        [question, gt.replace('<AND>', ' <AND> ').replace('<OR>', ' <OR> '), prediction, ''])
    return gpt4_prompt


def MMOral_opg_auxeval(model, line):
    def float_cvt(s):
        try:
            return float(s)
        except ValueError:
            return None

    prompt = build_mmoral_opg_gpt4_prompt(line)
    log = ''
    retry = 5
    for i in range(retry):
        output = model.generate(prompt, temperature=i * 0.5)
        score = float_cvt(output)
        if score is None:
            log += f'Try {i}: output is {output}, failed to parse.\n'
        elif score < 0 or score > 1:
            log += f'Try {i}: output is {output}, invalid score: {score}.\n'
        else:
            log += 'Succeed'
            return dict(log=log, score=score)
    log += 'All 5 retries failed.\n'
    return dict(log=log, score=0.0)


def MMOral_opg_acc(result_file):
    data = load(result_file)
    tot = defaultdict(lambda: 0)
    score = defaultdict(lambda: 0)
    lt = len(data)
    # Coarse capability buckets, matched by substring against each row's category.
    cate_list = ['Teeth', 'Patho', 'HisT', 'Jaw', 'SumRec', 'Report']
    cate2_list = []
    for i in range(lt):
        item = data.iloc[i]
        cate = item['category']
        cate2 = cate.replace(',', '_')
        if cate2 not in cate2_list:
            cate2_list.append(cate2)
        grade = float(item['score'])
        for capa in cate_list:
            if capa in cate:
                tot[capa] += 1
                score[capa] += grade
        tot['Overall'] += 1
        tot[cate2] += 1
        score['Overall'] += grade
        score[cate2] += grade

    res = defaultdict(list)
    res2 = defaultdict(list)
    cate_list.append('Overall')
    cate2_list.append('Overall')
    for k in cate_list:
        res['Category'].append(k)
        res['tot'].append(tot[k])
        # Guard against buckets that never appear in the result file.
        res['acc'].append(score[k] / tot[k] * 100 if tot[k] else 0.0)
    for v in cate2_list:
        res2['Category'].append(v)
        res2['tot'].append(tot[v])
        res2['acc'].append(score[v] / tot[v] * 100 if tot[v] else 0.0)
    res = pd.DataFrame(res)
    res2 = pd.DataFrame(res2)
    return res, res2
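The three utilities compose as prompt -> judged score -> aggregation. A self-contained sketch with a stub judge (the StubJudge class and the sample row are invented for illustration; only the mmoral_opg functions come from this commit):

import pandas as pd

class StubJudge:
    """Stand-in for a build_judge model; always answers '0.8'."""

    def generate(self, prompt, temperature=0.0):
        return '0.8'

row = pd.Series({
    'question': 'How many teeth are visualized in the radiograph?',
    'answer': '30 teeth are visualized with clear anatomical definition.',
    'prediction': '30',
})
# The assembled prompt ends with "<question> | <ground truth> | <prediction> | "
# so the judge only has to fill in the final Correctness cell.
print(build_mmoral_opg_gpt4_prompt(row).splitlines()[-1])
print(MMOral_opg_auxeval(StubJudge(), row))  # {'log': 'Succeed', 'score': 0.8}

MMOral_opg_acc then reads the judged .xlsx and returns the coarse table (six capability buckets plus Overall) and the fine-grained per-category table that evaluate() dumps to *_score.csv and *_score_fine.csv.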
