Skip to content

Commit a636e4f

Browse files
inigopm and inakiLakunza committed
[Benchmark] Add support for MaXM.
Register MaXM and its language splits with hosted TSV metadata so the benchmark can be downloaded and evaluated through the standard VLMEvalKit dataset flow. Co-authored-by: Iñaki Lakunza <136484940+inakiLakunza@users.noreply.github.com> Made-with: Cursor
1 parent a3656d4 commit a636e4f

2 files changed

Lines changed: 168 additions & 0 deletions

File tree

vlmeval/dataset/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@
6868
from .m4bench import M4Bench
6969
from .macbench import MaCBench
7070
from .matbench import MATBench
71+
from .maxm import MaXM, MaXM_en, MaXM_fr, MaXM_hi, MaXM_iw, MaXM_ro, MaXM_th, MaXM_zh
7172
from .medqbench_caption import MedqbenchCaptionDataset
7273
from .medqbench_mcq import MedqbenchMCQDataset
7374
from .medqbench_paired_description import MedqbenchPairedDescriptionDataset
@@ -282,6 +283,7 @@ def evaluate(self, eval_file, **judge_kwargs):
282283
ZEROBench, SCAM, Omni3DBench, TallyQA, _3DSRBench, BMMR, AffordanceDataset,
283284
MMEReasoning, GOBenchDataset, SFE, ChartMimic, MMVMBench, XLRSBench,
284285
OmniEarthMCQBench, VisFactor, OSTDataset, OCRBench_v2, TreeBench, CVQA, M4Bench,
286+
MaXM, MaXM_en, MaXM_fr, MaXM_hi, MaXM_iw, MaXM_ro, MaXM_th, MaXM_zh,
285287
AyaVisionBench, TopViewRS, VLMBias, MMHELIX, MedqbenchMCQDataset, MathCanvas, MMReason,
286288
MedqbenchPairedDescriptionDataset, MedqbenchCaptionDataset, ChartMuseum, ChartQAPro, ReasonMap_Plus, # noqa: E501
287289
olmOCRBench, OceanOCRBench, MATBench, VLRMBench, RefCOCODataset, RefSpatialDataset,

vlmeval/dataset/maxm.py

Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
"""
2+
VLMEvalKit dataset class for MaXM.
3+
"""
4+
5+
import json
6+
import re
7+
import string
8+
9+
import pandas as pd
10+
11+
from ..smp import load
12+
from .image_base import ImageBaseDataset
13+
14+
# Language codes of the per-language MaXM splits.
LANGUAGES = ['en', 'fr', 'hi', 'iw', 'ro', 'th', 'zh']

# Base URL every TSV location below is built from.
HF_ROOT = 'https://huggingface.co/datasets/inigopm/vlmevalkit-maxm-tsv/resolve/main'

# Dataset name -> TSV URL: the combined 'MaXM' entry first, then one entry
# per language in LANGUAGES order.
DATASET_URL = {
    split: f'{HF_ROOT}/{split}.tsv'
    for split in ['MaXM', *(f'MaXM_{code}' for code in LANGUAGES)]
}

# Dataset name -> MD5 hex string (presumably the digest of the hosted TSV;
# verify against the download code in the base class).
DATASET_MD5 = dict(
    MaXM='edc625b8627bd2c2b2054c1c1598c7b6',
    MaXM_en='e9829f8289c9957b142f8daee68634e9',
    MaXM_fr='c940238668b9e6cb94797d3b64624890',
    MaXM_hi='e4c1fc7402ea0fa475c66437c5f8063c',
    MaXM_iw='1cf0d5ee2544c300ae620d891cf3f84a',
    MaXM_ro='351ab40f15968819df0ff4f4945160f2',
    MaXM_th='67850a5d069529875cf2ed776fc2e39e',
    MaXM_zh='909f9aba140b072d6ef7db2bdf3a38f4',
)
37+
38+
39+
def _normalise(text: str) -> str:
40+
text = str(text).lower().strip()
41+
text = text.translate(str.maketrans('', '', string.punctuation))
42+
text = re.sub(r'\s+', ' ', text).strip()
43+
return text
44+
45+
46+
def _parse_answer_list(raw) -> list[str]:
47+
"""Parse list-like answer fields stored as strings or JSON arrays."""
48+
if isinstance(raw, list):
49+
return [str(x) for x in raw]
50+
if isinstance(raw, str):
51+
if '|' in raw:
52+
return [part.strip() for part in raw.split('|') if part.strip()]
53+
matches = re.findall(r"'([^']*)'", raw)
54+
if matches:
55+
return matches
56+
try:
57+
parsed = json.loads(raw)
58+
if isinstance(parsed, list):
59+
return [str(x) for x in parsed]
60+
except Exception:
61+
pass
62+
return [raw]
63+
return [str(raw)]
64+
65+
66+
def _vqa_score(prediction: str, answers: list[str]) -> float:
    """Score *prediction* against *answers* as ``min(1, matches / 3)``.

    A match is a reference answer whose normalised form equals the
    normalised prediction; three or more matches give the full score of 1.0.
    """
    target = _normalise(prediction)
    hits = 0
    for reference in answers:
        if _normalise(reference) == target:
            hits += 1
    return min(1.0, hits / 3.0)
71+
72+
73+
def _avg_score(df: pd.DataFrame) -> float:
74+
if len(df) == 0:
75+
return 0.0
76+
return round(df['score'].mean() * 100, 2)
77+
78+
79+
class MaXMDataset(ImageBaseDataset):
    """VQA-type dataset for the MaXM benchmark and its language splits.

    Builds an image + short-answer prompt for each sample, and scores
    predictions with the soft rule implemented by ``_vqa_score``.
    """

    TYPE = 'VQA'
    DATASET_URL = DATASET_URL
    DATASET_MD5 = DATASET_MD5

    def build_prompt(self, line):
        """Build the message list for one sample.

        Args:
            line: A row of ``self.data``, or an integer position into it.

        Returns:
            A list of message dicts: one ``type='image'`` entry per path
            returned by ``self.dump_image``, followed by one ``type='text'``
            entry holding the question and the answer-format instruction.
        """
        if isinstance(line, int):
            line = self.data.iloc[line]

        # dump_image comes from the base class; it may return one path or a list.
        img_paths = self.dump_image(line)
        if not isinstance(img_paths, list):
            img_paths = [img_paths]

        question = str(line['question'])
        prompt = (
            f'{question}\n'
            'Answer the question using a single word or short phrase.'
        )

        msgs = [dict(type='image', value=p) for p in img_paths]
        msgs.append(dict(type='text', value=prompt))
        return msgs

    def _result_row(self, lang, frame):
        """Return one summary row (a dict) for the scored rows in *frame*."""
        return {
            'dataset': self.dataset_name,
            'lang': lang,
            'total': len(frame),
            'score_sum': round(frame['score'].sum(), 2),
            'accuracy (%)': _avg_score(frame),
        }

    def evaluate(self, eval_file, **judge_kwargs):
        """Score a prediction file and write a per-language summary CSV.

        Args:
            eval_file: Path of the prediction file; it is read with ``load``
                and must contain a ``prediction`` column plus either
                ``processed_answers`` or ``answers``.
            **judge_kwargs: Accepted for interface compatibility; not used.

        Returns:
            A DataFrame with one row per value of the ``category`` column
            (when that column exists) and a final ``overall`` row. The same
            table is written to ``<eval_file without extension>_MaXM_results.csv``
            and printed.
        """
        import os  # local import so the block needs no new file-level import

        data = load(eval_file)
        answer_col = 'processed_answers' if 'processed_answers' in data.columns else 'answers'

        # A plain comprehension instead of DataFrame.apply(axis=1): it also
        # works on an empty frame, where apply returns a DataFrame and the
        # column assignment would fail.
        data['score'] = [
            _vqa_score(pred, _parse_answer_list(refs))
            for pred, refs in zip(data['prediction'], data[answer_col])
        ]

        rows = []
        if 'category' in data.columns:
            for lang in sorted(data['category'].unique()):
                rows.append(self._result_row(lang, data[data['category'] == lang]))
        rows.append(self._result_row('overall', data))

        result_df = pd.DataFrame(rows)
        # Derive the output name from the stem rather than replacing '.xlsx':
        # for any other extension the replace was a no-op and the CSV would
        # overwrite the prediction file itself.
        stem, _ = os.path.splitext(eval_file)
        result_path = f'{stem}_MaXM_results.csv'
        result_df.to_csv(result_path, index=False)
        print(f'\nMaXM results -> {result_path}')
        print(result_df.to_string(index=False))
        return result_df
137+
138+
139+
def _make_lang_class(lang: str):
    """Create the ``MaXM_<lang>`` subclass of ``MaXMDataset``.

    The new class carries a one-entry ``DATASET_URL`` and ``DATASET_MD5``
    keyed by its own name, looked up in the module-level tables. A missing
    URL becomes ``''`` and a missing MD5 becomes ``None``.
    """
    cls_name = f'MaXM_{lang}'
    namespace = dict(
        __doc__=f'MaXM benchmark - language: {lang}',
        DATASET_URL={cls_name: DATASET_URL.get(cls_name, '')},
        DATASET_MD5={cls_name: DATASET_MD5.get(cls_name)},
    )
    return type(cls_name, (MaXMDataset,), namespace)
150+
151+
152+
# One dataset class per language split, created in the same order as before.
MaXM_en, MaXM_fr, MaXM_hi, MaXM_iw, MaXM_ro, MaXM_th, MaXM_zh = map(
    _make_lang_class,
    ('en', 'fr', 'hi', 'iw', 'ro', 'th', 'zh'),
)
159+
160+
161+
class MaXM(MaXMDataset):
    """MaXM dataset registered under the name ``'MaXM'`` only."""

    # The right-hand sides read the module-level tables; the class attribute
    # of the same name is not bound until the assignment completes.
    DATASET_URL = dict(MaXM=DATASET_URL.get('MaXM', ''))
    DATASET_MD5 = dict(MaXM=DATASET_MD5.get('MaXM'))
164+
165+
166+
# Names of all MaXM datasets: the combined one, then one per language code.
MAXM_DATASETS = ['MaXM', *(f'MaXM_{code}' for code in LANGUAGES)]

0 commit comments

Comments
 (0)