Skip to content

Commit f67e576

Browse files
authored
[Fix] Fix ChartX Evaluation Robustness. (#1489)
* Fix ChartX evaluation robustness
* Change BaseException to Exception
1 parent 6e7e372 commit f67e576

1 file changed

Lines changed: 41 additions & 18 deletions

File tree

vlmeval/dataset/utils/chartx_eval.py

Lines changed: 41 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,17 @@ def is_float(val):
2424
return False
2525

2626

27-
def csv2triples(csv, separator='\\t', delimiter='\\n'):
27+
def extract_csv(text):
    """Strip a markdown code fence from *text* and return the inner CSV.

    Handles both a language-tagged fence (```csv ... ```) and a plain
    fence (``` ... ```); text without any fence is returned stripped
    unchanged.
    """
    cleaned = text.strip()
    # Prefer the explicit ```csv fence; fall back to a generic ``` fence.
    for fence in ('```csv', '```'):
        if fence in cleaned:
            cleaned = cleaned.split(fence)[1].split('```')[0]
            break
    return cleaned.strip()
34+
35+
36+
def csv2triples(csv, separator='\t', delimiter='\n'):
37+
csv = extract_csv(csv)
2838
lines = csv.strip().split(delimiter)
2939
if not lines:
3040
return []
@@ -206,31 +216,32 @@ def chartx_scrm_eval(predictions, references, easy=1):
206216
{
207217
"query": "<question> What was the incremental increase in revenue from 2020 to 2021? "
208218
"<groundtruth answer> 5 million $ <answer> 20\n</s>",
209-
"answer": "False"
219+
"answer": "Answer: False"
210220
},
211221
{
212222
"query": "<question> What percentage of government spending was allocated to infrastructure in 2020? "
213223
"<groundtruth answer> 10% <answer> 14-4=10\n</s>",
214-
"answer": "True"
224+
"answer": "Answer: True"
215225
},
216226
{
217227
"query": "<question> What is the total production of Wind Energy in the four months from January to "
218228
"April 2021? <groundtruth answer> 2300 MW <answer> The total production of Wind Energy in the four "
219229
"months from January to April 2021 is 2450 MW.",
220-
"answer": "True"
230+
"answer": "Answer: True"
221231
},
222232
{
223233
"query": "<question> What is the total of manufactured goods for UK and Germany combined? "
224234
"<groundtruth answer> 5 <answer> Five",
225-
"answer": "True"
235+
"answer": "Answer: True"
226236
},
227237
]
228238

229239
QA_PREFIX = (
230240
"Given multiple question-answer pairs and the corresponding predictions, evaluate the correctness of "
231241
"predictions. The output should be only \"True\" or \"False\". Note that if the groundtruth answer is a "
232242
"numeric value with/without the unit, impose 5% error tolerance to the answer, e.g., the answer of 95 is "
233-
"marked as correct when groundtruth value is 100 million."
243+
"marked as correct when groundtruth value is 100 million.\n"
244+
"Please provide your evaluation result at the very end in the format: 'Answer: True' or 'Answer: False'."
234245
)
235246
QA_SUFFIX = """
236247
User: {query}
@@ -249,6 +260,7 @@ def chartx_scrm_eval(predictions, references, easy=1):
249260
5 points: Comprehensive, accurate description; excellent understanding with no errors; clear and
250261
detailed, perfect as a standalone explanation.
251262
Score the model's description on this scale, providing a single value without providing any reasons.
263+
Please provide your evaluation score at the very end in the format: 'Score: <score>'.
252264
""",
253265
'summary': """
254266
You're an expert evaluating a model's summarization of a chart, based on its alignment with the
@@ -268,6 +280,7 @@ def chartx_scrm_eval(predictions, references, easy=1):
268280
aspects of the original text. It demonstrates excellent understanding, is error-free, clear, concise,
269281
well-structured, and serves as an excellent standalone representation of the original content.
270282
Score the model's summarization on this scale, providing a single value without providing any reasons.
283+
Please provide your evaluation score at the very end in the format: 'Score: <score>'.
271284
""",
272285
'redrawing': """
273286
You're an expert evaluating a redrawing code of a chart, based on its alignment with the ground truth
@@ -286,6 +299,7 @@ def chartx_scrm_eval(predictions, references, easy=1):
286299
5 points: Comprehensive, Accurate Code; Excellent Understanding, No Errors. Perfectly replicates all
287300
details and data of the chart. Generated chart is indistinguishable from the original, flawless.
288301
Score the redrawing code on this scale, providing a single value without providing ant reasons.
302+
Please provide your evaluation score at the very end in the format: 'Score: <score>'.
289303
"""
290304
}
291305

@@ -324,13 +338,18 @@ def ChartX_auxeval(model, item):
324338
response = model.generate(prompt)
325339

326340
score = 0
327-
if 'True' in response:
328-
score = 1
329-
elif 'False' in response:
330-
score = 0
331-
else:
332-
# simple fallback
333-
score = 0
341+
try:
342+
hits = re.findall(r'Answer:\s*(True|False)', response, re.IGNORECASE)
343+
if hits:
344+
score = 1 if hits[-1].lower() == 'true' else 0
345+
else:
346+
# Fallback strictly to first occurrence if formatting failed
347+
response_lower = response.lower()
348+
hits = re.findall(r'\b(true|false)\b', response_lower)
349+
if hits:
350+
score = 1 if hits[0] == 'true' else 0
351+
except Exception:
352+
pass
334353

335354
return {'score': score, 'log': response}
336355

@@ -351,7 +370,7 @@ def ChartX_auxeval(model, item):
351370
metadata_str = item.get('metadata', '{}')
352371
try:
353372
metadata = json.loads(metadata_str)
354-
except BaseException:
373+
except Exception:
355374
metadata = {}
356375

357376
title = metadata.get('title', 'Unknown Title')
@@ -377,11 +396,15 @@ def ChartX_auxeval(model, item):
377396
# Extract score 0-5
378397
score = 0
379398
try:
380-
# find first digit 0-5
381-
hits = re.findall(r'[0-5]', response)
399+
hits = re.findall(r'Score:\s*([0-5])', response, re.IGNORECASE)
382400
if hits:
383-
score = int(hits[0])
384-
except BaseException:
401+
score = int(hits[-1])
402+
else:
403+
# Fallback
404+
hits = re.findall(r'[0-5]', response)
405+
if hits:
406+
score = int(hits[-1])
407+
except Exception:
385408
pass
386409

387410
return {'score': score, 'log': response}

0 commit comments

Comments (0)