@@ -24,7 +24,17 @@ def is_float(val):
2424 return False
2525
2626
27- def csv2triples (csv , separator = '\\ t' , delimiter = '\\ n' ):
def extract_csv(text):
    """Return the CSV payload of *text*, stripping a markdown code fence if present.

    Prefers an explicit ```csv fence over a bare ``` fence; text without any
    fence is returned unchanged (whitespace-trimmed).
    """
    stripped = text.strip()
    # Try the more specific fence first so '```csv' blocks are not mis-split on '```'.
    for fence in ('```csv', '```'):
        if fence in stripped:
            stripped = stripped.split(fence)[1].split('```')[0]
            break
    return stripped.strip()
34+
35+
36+ def csv2triples (csv , separator = '\t ' , delimiter = '\n ' ):
37+ csv = extract_csv (csv )
2838 lines = csv .strip ().split (delimiter )
2939 if not lines :
3040 return []
@@ -206,31 +216,32 @@ def chartx_scrm_eval(predictions, references, easy=1):
206216 {
207217 "query" : "<question> What was the incremental increase in revenue from 2020 to 2021? "
208218 "<groundtruth answer> 5 million $ <answer> 20\n </s>" ,
209- "answer" : "False"
219+ "answer" : "Answer: False"
210220 },
211221 {
212222 "query" : "<question> What percentage of government spending was allocated to infrastructure in 2020? "
213223 "<groundtruth answer> 10% <answer> 14-4=10\n </s>" ,
214- "answer" : "True"
224+ "answer" : "Answer: True"
215225 },
216226 {
217227 "query" : "<question> What is the total production of Wind Energy in the four months from January to "
218228 "April 2021? <groundtruth answer> 2300 MW <answer> The total production of Wind Energy in the four "
219229 "months from January to April 2021 is 2450 MW." ,
220- "answer" : "True"
230+ "answer" : "Answer: True"
221231 },
222232 {
223233 "query" : "<question> What is the total of manufactured goods for UK and Germany combined? "
224234 "<groundtruth answer> 5 <answer> Five" ,
225- "answer" : "True"
235+ "answer" : "Answer: True"
226236 },
227237]
228238
229239QA_PREFIX = (
230240 "Given multiple question-answer pairs and the corresponding predictions, evaluate the correctness of "
231241 "predictions. The output should be only \" True\" or \" False\" . Note that if the groundtruth answer is a "
232242 "numeric value with/without the unit, impose 5% error tolerance to the answer, e.g., the answer of 95 is "
233- "marked as correct when groundtruth value is 100 million."
243+ "marked as correct when groundtruth value is 100 million.\n "
244+ "Please provide your evaluation result at the very end in the format: 'Answer: True' or 'Answer: False'."
234245)
235246QA_SUFFIX = """
236247 User: {query}
@@ -249,6 +260,7 @@ def chartx_scrm_eval(predictions, references, easy=1):
249260 5 points: Comprehensive, accurate description; excellent understanding with no errors; clear and
250261 detailed, perfect as a standalone explanation.
251262 Score the model's description on this scale, providing a single value without providing any reasons.
263+ Please provide your evaluation score at the very end in the format: 'Score: <score>'.
252264 """ ,
253265 'summary' : """
254266 You're an expert evaluating a model's summarization of a chart, based on its alignment with the
@@ -268,6 +280,7 @@ def chartx_scrm_eval(predictions, references, easy=1):
268280 aspects of the original text. It demonstrates excellent understanding, is error-free, clear, concise,
269281 well-structured, and serves as an excellent standalone representation of the original content.
270282 Score the model's summarization on this scale, providing a single value without providing any reasons.
283+ Please provide your evaluation score at the very end in the format: 'Score: <score>'.
271284 """ ,
272285 'redrawing' : """
273286 You're an expert evaluating a redrawing code of a chart, based on its alignment with the ground truth
@@ -286,6 +299,7 @@ def chartx_scrm_eval(predictions, references, easy=1):
286299 5 points: Comprehensive, Accurate Code; Excellent Understanding, No Errors. Perfectly replicates all
287300 details and data of the chart. Generated chart is indistinguishable from the original, flawless.
287300 Score the redrawing code on this scale, providing a single value without providing any reasons.
302+ Please provide your evaluation score at the very end in the format: 'Score: <score>'.
289303 """
290304}
291305
@@ -324,13 +338,18 @@ def ChartX_auxeval(model, item):
324338 response = model .generate (prompt )
325339
326340 score = 0
327- if 'True' in response :
328- score = 1
329- elif 'False' in response :
330- score = 0
331- else :
332- # simple fallback
333- score = 0
341+ try :
342+ hits = re .findall (r'Answer:\s*(True|False)' , response , re .IGNORECASE )
343+ if hits :
344+ score = 1 if hits [- 1 ].lower () == 'true' else 0
345+ else :
346+ # Fallback strictly to first occurrence if formatting failed
347+ response_lower = response .lower ()
348+ hits = re .findall (r'\b(true|false)\b' , response_lower )
349+ if hits :
350+ score = 1 if hits [0 ] == 'true' else 0
351+ except Exception :
352+ pass
334353
335354 return {'score' : score , 'log' : response }
336355
@@ -351,7 +370,7 @@ def ChartX_auxeval(model, item):
351370 metadata_str = item .get ('metadata' , '{}' )
352371 try :
353372 metadata = json .loads (metadata_str )
354- except BaseException :
373+ except Exception :
355374 metadata = {}
356375
357376 title = metadata .get ('title' , 'Unknown Title' )
@@ -377,11 +396,15 @@ def ChartX_auxeval(model, item):
377396 # Extract score 0-5
378397 score = 0
379398 try :
380- # find first digit 0-5
381- hits = re .findall (r'[0-5]' , response )
399+ hits = re .findall (r'Score:\s*([0-5])' , response , re .IGNORECASE )
382400 if hits :
383- score = int (hits [0 ])
384- except BaseException :
401+ score = int (hits [- 1 ])
402+ else :
403+ # Fallback
404+ hits = re .findall (r'[0-5]' , response )
405+ if hits :
406+ score = int (hits [- 1 ])
407+ except Exception :
385408 pass
386409
387410 return {'score' : score , 'log' : response }
0 commit comments