
Commit 7d6d81f

Authored by JoeLeelyf, kennymckormick, and mzr1996
[Benchmark] Support SArena Benchmark (#1371)
* add-support-for-SArena
* Merge SArena_MINI into SArena class and update implementation.
* Fix lint

Co-authored-by: Haodong Duan <dhd@pku.edu.cn>
Co-authored-by: mzr1996 <mzr1996@163.com>
1 parent 655e65f commit 7d6d81f

23 files changed: 2979 additions & 692 deletions

vlmeval/dataset/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -139,7 +139,7 @@
 from .gsm8k_v import GSM8KVDataset
 from .macbench import MaCBench
 from .mmesci import MMESCIDataset
-from .sarena_mini import SArena_MINI
+from .sarena import SArena
 from .uni_svg import UniSVG
 from .vladbench import VLADBench
 from .design2code import Design2Code

@@ -282,7 +282,7 @@ def evaluate(self, eval_file, **judge_kwargs):
     MedqbenchPairedDescriptionDataset, MedqbenchCaptionDataset, ChartMuseum, ChartQAPro, ReasonMap_Plus,
     olmOCRBench, OceanOCRBench, MATBench, VLRMBench, RefCOCODataset, RefSpatialDataset,
     ERQADataset, SimpleVQA, HiPhODataset, MaCBench,
-    UniSVG, SArena_MINI, VLMsAreBiased, MMESCIDataset, CoreCognition, GroundingME,
+    UniSVG, SArena, VLMsAreBiased, MMESCIDataset, CoreCognition, GroundingME,
     FoxBench, VTCBench, Asclepius, PlotQA, ChartX, ChartBench, ChartCapDataset, WorldVQA, PuzzleVQA, VisualPuzzles,
     Design2Code, VLADBench, SSIBenchDataset, NPMM, SGI_Bench_Experimental_Reasoning
 ]
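
This registration is all VLMEvalKit needs to expose the benchmark by name. As a rough sketch of what that enables (assuming the repo's usual build_dataset helper; the calls below are illustrative, not part of this diff):

    # Hypothetical usage sketch, not code from this commit.
    from vlmeval.dataset import build_dataset

    dataset = build_dataset('SArena')    # full split
    mini = build_dataset('SArena_MINI')  # both names resolve to the SArena class
    print(len(dataset), len(mini))       # records loaded from the TSVs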
vlmeval/dataset/sarena.py

Lines changed: 5 additions & 4 deletions

@@ -1,18 +1,19 @@
-import ast
 from .image_base import ImageBaseDataset
+from .utils.sarena import evaluate_sarena
 from ..smp import *
-from .utils.sarena_mini import evaluate_sarena_mini


-class SArena_MINI(ImageBaseDataset):
+class SArena(ImageBaseDataset):

     TYPE = "VQA"

     DATASET_URL = {
+        "SArena": "https://huggingface.co/datasets/JoeLeelyf/SArena-VLMEvalKit/resolve/main/SArena.tsv",
         "SArena_MINI": "https://huggingface.co/datasets/JoeLeelyf/SArena-VLMEvalKit/resolve/main/SArena_MINI.tsv"
     }

     DATASET_MD5 = {
+        "SArena": "2a747c13c063a6c9839c66611b61526c",
         "SArena_MINI": "c87fa82819a5fce652df40f6332266ff"
     }

@@ -44,4 +45,4 @@ def build_prompt(self, line):
         return msgs

     def evaluate(self, eval_file, **judge_kwargs):
-        return evaluate_sarena_mini(eval_file)
+        return evaluate_sarena(eval_file, dataset=self.dataset)
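
Merging the two splits into one class keeps the split name on self.dataset, so the shared evaluator can branch on it. A minimal sketch of the resulting dispatch (the prediction filename below is illustrative):

    # Hypothetical usage sketch; ImageBaseDataset takes the split name.
    ds = SArena('SArena_MINI')                      # or SArena('SArena')
    scores = ds.evaluate('GPT4o_SArena_MINI.xlsx')  # delegates to evaluate_sarena(..., dataset='SArena_MINI')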

vlmeval/dataset/utils/SArena/CLIP_Score.py

Lines changed: 5 additions & 3 deletions
@@ -1,11 +1,13 @@
+from typing import Literal
+
 import torch
-from tqdm import tqdm
 from torch.utils.data import DataLoader
-from torchmetrics.multimodal.clip_score import CLIPScore
 from torchmetrics.functional.multimodal.clip_score import _clip_score_update
+from torchmetrics.multimodal.clip_score import CLIPScore
 from torchvision.transforms import ToTensor
+from tqdm import tqdm
+
 from .base_metric import BaseMetric
-from typing import Literal


 class CLIPScoreCalculator(BaseMetric):
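
For context on what CLIPScoreCalculator wraps: torchmetrics' CLIPScore embeds an image and a caption with CLIP and reports their scaled cosine similarity. A minimal sketch using the library's documented default checkpoint (the image tensor and caption are placeholders):

    import torch
    from torchmetrics.multimodal.clip_score import CLIPScore

    metric = CLIPScore(model_name_or_path="openai/clip-vit-base-patch16")
    image = torch.randint(0, 255, (3, 224, 224), dtype=torch.uint8)  # dummy image
    score = metric(image, "an SVG icon of a bicycle")                # placeholder caption
    print(float(score))  # similarity scaled to roughly [0, 100]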

vlmeval/dataset/utils/SArena/DINO_Score.py

Lines changed: 12 additions & 8 deletions
@@ -16,15 +16,19 @@ def __init__(self):
         self.metric = self.calculate_DINOv2_similarity_score

     def get_DINOv2_model(self, model_size):
-        if model_size == "small":
-            model_size = "facebook/dinov2-small"
-        elif model_size == "base":
-            model_size = "facebook/dinov2-base"
-        elif model_size == "large":
-            model_size = "facebook/dinov2-large"
-        else:
+        model_map = {
+            "small": "facebook/dinov2-small",
+            "base": "facebook/dinov2-base",
+            "large": "facebook/dinov2-large",
+        }
+        name = model_map.get(model_size)
+        if not name:
             raise ValueError(f"model_size should be either 'small', 'base' or 'large', got {model_size}")
-        return AutoModel.from_pretrained(model_size), AutoImageProcessor.from_pretrained(model_size)
+
+        model = AutoModel.from_pretrained(name)
+        processor = AutoImageProcessor.from_pretrained(name)
+
+        return model, processor

     def process_input(self, image, processor):
         if isinstance(image, str):
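
get_DINOv2_model now returns a (model, processor) pair via a lookup table instead of an if/elif chain. For reference, a common way such a pair is used for image similarity is comparing CLS-token embeddings; this is a hedged sketch, not necessarily the exact computation in calculate_DINOv2_similarity_score:

    import torch
    from PIL import Image
    from transformers import AutoImageProcessor, AutoModel

    processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
    model = AutoModel.from_pretrained("facebook/dinov2-base")

    def embed(img):
        inputs = processor(images=img, return_tensors="pt")
        with torch.no_grad():
            out = model(**inputs)
        return out.last_hidden_state[:, 0]  # CLS token embedding

    # 'a.png' / 'b.png' are placeholder paths
    sim = torch.nn.functional.cosine_similarity(
        embed(Image.open("a.png")), embed(Image.open("b.png"))
    )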

vlmeval/dataset/utils/SArena/LPIPS.py

Lines changed: 25 additions & 1 deletion
@@ -1,18 +1,42 @@
+import os
+import shutil
 import torch
 import lpips

 from tqdm import tqdm
+from vlmeval.smp.file import LMUDataRoot
 from torch.utils.data import DataLoader
 from torchvision.transforms import ToTensor, Normalize
 from .base_metric import BaseMetric


+def get_lpips_vgg_model(device):
+    """Load LPIPS VGG model, downloading to aux_models if needed."""
+    vgg_path = os.path.join(LMUDataRoot(), 'aux_models', 'vgg.pth')
+
+    if os.path.exists(vgg_path):
+        return lpips.LPIPS(net='vgg', model_path=vgg_path).to(device)
+
+    # Download model (lpips uses torch hub cache)
+    model = lpips.LPIPS(net='vgg').to(device)
+
+    # Copy from torch hub cache to aux_models for future offline use
+    aux_models_dir = os.path.dirname(vgg_path)
+    os.makedirs(aux_models_dir, exist_ok=True)
+
+    cache_path = os.path.expanduser('~/.cache/torch/hub/checkpoints/vgg_net_g.pth')
+    if os.path.exists(cache_path):
+        shutil.copy(cache_path, vgg_path)
+
+    return model
+
+
 class LPIPSCalculator(BaseMetric):
     def __init__(self):
         super().__init__()
         self.class_name = self.__class__.__name__
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        self.model = lpips.LPIPS(net='vgg').to(self.device)
+        self.model = get_lpips_vgg_model(self.device)
         self.metric = self.LPIPS
         self.to_tensor = ToTensor()
         self.normalize = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
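
The new helper only changes where the VGG weights come from; scoring is unchanged. For reference, the lpips package's documented usage expects image tensors scaled to [-1, 1]; a minimal sketch with dummy inputs:

    import torch
    import lpips

    loss_fn = lpips.LPIPS(net='vgg')
    img0 = torch.rand(1, 3, 256, 256) * 2 - 1  # dummy images in [-1, 1]
    img1 = torch.rand(1, 3, 256, 256) * 2 - 1
    d = loss_fn(img0, img1)  # lower = more perceptually similar
    print(float(d))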

vlmeval/dataset/utils/SArena/inception.py

Lines changed: 12 additions & 1 deletion
@@ -1,9 +1,11 @@
+import os
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 import torchvision

 from torch.hub import load_state_dict_from_url
+from vlmeval.smp.file import LMUDataRoot

 # Inception weights ported to Pytorch from
 # http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz

@@ -213,7 +215,16 @@ def fid_inception_v3():
     inception.Mixed_7b = FIDInceptionE_1(1280)
     inception.Mixed_7c = FIDInceptionE_2(2048)

-    state_dict = load_state_dict_from_url(FID_WEIGHTS_URL, progress=True)
+    local_path = os.path.join(LMUDataRoot(), 'aux_models', 'pt_inception-2015-12-05-6726825d.pth')
+    if os.path.exists(local_path):
+        state_dict = torch.load(local_path, map_location='cpu', weights_only=True)
+    else:
+        # Ensure directory exists
+        os.makedirs(os.path.dirname(local_path), exist_ok=True)
+        # Download to aux_models directory
+        state_dict = load_state_dict_from_url(
+            FID_WEIGHTS_URL, progress=True, model_dir=os.path.dirname(local_path)
+        )
     inception.load_state_dict(state_dict)
     return inception
vlmeval/dataset/utils/SArena/metrics.py

Lines changed: 3 additions & 2 deletions
@@ -1,3 +1,4 @@
+import math
 from dataclasses import dataclass
 from typing import Dict, Callable

@@ -57,11 +58,11 @@ def calculate_metrics(self, batch):
             print(f"Calculating {metric_name}...")
             if metric_name in ['FID', 'FID-C']:
                 avg_result = metric.calculate_score(batch)
-                if avg_result is not float("nan"):
+                if not math.isnan(avg_result):
                     avg_results_dict[metric_name] = avg_result
             else:
                 avg_result, values = metric.calculate_score(batch)
-                if avg_result is not float("nan"):
+                if not math.isnan(avg_result):
                     avg_results_dict[metric_name] = avg_result

         return avg_results_dict
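
The original guard was a real bug: "is not" compares object identity, and float("nan") builds a fresh object on every call, so the old check was always True and never filtered anything (NaN also compares unequal to itself, so == would not work either). math.isnan is the correct test:

    import math

    x = float("nan")
    print(x is not float("nan"))  # True  -- two distinct float objects
    print(x == float("nan"))      # False -- NaN is unequal to everything, even NaN
    print(not math.isnan(x))      # False -- the check the code actually needs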
Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
+av
+cairosvg
+cd-fvd
+evaluate==0.4.3
+ftfy
+hpsv2x
+lpips
+matplotlib
+moviepy
+nest_asyncio
+pyppeteer
+regex
+rich==13.9.4
+scikit-image
+svgpathtools==1.6.1
+timm==1.0.15
+torchmetrics
+vtracer

vlmeval/dataset/utils/SArena/token_length.py

Lines changed: 5 additions & 3 deletions
@@ -1,11 +1,13 @@
-import torch
+import os

-from tqdm import tqdm
+import torch
 from torch.utils.data import DataLoader
+from tqdm import tqdm
 from transformers import AutoTokenizer

-from .base_metric import BaseMetric
+from vlmeval.smp.file import LMUDataRoot
 from .average_meter import AverageMeter
+from .base_metric import BaseMetric


 class TokenLengthCalculator(BaseMetric):
