Skip to content

Commit 655e65f

Browse files
authored
[Update] Update model settings for 2026.2 live leaderboard. (#1492)
* [Update] Update models for 2026.2 live leaderboard. * fix bug * fix setting
1 parent f765748 commit 655e65f

11 files changed

Lines changed: 444 additions & 28 deletions

File tree

vlmeval/api/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from .reka import Reka
88
from .glm_vision import GLMVisionAPI
99
from .cloudwalk import CWWrapper
10-
from .sensechat_vision import SenseChatVisionAPI
10+
from .sensechat_vision import SenseChatVisionAPI, SenseChatVisionV2API
1111
from .siliconflow import SiliconFlowAPI, TeleMMAPI
1212
from .telemm import TeleMM2_API
1313
from .telemm_thinking import TeleMM2Thinking_API
@@ -40,5 +40,5 @@
4040
'TaichuVLAPI', 'TaichuVLRAPI', 'DoubaoVL', "MUGUAPI", 'KimiVLAPIWrapper', 'KimiVLAPI',
4141
'RBdashMMChat3_API', 'RBdashChat3_5_API', 'RBdashMMChat3_78B_API', 'RBdashMMChat3_5_38B_API',
4242
'VideoChatOnlineV2API', 'TeleMM2_API', 'TeleMM2Thinking_API', 'TogetherAPI', 'GCPVertexAPI',
43-
'BedrockAPI'
43+
'BedrockAPI', 'SenseChatVisionV2API'
4444
]

vlmeval/api/gpt.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import os
33
import sys
44
from .base import BaseAPI
5+
import math
56

67
APIBASES = {
78
'OFFICIAL': 'https://api.openai.com/v1/chat/completions',
@@ -44,6 +45,7 @@ def __init__(self,
4445
api_base: str = None,
4546
max_tokens: int = 2048,
4647
img_size: int = -1,
48+
total_img_size: int = -1,
4749
img_detail: str = 'low',
4850
use_azure: bool = False,
4951
**kwargs):
@@ -109,6 +111,8 @@ def __init__(self,
109111
self.key = key
110112
assert img_size > 0 or img_size == -1
111113
self.img_size = img_size
114+
assert total_img_size > 0 or total_img_size == -1
115+
self.total_img_size = total_img_size
112116
assert img_detail in ['high', 'low']
113117
self.img_detail = img_detail
114118
self.timeout = timeout
@@ -160,6 +164,7 @@ def __init__(self,
160164
def prepare_itlist(self, inputs):
161165
assert np.all([isinstance(x, dict) for x in inputs])
162166
has_images = np.sum([x['type'] == 'image' for x in inputs])
167+
image_num = len([x['type'] == 'image' for x in inputs])
163168
if has_images:
164169
content_list = []
165170
for msg in inputs:
@@ -168,7 +173,13 @@ def prepare_itlist(self, inputs):
168173
elif msg['type'] == 'image':
169174
from PIL import Image
170175
img = Image.open(msg['value'])
171-
b64 = encode_image_to_base64(img, target_size=self.img_size)
176+
target_size = math.inf
177+
if self.img_size > 0:
178+
target_size = self.img_size
179+
if self.total_img_size > 0:
180+
target_size = min(target_size, int(self.img_size / (image_num**0.5)))
181+
target_size = -1 if math.isinf(target_size) else target_size
182+
b64 = encode_image_to_base64(img, target_size=target_size)
172183
img_struct = dict(url=f'data:image/jpeg;base64,{b64}', detail=self.img_detail)
173184
content_list.append(dict(type='image_url', image_url=img_struct))
174185
else:

vlmeval/api/sensechat_vision.py

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -305,3 +305,110 @@ def generate_inner(self, inputs, **kwargs) -> str:
305305
class SenseChatVisionAPI(SenseChatVisionWrapper):
306306
def generate(self, message, dataset=None):
307307
return super(SenseChatVisionAPI, self).generate(message, dataset=dataset)
308+
309+
310+
class SenseChatVisionV2API(BaseAPI):
    """OpenAI-compatible chat-completions client for the SenseNova v2 endpoint.

    Builds multimodal (text + base64-encoded JPEG image) messages and POSTs
    them to the SenseNova compatible-mode chat-completions URL, returning the
    first choice's message content.
    """

    is_api: bool = True

    def __init__(self,
                 model: str = 'SenseNova-V6-5-Pro-20251215',
                 retry: int = 5,
                 key: str = None,
                 verbose: bool = False,
                 system_prompt: str = None,
                 temperature: float = 0,
                 timeout: int = 300,
                 api_base: str = "https://api.sensenova.cn/compatible-mode/v2/chat/completions",
                 max_completion_tokens: int = 4096,
                 img_size: int = -1,
                 **kwargs):
        """
        Args:
            model: Model identifier sent in the request payload.
            retry: Retry count forwarded to BaseAPI.
            key: Bearer token used in the Authorization header.
            verbose: When True, log details of failed responses.
            system_prompt: Optional system message prepended to every request.
            temperature: Sampling temperature (stored; not currently included
                in the payload — TODO confirm whether the endpoint should
                receive it).
            timeout: Request timeout in seconds; a 10% margin is added on POST.
            api_base: Full URL of the chat-completions endpoint.
            max_completion_tokens: Stored completion-token budget (not
                currently included in the payload).
            img_size: Per-image resize target for encode_image_to_base64;
                -1 disables resizing. Must be positive or -1.
        """
        self.model = model
        self.fail_msg = 'Failed to obtain answer via API. '
        self.max_completion_tokens = max_completion_tokens
        self.temperature = temperature
        self.api_base = api_base
        self.key = key
        assert img_size > 0 or img_size == -1
        self.img_size = img_size
        self.timeout = timeout
        super().__init__(retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)

        # Never log the raw credential; show only a short suffix for debugging.
        masked_key = f'****{self.key[-4:]}' if isinstance(self.key, str) and len(self.key) >= 4 else 'N/A'
        self.logger.info(f'Using API Base: {self.api_base}; API Key: {masked_key}')

    def generate(self, message, dataset=None):
        # `dataset` is accepted for interface parity with sibling APIs but
        # intentionally unused.
        return super(SenseChatVisionV2API, self).generate(message)

    def prepare_itlist(self, inputs):
        """Convert a list of ``{'type', 'value'}`` dicts into API content parts.

        Text entries become ``{'type': 'text', ...}`` parts; image entries are
        opened with PIL and embedded as base64 data URLs. A pure-text input
        list is collapsed into a single joined text part.
        """
        import numpy as np
        from vlmeval.smp import encode_image_to_base64

        assert np.all([isinstance(x, dict) for x in inputs])
        # Count only the image messages. (A `len([...])` here would count
        # every message — the comprehension yields one boolean per input —
        # which over-shrinks images in mixed text/image prompts.)
        image_num = int(np.sum([x['type'] == 'image' for x in inputs]))
        if image_num:
            # Scale each image down by sqrt(image_num) so the combined pixel
            # budget stays near img_size**2; -1 keeps the sentinel meaning
            # "no resize" instead of producing a negative target.
            if self.img_size > 0:
                target_size = max(1, int(self.img_size / (image_num ** 0.5)))
            else:
                target_size = -1
            content_list = []
            for msg in inputs:
                if msg['type'] == 'text':
                    content_list.append(dict(type='text', text=msg['value']))
                elif msg['type'] == 'image':
                    from PIL import Image
                    img = Image.open(msg['value'])
                    b64 = encode_image_to_base64(img, target_size=target_size)
                    img_struct = dict(url=f'data:image/jpeg;base64,{b64}')
                    content_list.append(dict(type='image_url', image_url=img_struct))
        else:
            assert all([x['type'] == 'text' for x in inputs])
            text = '\n'.join([x['value'] for x in inputs])
            content_list = [dict(type='text', text=text)]
        return content_list

    def prepare_inputs(self, inputs):
        """Wrap raw inputs into role-tagged chat messages.

        Accepts either a flat list of ``{'type', 'value'}`` dicts (treated as
        a single user turn) or a list of ``{'role', 'content'}`` dicts for
        multi-turn conversations; the last turn must be from the user.
        """
        input_msgs = []
        if self.system_prompt is not None:
            input_msgs.append(dict(role='system', content=self.system_prompt))
        assert isinstance(inputs, list) and isinstance(inputs[0], dict)
        assert all(['type' in x for x in inputs]) or all(['role' in x for x in inputs]), inputs
        if 'role' in inputs[0]:
            assert inputs[-1]['role'] == 'user', inputs[-1]
            for item in inputs:
                input_msgs.append(dict(role=item['role'], content=self.prepare_itlist(item['content'])))
        else:
            input_msgs.append(dict(role='user', content=self.prepare_itlist(inputs)))
        return input_msgs

    def generate_inner(self, inputs, **kwargs) -> str:
        """POST one chat-completions request; return (ret_code, answer, response).

        ret_code is 0 on any 2xx status, otherwise the raw HTTP status code;
        answer falls back to ``self.fail_msg`` when the response cannot be
        parsed.
        """
        import json
        input_msgs = self.prepare_inputs(inputs)

        headers = {'Content-Type': 'application/json', 'Authorization': f'Bearer {self.key}'}

        payload = dict(model=self.model, messages=input_msgs, stream=False, **kwargs)

        # Honor standard proxy environment variables; pass None when unset so
        # requests falls back to its own defaults.
        proxies = {}
        if os.getenv('http_proxy'):
            proxies['http'] = os.getenv('http_proxy')
        if os.getenv('https_proxy'):
            proxies['https'] = os.getenv('https_proxy')
        proxies = proxies or None

        response = requests.post(
            self.api_base,
            headers=headers,
            data=json.dumps(payload),
            proxies=proxies,
            timeout=self.timeout * 1.1,  # small margin over the configured timeout
        )
        ret_code = response.status_code
        ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code
        answer = self.fail_msg
        try:
            resp_struct = json.loads(response.text)
            answer = resp_struct['choices'][0]['message']['content'].strip()
        except Exception as err:
            if self.verbose:
                self.logger.error(f'{type(err)}: {err}')
                self.logger.error(response.text if hasattr(response, 'text') else response)

        return ret_code, answer, response

0 commit comments

Comments
 (0)