1+ import json
2+ import requests
13from typing import Dict
24
35from loguru import logger
46
5- from data_engine .utils .availability_utils import AvailabilityChecking
6- from data_engine .utils .model_utils import get_model , prepare_model
7-
8- from ..base_op import OPERATORS , UNFORKABLE , Mapper , Sample , Param , DataType
7+ from ..base_op import OPERATORS , Mapper , Sample , Param , DataType
98
# Prompt template sent to the LLM: given a code snippet, ask the model to
# write (in Chinese) the requirement prompt that would produce that code,
# answering in the form "prompt=...".
DEFAULT_PROMPT_TEMPLATE = """
为了输出下面代码片段,请生成对应prompt内容,该prompt应该用中文详细描述需求, 比如使用python实现什么功能。请回复:prompt=?
代码片段:
{input_data}
"""

# Default system prompt used when the caller does not supply one.
DEFAULT_SYSTEM_PROMPT = "You are a helpful assistant."

# Registry key under which this operator is registered.
OP_NAME = 'generate_code_qa_pair_mapper'
2318
2419
@OPERATORS.register_module(OP_NAME)
class GenerateCodeQAPairMapper(Mapper):
    """
    Mapper to generate code QA pairs using a remote LLM API.

    For each sample's code snippet it asks an OpenAI-compatible
    chat-completion endpoint (Qwen, DeepSeek, GPT, etc.) to produce a
    requirement-description prompt, then emits an
    ``{'input': generated_prompt, 'response': code}`` pair.
    """
    _accelerator = 'cpu'

    def __init__(self,
                 model_url: str = 'https://api.deepseek.com/chat/completions',
                 model_name: str = 'deepseek-chat',
                 auth_token: str = '',
                 system_prompt: str = None,
                 sampling_params: Dict = None,
                 *args,
                 timeout: int = 120,
                 **kwargs):
        """
        Initialization method.

        :param model_url: API endpoint URL (OpenAI-compatible format).
        :param model_name: Model name to use.
        :param auth_token: API authentication token.
        :param system_prompt: System prompt for the model; defaults to
            ``DEFAULT_SYSTEM_PROMPT`` when ``None``.
        :param sampling_params: Sampling parameters for text generation,
            e.g. ``{'temperature': 0.9, 'top_p': 0.95}``. Defaults to
            ``{'temperature': 0.2, 'top_k': 10, 'top_p': 0.95}``.
        :param timeout: Per-request timeout in seconds (keyword-only;
            previously hard-coded to 120).
        :param args: extra args
        :param kwargs: extra kwargs
        """
        super().__init__(*args, **kwargs)
        # Remote HTTP calls here are done sequentially; keep one worker.
        self.num_proc = 1

        self.model_url = model_url
        self.model_name = model_name
        self.auth_token = auth_token
        self.timeout = timeout

        # Fail fast on misconfiguration instead of erroring on first call.
        if not self.model_url:
            raise ValueError("model_url is required")
        if not self.auth_token:
            raise ValueError("auth_token is required")

        if system_prompt is None:
            system_prompt = DEFAULT_SYSTEM_PROMPT
        self.system_prompt = system_prompt

        # Build the default dict per instance to avoid a shared mutable
        # default argument.
        if sampling_params is None:
            sampling_params = {'temperature': 0.2, 'top_k': 10, 'top_p': 0.95}
        self.sampling_params = sampling_params

    def build_prompt(self, code_snippet):
        """Fill ``code_snippet`` into the QA-generation prompt template."""
        return DEFAULT_PROMPT_TEMPLATE.format(input_data=code_snippet)

    def process(self, sample=None, rank=None):
        """
        Generate a QA pair for one sample via the remote chat API.

        On success returns ``{text_key: {'input': generated_prompt,
        'response': code}}``; on any failure (HTTP error, malformed
        response, unexpected exception) the original ``sample`` is
        returned unchanged so the pipeline keeps running.

        :param sample: input sample dict containing ``self.text_key``.
        :param rank: unused; kept for interface compatibility.
        """
        try:
            data = sample[self.text_key]
            input_prompt = self.build_prompt(data)

            messages = [
                {
                    "role": "system",
                    "content": self.system_prompt
                },
                {
                    "role": "user",
                    "content": input_prompt
                }
            ]

            headers = {
                'Authorization': f'Bearer {self.auth_token}',
                'Content-Type': 'application/json'
            }

            request_data = {
                "model": self.model_name,
                "messages": messages,
                "stream": False,
            }
            # Merge user-supplied sampling parameters into the request body.
            if self.sampling_params:
                request_data.update(self.sampling_params)

            logger.info(
                f'Calling API: {self.model_url}, Model: {self.model_name}')
            logger.debug(f'input_prompt is: {input_prompt}')

            response = requests.post(
                url=self.model_url,
                headers=headers,
                json=request_data,
                timeout=self.timeout
            )
            response.raise_for_status()

            result = response.json()

            if 'choices' not in result:
                logger.error(f'API response missing "choices" field: {result}')
                return sample

            response_str = result['choices'][0]['message']['content']

            logger.debug(f'response_str is: {response_str}')

            # The model is asked to answer as "prompt=<text>"; strip the tag.
            generated_prompt = response_str.replace('prompt=', '').strip()

            message_list = {
                self.text_key: {
                    'input': generated_prompt,
                    'response': data
                }
            }

            return message_list

        except requests.exceptions.RequestException as e:
            logger.error(f'HTTP request error: {e}')
            logger.warning('API call failed, returning original sample')
        except (KeyError, IndexError, json.JSONDecodeError) as e:
            logger.error(f'API response parsing error: {e}')
            logger.warning('Response parsing failed, returning original sample')
        except Exception as e:
            # Broad boundary catch: this operator must never kill the
            # pipeline, so log and fall through to the original sample.
            logger.error(f'Unexpected error: {e}')
            logger.warning('Exception occurred, returning original sample')

        # Reached only on failure.
        return sample

    # NOTE(review): chained @classmethod + @property is deprecated in
    # Python 3.11 and removed in 3.13 — confirm the target runtime, or
    # migrate these to plain classmethods if the framework allows.
    @classmethod
    @property
    def description(cls):
        """One-line operator description shown in the operator registry."""
        return """Code QA pair generator: Generate requirement description prompts from code snippets. Supports OpenAI-compatible APIs including Qwen, DeepSeek, GPT, etc."""

    @classmethod
    @property
    def sample(cls):
        """Example (input, output) pair illustrating this operator."""
        return Sample(
            'def hello_world():\n    print("Hello, World!")\nhello_world()',
            'message:[{"input": "Write a Python function named hello_world that prints Hello, World! and call it", "response": "def hello_world():\\n    print(\\"Hello, World!\\")\\nhello_world()" }]'
        )

    @classmethod
    @property
    def init_params(cls):
        """Configurable constructor parameters exposed to the UI/config."""
        return [
            Param("model_url", DataType.STRING, {
                "https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions": "Qwen API",
                "https://api.deepseek.com/chat/completions": "DeepSeek API",
                "https://api.openai.com/v1/chat/completions": "OpenAI API",
            }, "https://api.deepseek.com/chat/completions"),
            Param("model_name", DataType.STRING, {
                "qwen-plus": "qwen-plus",
                "qwen-max": "qwen-max",
                "deepseek-chat": "deepseek-chat",
                "deepseek-reasoner": "deepseek-reasoner",
                "gpt-4": "gpt-4",
                "gpt-3.5-turbo": "gpt-3.5-turbo",
            }, "deepseek-chat"),
            Param("auth_token", DataType.STRING, {}, ""),
        ]
0 commit comments