1. Add a new operator

zhanglongbin · zhanglongbin · commit d713f1ac2e61 · 2025-08-17T18:01:32.000+08:00
2. Modify the image loading method
3. Modify the loading method of operator permissions
diff --git a/data_agents/utils/tools/load_samples.py b/data_agents/utils/tools/load_samples.py
@@ -57,7 +57,7 @@ def parse(path: str):
             with open(readme_path, encoding='utf-8') as stream:
                 content = stream.read()
         else:
-            with open(readme_path) as stream:  # 在 Linux 和 macOS 下不显式指定编码
+            with open(readme_path) as stream:
                 content = stream.read()
         plan.readme = content
 
diff --git a/data_engine/exporter/csghub_exporter.py b/data_engine/exporter/csghub_exporter.py
@@ -4,10 +4,8 @@
 from pycsghub.cmd.repo_types import RepoType
 from pycsghub.upload_large_folder.main import upload_large_folder_internal
 
-#from data_celery.mongo_tools.tools import insert_pipline_job_run_task_log_info
 from data_engine.exporter.base_exporter import Exporter
 import os
-import uuid
 import re
 from loguru import logger
 from pycsghub.repository import Repository
@@ -161,7 +159,7 @@ def _export_common(self):
             )
             r.upload()
             logger.info(f'Done push {self.upload_path} to repo: {self.repo_id} with branch: {self.output_branch_name}')
-            # insert_pipline_job_run_task_log_info(job_uid, f'Done push {self.upload_path} to repo: {self.repo_id} with branch: {self.output_branch_name}')
+            #insert_pipline_job_run_task_log_info(job_uid, f'Done push {self.upload_path} to repo: {self.repo_id} with branch: {self.output_branch_name}')
             if os.path.exists(self.repo_work_dir):
                 logger.info(f'Remove {self.repo_work_dir}')
                 shutil.rmtree(self.repo_work_dir)
diff --git a/data_engine/ops/mapper/extract_qa_mapper.py b/data_engine/ops/mapper/extract_qa_mapper.py
@@ -50,39 +50,7 @@ def __init__(self,
                  sampling_params: Dict = {'temperature': 0.3},
                  *args,
                  **kwargs):
-        """
-        Initialization method.
-        :param hf_model: Hugginface model id.
-        :param trust_remote_code: passed to transformers
-        :param pattern: regular expression pattern to search for within text.
-        :param qa_format: Output format of question and answer pair.
-        :param enable_vllm: Whether to use vllm for inference acceleration.
-        :param tensor_parallel_size: It is only valid when enable_vllm is True.
-            The number of GPUs to use for distributed execution with tensor
-            parallelism.
-        :param max_model_len: It is only valid when enable_vllm is True.
-            Model context length. If unspecified, will be automatically
-            derived from the model config.
-        :param max_num_seqs: It is only valid when enable_vllm is True.
-            Maximum number of sequences to be processed in a single iteration.
-        :param sampling_params: Sampling parameters for text generation.
-            e.g {'temperature': 0.9, 'top_p': 0.95}
-        :param args: extra args
-        :param kwargs: extra args
-
-        The default data format parsed by this interface is as follows:
-        Model Input:
-            蒙古国的首都是乌兰巴托（Ulaanbaatar）
-            冰岛的首都是雷克雅未克（Reykjavik）
-        Model Output:
-            蒙古国的首都是乌兰巴托（Ulaanbaatar）
-            冰岛的首都是雷克雅未克（Reykjavik）
-            Human: 请问蒙古国的首都是哪里？
-            Assistant: 你好，根据提供的信息，蒙古国的首都是乌兰巴托（Ulaanbaatar）。
-            Human: 冰岛的首都是哪里呢？
-            Assistant: 冰岛的首都是雷克雅未克（Reykjavik）。
-            ...
-        """
+
 
         super().__init__(*args, **kwargs)
         self.num_proc = 1
diff --git a/data_engine/ops/mapper/nlpcda_zh_mapper.py b/data_engine/ops/mapper/nlpcda_zh_mapper.py
@@ -30,45 +30,7 @@ def __init__(self,
                  replace_equivalent_num: bool = False,
                  *args,
                  **kwargs):
-        """
-        Initialization method. All augmentation methods use default parameters
-        in default. We recommend you to only use 1-3 augmentation methods at a
-        time. Otherwise, the semantics of samples might be changed
-        significantly. **Notice**: some augmentation method might not work for
-        some special texts, so there might be no augmented texts generated.
-
-        :param sequential: whether combine all augmentation methods to a
-            sequence. If it's True, a sample will be augmented by all opened
-            augmentation methods sequentially. If it's False, each opened
-            augmentation method would generate its augmented samples
-            independently.
-        :param aug_num: number of augmented samples to be generated. If
-            `sequential` is True, there will be total aug_num augmented samples
-            generated. If it's False, there will be (aug_num *
-            #opened_aug_method) augmented samples generated.
-        :param keep_original_sample: whether to keep the original sample. If
-            it's set to False, there will be only generated texts in the final
-            datasets and the original texts will be removed. It's True in
-            default.
-        :param replace_similar_word: whether to open the augmentation method of
-            replacing random words with their similar words in the original
-            texts. e.g. "这里一共有5种不同的数据增强方法" --> "这边一共有5种不同的数据增强方法"
-        :param replace_homophone_char: whether to open the augmentation method
-            of replacing random characters with their homophones in the
-            original texts. e.g. "这里一共有5种不同的数据增强方法" --> "这里一共有5种不同的濖据增强方法"
-        :param delete_random_char: whether to open the augmentation method of
-            deleting random characters from the original texts. e.g.
-            "这里一共有5种不同的数据增强方法" --> "这里一共有5种不同的数据增强"
-        :param swap_random_char: whether to open the augmentation method of
-            swapping random contiguous characters in the original texts. e.g.
-            "这里一共有5种不同的数据增强方法" --> "这里一共有5种不同的数据强增方法"
-        :param replace_equivalent_num: whether to open the augmentation method
-            of replacing random numbers with their equivalent representations
-            in the original texts. **Notice**: Only for numbers for now. e.g.
-            "这里一共有5种不同的数据增强方法" --> "这里一共有伍种不同的数据增强方法"
-        :param args: extra args
-        :param kwargs: extra args
-        """
+
         super().__init__(*args, **kwargs)
 
         self.aug_num = aug_num
diff --git a/data_engine/utils/constant.py b/data_engine/utils/constant.py
@@ -133,6 +133,7 @@ class StatsKeysConstant(object):
     word_rep_ratio = 'word_rep_ratio'
     bloom = 'bloom'
     high_score = 'high_score'
+    embedding = 'embedding'
 
     # image
     aspect_ratios = 'aspect_ratios'
diff --git a/data_server/api/api_router.py b/data_server/api/api_router.py
@@ -29,5 +29,6 @@
 api_router.include_router(operator_permission.router, prefix="/operator_permission", tags=["算子权限相关接口"])
 
 api_router.include_router(op_pic_upload.op_pic_router, prefix="/internal_api", tags=["文件上传接口"])
+api_router.include_router(op_pic_upload.image_getter_router, tags=["文件获取接口"])
 
 api_router.include_router(algo_templates.router, prefix="/algo_templates", tags=["算法模板相关接口"])
diff --git a/data_server/api/endpoints/algo_templates.py b/data_server/api/endpoints/algo_templates.py
@@ -31,7 +31,7 @@ class AlgoTemplateListResponse(BaseModel):
     page_size: int = Field(..., description="每页数量")
 
 
-@router.get("/", response_model=dict, summary="获取算法模板列表")
+@router.get("", response_model=dict, summary="获取算法模板列表")
 async def get_algo_templates(
     user_id: str = Header(..., alias="user_id", description="用户ID"),
     page: int = Query(1, ge=1, description="页码"),
@@ -101,7 +101,7 @@ async def get_algo_template_by_id(
         db.close()
 
 
-@router.post("/", response_model=dict, summary="创建新的算法模板")
+@router.post("", response_model=dict, summary="创建新的算法模板")
 async def create_algo_template(
     template_data: AlgoTemplateCreate,
     user_id: str = Header(..., alias="user_id", description="用户ID"),
diff --git a/data_server/api/endpoints/op_pic_upload.py b/data_server/api/endpoints/op_pic_upload.py
@@ -1,12 +1,16 @@
 from fastapi import APIRouter, UploadFile, File, HTTPException, status, Request
 from typing import Dict, Any
 import os
+import base64
+from pathlib import Path
 from loguru import logger
 from data_server.utils.file_storage import file_storage_manager
 from data_server.schemas.responses import response_success, response_fail
+from data_celery.utils import get_project_root
 
 
 op_pic_router = APIRouter()
+image_getter_router = APIRouter()
 
 
 @op_pic_router.post("/internal_api/upload", summary="上传operator图片")
@@ -78,3 +82,32 @@ async def delete_uploaded_file_by_name(filename: str) -> Dict[str, Any]:
     except Exception as e:
         logger.error(f"删除文件失败: {str(e)}")
         return response_fail(msg=f"删除文件失败: {str(e)}")
+
+
+@image_getter_router.get("/real_static_files/{category}/{filename}", summary="obtain_the_base64_encoding_of_the_image")
+async def get_image_base64(category: str, filename: str):
+    """Return the base64 encoding of ``attach/<category>/<filename>``.
+
+    Raises ``HTTPException`` (404) when the file is missing or when the
+    requested path resolves outside the ``attach`` directory.
+    """
+    try:
+        attach_root = (Path(get_project_root()) / 'attach').resolve()
+        image_path = (attach_root / category / filename).resolve()
+
+        # category/filename are client-supplied; a ".." segment must not be
+        # able to escape the attach directory and expose arbitrary files.
+        if attach_root not in image_path.parents or not image_path.is_file():
+            raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Image not found")
+
+        with open(image_path, "rb") as image_file:
+            base64_image = base64.b64encode(image_file.read()).decode("utf-8")
+        # NOTE(review): {base64_image} is a set literal, not a dict; kept as-is to preserve the payload shape.
+        return response_success(data={base64_image})
+
+    except HTTPException as http_exc:
+        logger.warning(f"failed-to-obtain-the-picture: {http_exc.detail}")
+        raise http_exc
+    except Exception as e:
+        logger.error(f"base64_encoding_failed: {str(e)}")
+        return response_fail(msg=f"failed_to_obtain_the_picture: {str(e)}")
diff --git a/data_server/api/endpoints/operator.py b/data_server/api/endpoints/operator.py
@@ -1,6 +1,9 @@
-from fastapi import FastAPI, APIRouter, Depends, HTTPException, Query, Path
+from fastapi import FastAPI, APIRouter, Depends, HTTPException, Query, Path,Header
 from sqlalchemy.orm import Session
 from typing import List, Dict, Any, Optional, Annotated
+import base64
+from pathlib import Path
+from data_celery.utils import get_project_root
 
 from data_server.database.session import get_sync_session
 
@@ -21,7 +24,7 @@
 
 
 
-@router.post("/", summary="create_operator")
+@router.post("", summary="create_operator")
 def create_operator_api(
     operator_data: OperatorCreateRequest,
     db: Session = Depends(get_sync_session)
@@ -36,7 +39,7 @@ def create_operator_api(
         db.close()
 
 
-@router.get("/", summary="GET_LIST_OF_OPERATORS")
+@router.get("", summary="GET_LIST_OF_OPERATORS")
 def read_operators_api(
     skip: int = 0,
     limit: int = 100,
@@ -45,7 +48,27 @@ def read_operators_api(
 
     try:
         operators = get_operators(db, skip, limit)
-        return response_success(data=operators, msg="获取算子列表成功")
+        operators_data = []
+        project_root = get_project_root()
+        for op in operators:
+            op_dict = op.__dict__
+            pic_base64 = None
+            mime_type = None
+            if op.icon:
+                try:
+                    filename = Path(op.icon).name
+                    image_path = project_root / 'attach' / 'operator' / filename
+                    if image_path.exists() and image_path.is_file():
+                        with open(image_path, "rb") as image_file:
+                            encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
+                        pic_base64 = encoded_string
+                except Exception:
+                    # Ignore errors for individual images
+                    pass
+            op_dict['pic_base64'] = pic_base64
+            operators_data.append(op_dict)
+
+        return response_success(data=operators_data, msg="获取算子列表成功")
     except Exception as e:
         return response_fail(msg=f"获取算子列表失败: {str(e)}")
     finally:
@@ -126,7 +149,7 @@ def get_operator_config_select_option_by_id_api(
         db.close()
 
 
-@router.post("/config_select_options/", summary="添加下拉框选项")
+@router.post("/config_select_options", summary="添加下拉框选项")
 def create_operator_config_select_option_api(
     option: OperatorConfigSelectOptionsCreate,
     db: Session = Depends(get_sync_session)
@@ -148,14 +171,30 @@ def get_operators_grouped_by_type_api(
 
     try:
         grouped_operators = get_operators_grouped_by_type(db)
+        project_root = get_project_root()
+        for group in grouped_operators:
+            for op in group['list']:
+                pic_base64 = None
+                icon = op.get('icon')
+                if icon:
+                    try:
+                        filename = Path(icon).name
+                        image_path = project_root / 'attach' / 'operator' / filename
+                        if image_path.exists() and image_path.is_file():
+                            with open(image_path, "rb") as image_file:
+                                encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
+                            pic_base64 = encoded_string
+                    except Exception:
+                        pass
+                op['pic_base64'] = pic_base64
         return response_success(data=grouped_operators, msg="获取分组算子列表成功")
     except Exception as e:
         return response_fail(msg=f"获取分组算子列表失败: {str(e)}")
     finally:
         db.close()
 
 # find_operator_by_uuid_orgs
-@router.get("/types/grouped-by-condition/", summary="根据算子分类和权限返回算子数据")
+@router.get("/types/grouped-by-condition", summary="根据算子分类和权限返回算子数据")
 def get_operators_grouped_by_condition_api(
     payload: Dict = Depends(get_validated_token_payload),
     db: Session = Depends(get_sync_session),
@@ -174,8 +213,28 @@ def get_operators_grouped_by_condition_api(
             return response_fail("Token中缺少用户信息 (uuid)")
 
         grouped_operators = get_operators_grouped_by_condition(db, user_id, paths)
+        project_root = get_project_root()
+        for group in grouped_operators:
+            for op in group['list']:
+                pic_base64 = None
+                icon = op.get('icon')
+                if icon:
+                    try:
+                        filename = Path(icon).name
+                        image_path = project_root / 'attach' / 'operator' / filename
+                        if image_path.exists() and image_path.is_file():
+                            with open(image_path, "rb") as image_file:
+                                encoded_string = base64.b64encode(image_file.read()).decode("utf-8")
+                            pic_base64 = encoded_string
+                    except Exception:
+                        pass
+                op['pic_base64'] = pic_base64
         return response_success(data=grouped_operators, msg="获取分组算子列表成功")
     except Exception as e:
         return response_fail(msg=f"获取分组算子列表失败: {str(e)}")
     finally:
         db.close()
+
+@router.get("/isAdmin/torf")  # NOTE(review): only echoes the request's "isadmin" header back; nothing is verified here
+def get_isAdmin_true_or_false(isadmin: str = Header(..., alias="isadmin", description="是否管理员")):
+    return response_success(data={"isadmin":isadmin})
diff --git a/data_server/api/endpoints/operator_permission.py b/data_server/api/endpoints/operator_permission.py
@@ -15,7 +15,7 @@
 
 
 
-@router.post("/", summary="创建算子权限")
+@router.post("", summary="创建算子权限")
 def create_permission_api(request_data: OperatorPermissionCreateRequest, db: Session = Depends(get_sync_session)):
 
     try:
@@ -102,7 +102,7 @@ def create_permission_api(request_data: OperatorPermissionCreateRequest, db: Ses
 
 
 
-@router.get("/", summary="获取权限列表")
+@router.get("", summary="获取权限列表")
 def read_permissions_api(skip: int = 0, limit: int = 100, db: Session = Depends(get_sync_session)):
 
     try:
diff --git a/data_server/database/initializer.py b/data_server/database/initializer.py
@@ -171,25 +171,17 @@ def fix_quoted_strings(values_part):
                 in_string = True
                 string_content = ""
             elif char == "'" and in_string:
-
+                # When encountering single quotes, check if they are escaped single quotes
                 if i + 1 < len(values_part) and values_part[i + 1] == "'":
-
-                    string_content += "''"
+                    # This is an escaped single quote (''); store it as one literal single quote
+                    string_content += "'"
                     i += 1
                 else:
-
+                    # End of the string literal
                     in_string = False
-
-                    if ("''" in string_content or 
-                        "href=" in string_content or 
-                        len(string_content) > 100 or
-                        any(c in string_content for c in ['<', '>', '"', '\\'])):
-
-                        dollar_tag = f"$tag{len(result)}$"
-                        current_token += f"{dollar_tag}{string_content}{dollar_tag}"
-                    else:
-
-                        current_token += f"'{string_content}'"
+                    # Always use single quotes and handle the internal single quote escape
+                    escaped_content = string_content.replace("'", "''")
+                    current_token += f"'{escaped_content}'"
                     string_content = ""
             elif in_string:
                 string_content += char
diff --git a/data_server/database/session.py b/data_server/database/session.py
diff --git a/data_server/logic/utils.py b/data_server/logic/utils.py
diff --git a/data_server/main.py b/data_server/main.py
diff --git a/data_server/operator/mapper/operator_mapper.py b/data_server/operator/mapper/operator_mapper.py
diff --git a/tests/ops/mapper/test_text_make_cosmopedia.py b/tests/ops/mapper/test_text_make_cosmopedia.py