
Commit 325d59a

Merge pull request #16 from z275748353/main
Add newly developed operators, internationalize tools
2 parents 9db0688 + ae70b5b commit 325d59a

34 files changed: 1357 additions & 624 deletions

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
+name: 'Assess and score educational value'
+description: 'Use a large model to assess the educational value of a portion of the text and score it, then filter samples by score range.'
+type: 'data_refine'
+buildin: true
+project_name: 'dataflow-demo-process'
+dataset_path: '/path/to/your/dataset'
+export_path: '/path/to/your/dataset.jsonl'
+np: 1
+open_tracer: false
+trace_num: 3
+process:
+  - annotate_edu_train_bert_scorer_mapper:
+      auth_token: ''
+      model_url: 'https://dashscope.aliyuncs.com/compatible-mode/v1'
+      model_name: 'text-embedding-v4'
+      dimensions: 1024
+      query_text: 'What is Deep Learning?'
+  - text_high_score_filter:
+      score_field: 'text_score'
+      min_score: 0
+      max_score: 5
+  - text_bloom_filter:
+      hash_func: md5
+      initial_capacity: 100
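
For context, a minimal sketch (not part of this commit) of what the score-range filter's parameters imply: keep only samples whose score_field value lies within [min_score, max_score]. The record layout and helper name below are hypothetical; the real text_high_score_filter operator lives in data_engine.

# Hypothetical illustration of the min_score/max_score semantics configured above.
def keep_by_score(samples, score_field="text_score", min_score=0, max_score=5):
    return [s for s in samples if min_score <= s.get(score_field, 0) <= max_score]

samples = [{"text": "a", "text_score": 4.2}, {"text": "b", "text_score": 6.1}]
print(keep_by_score(samples))  # keeps only the first sample
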
Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
+name: 'Generate a high-quality Chinese multi-turn dialogue dataset'
+description: 'High-quality Chinese multi-turn dialogue data are generated automatically with large models, then screened through quality scoring and semantic deduplication.'
+type: 'data_refine'
+buildin: true
+project_name: 'dataflow-demo-process'
+dataset_path: ''
+export_path: '/path/to/your/dataset.jsonl'
+np: 1
+open_tracer: false
+trace_num: 3
+process:
+  - pipeline_magpie_zh_mapper:
+      auth_token: ''
+      model_url: 'https://dashscope.aliyuncs.com/compatible-mode/v1'
+      model_name: 'qwen-plus'
+  - gather_generated_data_filter:
+  - encode_and_get_nearest_mapper:
+      auth_token: ''
+      model_url: 'https://dashscope.aliyuncs.com/compatible-mode/v1'
+      model_name: 'text-embedding-v4'
+      dimensions: 1024
+  - dedup_and_save_deduplicator:
+      similarity_threshold: 0.5
+      nn_indices_key: 'nn_indices'
+      nn_scores_key: 'nn_scores'
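
For reference, a minimal sketch (not part of this commit) of the kind of nearest-neighbor annotations encode_and_get_nearest_mapper is expected to produce and dedup_and_save_deduplicator consumes: per-sample neighbor indices and similarity scores derived from embeddings. The cosine-similarity math and array shapes below are illustrative assumptions, not the operators' actual implementation.

import numpy as np

# Toy embeddings (the real pipeline uses 1024-dim text-embedding-v4 vectors).
emb = np.random.default_rng(0).normal(size=(4, 8))
emb /= np.linalg.norm(emb, axis=1, keepdims=True)

sim = emb @ emb.T                 # cosine similarity matrix
np.fill_diagonal(sim, -np.inf)    # ignore self-matches
nn_indices = sim.argmax(axis=1)   # nearest neighbor per sample
nn_scores = sim.max(axis=1)       # its similarity score

# Fields the deduplicator reads via nn_indices_key / nn_scores_key.
sample = {"nn_indices": [int(nn_indices[0])], "nn_scores": [float(nn_scores[0])]}
print(sample)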

data_engine/config/config.py

Lines changed: 4 additions & 0 deletions
@@ -73,6 +73,10 @@ def init_configs(args=None,redirect=True):
                         type=str,
                         default='hello_world',
                         help='Name of your data process project.')
+    parser.add_argument('--tool_name',
+                        type=str,
+                        default='',
+                        help='Name of the tool being executed.')
     parser.add_argument(
         '--executor_type',
         type=str,
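
A minimal sketch (not part of this commit) of how the new --tool_name flag behaves: when it is omitted, the parsed config carries an empty string, so the executor's output_only check below does not trigger. The standalone parser here is an assumption used only for illustration; the real argument is registered inside init_configs().

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--tool_name',
                    type=str,
                    default='',
                    help='Name of the tool being executed.')

cfg = parser.parse_args([])   # no flag given -> ''
print(cfg.tool_name == 'template_executor_06_common_internal')  # False: normal ingest path

cfg = parser.parse_args(['--tool_name', 'template_executor_06_common_internal'])
print(cfg.tool_name)          # output_only tool name, ingester setup is skipped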

data_engine/core/executor.py

Lines changed: 75 additions & 43 deletions
@@ -71,14 +71,27 @@ def __init__(
         logger.info('Setting up data ingester...')
         insert_pipline_job_run_task_log_info(self.job_uid,
                                              'Setting up data ingester...')
-        # Only have one embedded ingester: from csghub
-        self.ingester = load_ingester(
-            dataset_path = self.cfg.dataset_path,
-            repo_id = self.cfg.repo_id,
-            branch = self.cfg.branch,
-            user_name = self.user_name,
-            user_token = self.user_token
-        )
+
+        # Check if this is the specific output_only tool by tool name
+        tool_name = getattr(self.cfg, 'tool_name', '')
+        is_specific_output_only = (tool_name == 'template_executor_06_common_internal')
+
+        # Normal path: keep the original ingester setup
+        if not is_specific_output_only:
+            # Only have one embedded ingester: from csghub
+            self.ingester = load_ingester(
+                dataset_path = self.cfg.dataset_path,
+                repo_id = self.cfg.repo_id,
+                branch = self.cfg.branch,
+                user_name = self.user_name,
+                user_token = self.user_token
+            )
+        # Output-only tool: skip ingester creation
+        else:
+            logger.info('Skipping ingester setup for output_only tool')
+            insert_pipline_job_run_task_log_info(self.job_uid,
+                                                 'Skipping ingester setup for output_only tool')
+            self.ingester = None
         # assign src_path as dataset_path to format creation

         # whether to use checkpoint mechanism. If it's true, Executor will
@@ -193,44 +206,62 @@ def run(self, load_data_np=None):
         :return: processed dataset.
         """
         # 0. ingest data
-        with TRACE_HELPER.trace_block(
-                "ingest",
-                parent=get_telemetry_envelope_metadata(),
-        ):
-            self.src_path = self.ingester.ingest()
-            logger.info(f'Data ingested from {self.src_path}')
+        # Skip data ingestion for specific output_only tool
+        if self.ingester is not None:
+            with TRACE_HELPER.trace_block(
+                    "ingest",
+                    parent=get_telemetry_envelope_metadata(),
+            ):
+                self.src_path = self.ingester.ingest()
+                logger.info(f'Data ingested from {self.src_path}')
+                insert_pipline_job_run_task_log_info(self.job_uid,
+                                                     f'Data ingested from {self.src_path}')
+        else:
+            logger.info('Skipping data ingestion for output_only tool')
             insert_pipline_job_run_task_log_info(self.job_uid,
-                                                 f'Data ingested from {self.src_path}')
+                                                 'Skipping data ingestion for output_only tool')
+            self.src_path = None
         # set src_path to format, let format continue its job

-        # 1. setup formatter
-        with TRACE_HELPER.trace_block(
-                "format",
-                parent=get_telemetry_envelope_metadata(),
-        ):
-            logger.info('Setting up data formatter...')
-            insert_pipline_job_run_task_log_info(self.job_uid,
-                                                 'Setting up data formatter...')
-            self.formatter = load_formatter(
-                self.src_path,
-                self.cfg.generated_dataset_config,
-                self.cfg.text_keys, self.cfg.suffixes,
-                self.cfg.add_suffix
-            )
-
-        # 2. format data
-        if self.cfg.use_checkpoint and self.ckpt_manager.ckpt_available:
-            logger.info('Loading dataset from checkpoint...')
-            insert_pipline_job_run_task_log_info(self.job_uid,
-                                                 'Loading dataset from checkpoint...')
-            dataset = self.ckpt_manager.load_ckpt()
-        else:
-            logger.info('Loading dataset from data formatter...')
+        # 1. setup formatter and load data (skip for output_only tools)
+        if self.ingester is not None:
+            with TRACE_HELPER.trace_block(
+                    "format",
+                    parent=get_telemetry_envelope_metadata(),
+            ):
+                logger.info('Setting up data formatter...')
                 insert_pipline_job_run_task_log_info(self.job_uid,
-                                                 'Loading dataset from data formatter...')
-            if load_data_np is None:
-                load_data_np = self.cfg.np
-            dataset = self.formatter.load_dataset(load_data_np, self.cfg)
+                                                     'Setting up data formatter...')
+                self.formatter = load_formatter(
+                    self.src_path,
+                    self.cfg.generated_dataset_config,
+                    self.cfg.text_keys, self.cfg.suffixes,
+                    self.cfg.add_suffix
+                )
+
+            # 2. format data
+            if self.cfg.use_checkpoint and self.ckpt_manager.ckpt_available:
+                logger.info('Loading dataset from checkpoint...')
+                insert_pipline_job_run_task_log_info(self.job_uid,
+                                                     'Loading dataset from checkpoint...')
+                dataset = self.ckpt_manager.load_ckpt()
+            else:
+                logger.info('Loading dataset from data formatter...')
+                insert_pipline_job_run_task_log_info(self.job_uid,
+                                                     'Loading dataset from data formatter...')
+                if load_data_np is None:
+                    load_data_np = self.cfg.np
+                dataset = self.formatter.load_dataset(load_data_np, self.cfg)
+        else:
+            logger.info('Skipping data formatting and loading for output_only tool')
+            insert_pipline_job_run_task_log_info(self.job_uid,
+                                                 'Skipping data formatting and loading for output_only tool')
+            # Create an empty dataset for output_only tools
+            import datasets
+            # Create an empty Arrow table with basic schema
+            empty_table = datasets.Dataset.from_dict({})
+            from data_engine.core.data import NestedDataset
+            dataset = NestedDataset(empty_table)

         # 3. extract processes
         logger.info('Preparing process operators...')
@@ -242,10 +273,11 @@ def run(self, load_data_np=None):
         # 4. data process
         # - If tracer is open, trace each op after it's processed
        # - If checkpoint is open, clean the cache files after each process
+        dataset_count = len(dataset) if dataset is not None else 0
         with TRACE_HELPER.trace_block(
                 "run",
                 parent=get_telemetry_envelope_metadata(),
-                extraAttributes={"dataset_count": len(dataset)}
+                extraAttributes={"dataset_count": dataset_count}
         ):
             logger.info('Processing data...')
             insert_pipline_job_run_task_log_info(self.job_uid,
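
A minimal sketch (not part of the commit) of the empty-dataset fallback the output_only branch relies on: an Arrow-backed Hugging Face dataset built from an empty dict has zero rows, so the dataset_count attribute recorded by the telemetry block is simply 0. Wrapping the result in NestedDataset from data_engine.core.data is assumed to leave the row count unchanged.

from datasets import Dataset

# Empty Arrow table, as in the output_only branch above.
empty_table = Dataset.from_dict({})
print(len(empty_table))  # 0 rows

# Mirrors the guarded count used for the telemetry attribute.
dataset = empty_table
dataset_count = len(dataset) if dataset is not None else 0
print(dataset_count)     # 0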

data_engine/core/executor_tools.py

Lines changed: 1 addition & 1 deletion
@@ -24,6 +24,7 @@ def __init__(self, *, tool_def: Tool_def, params: ExecutedParams):
         """
         self.tool_def = tool_def
         self.executed_params = params
+
         logger.info(f'Using user_id={self.executed_params.user_id}, '
                     f'user_name={self.executed_params.user_name}, '
                     f'user_token={"xxxxxx" if self.executed_params.user_token is not None and len(self.executed_params.user_token)>0 else None}')
@@ -49,7 +50,6 @@ def run(self):
         """
         # 1. setup tool
         logger.info('Preparing tool...')
-
         tool_obj: TOOL = load_tool(self.tool_def, self.executed_params)

         with TRACE_HELPER_TOOL.trace_block(

data_engine/exporter/csghub_exporter.py

Lines changed: 2 additions & 0 deletions
@@ -200,6 +200,8 @@ def find_next_version(self, origin_branch: str, valid_branches: List):
         for b in valid_branches:
             if origin_branch == "main" and re.match(r"^v\d+", b):
                 numStr = b.split(".")[0][1:]
+                if not numStr.isdigit():
+                    continue
                 num = int(numStr)
                 latestNum = max(latestNum, num)
             elif b.startswith(origin_branch) and len(b) > len(origin_branch):
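
A minimal sketch (not part of the commit) of why the isdigit() guard matters in find_next_version: a branch such as 'v2beta.0' matches the ^v\d+ pattern, but its leading segment stripped of 'v' is not purely numeric, so int() would previously raise ValueError. The standalone loop below is a simplified rendering of the surrounding code, assuming it iterates branch names the same way.

import re

valid_branches = ["v1.0", "v3.2", "v2beta.0", "feature-x"]
latestNum = 0

for b in valid_branches:
    if re.match(r"^v\d+", b):
        numStr = b.split(".")[0][1:]
        if not numStr.isdigit():   # new guard: skip 'v2beta' instead of crashing
            continue
        latestNum = max(latestNum, int(numStr))

print(latestNum)  # 3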

data_engine/format/load.py

Lines changed: 2 additions & 0 deletions
@@ -7,6 +7,7 @@ def load_formatter(dataset_path,
                    text_keys=None,
                    suffixes=[],
                    add_suffix=False,
+                   max_samples=None,
                    **kwargs) -> BaseFormatter:
     """
     Load mixture formatter for multiple different data formats with an optional
@@ -37,5 +38,6 @@ def load_formatter(dataset_path,
                                 text_keys=text_keys,
                                 suffixes=suffixes,
                                 add_suffix=add_suffix,
+                                max_samples=max_samples,
                                 **kwargs)
     return formatter
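
A usage sketch (not part of the commit) of the new max_samples parameter threaded through load_formatter. The cap presumably limits how many samples the underlying formatter loads; the call below assumes the data_engine package is installed and a local dataset path exists.

from data_engine.format.load import load_formatter

# Hypothetical call: same keyword arguments as before, plus the new cap.
formatter = load_formatter(
    dataset_path='/path/to/your/dataset',
    text_keys=['text'],
    suffixes=[],
    add_suffix=False,
    max_samples=1000,   # assumption: cap on the number of samples loaded
)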

data_engine/ops/deduplicator/dedup_and_save_deduplicator.py

Lines changed: 35 additions & 5 deletions
@@ -2,7 +2,7 @@
 import numpy as np
 from loguru import logger

-from data_engine.utils.constant import HashKeys
+from data_engine.utils.constant import HashKeys, Fields, StatsKeys
 from ..base_op import OPERATORS, Deduplicator, Sample, Param, DataType

 OP_NAME = 'dedup_and_save_deduplicator'
@@ -17,31 +17,38 @@ class DedupAndSaveDeduplicator(Deduplicator):
     """

     def __init__(self,
-                 similarity_threshold: float = 0.95,
+                 similarity_threshold: float = 0.5,
                  nn_indices_key: str = 'nn_indices',
                  nn_scores_key: str = 'nn_scores',
+                 fields_to_filter: list = None,
                  *args,
                  **kwargs):
         super().__init__(*args, **kwargs)
         self.similarity_threshold = similarity_threshold
         self.nn_indices_key = nn_indices_key
         self.nn_scores_key = nn_scores_key
+        self.fields_to_filter = fields_to_filter or ['embedding', 'nn_indices', 'nn_scores', 'text', 'instruction', 'response']

     def compute_hash(self, sample):
         # This method is a placeholder to fit the framework.
         # The actual logic doesn't rely on this hash.
         if self.nn_indices_key not in sample or self.nn_scores_key not in sample:
             sample[self.nn_indices_key] = [[]]
             sample[self.nn_scores_key] = [[]]
-        sample[HashKeys.similarity_hash] = f"similarity_data_{id(sample)}"
+        # Do not create the similarity_hash field because the actual deduplication logic does not require it
+        # sample[HashKeys.similarity_hash] = f"similarity_data_{id(sample)}"
         return sample

     def process(self, dataset, show_num=0):
+        print(f"[dedup_and_save_deduplicator] Input: {len(dataset)} samples")
+
         if len(dataset) <= 1:
+            print(f"[dedup_and_save_deduplicator] Output: {len(dataset)} samples (no deduplication needed)")
             return dataset, {}

         # Convert dataset to pandas DataFrame for easier graph processing
         df = dataset.to_pandas()
+        print(f"[dedup_and_save_deduplicator] Processing similarity graph with threshold: {self.similarity_threshold}")

         # Create a graph and add all samples as nodes
         G = nx.Graph()
7784

7885
# Filter the original dataset to keep only the selected samples
7986
filtered_dataset = dataset.select(indices_to_keep)
87+
print(f"[dedup_and_save_deduplicator] Output: {len(filtered_dataset)} samples after deduplication (removed {len(dataset) - len(filtered_dataset)} duplicates)")
8088

8189
# For tracing, sample some duplicate pairs from components with more than one member
8290
dup_pairs = {}
@@ -89,7 +97,28 @@ def process(self, dataset, show_num=0):
8997
dup_pairs[group_key] = [dataset[i] for i in sorted_component[:2]]
9098
processed_components += 1
9199

92-
return filtered_dataset, dup_pairs
100+
print(f"[dedup_and_save_deduplicator] Found {len(connected_components)} connected components")
101+
102+
# Unified processing of field filtering - Move the specified field to stats
103+
def move_fields_to_stats(sample):
104+
if Fields.stats not in sample:
105+
sample[Fields.stats] = {}
106+
107+
# move_the_specified_field_to_stats
108+
for field in self.fields_to_filter:
109+
if field in sample:
110+
# Obtain the corresponding StatsKeys constant based on the field name
111+
stats_key = getattr(StatsKeys, field, field)
112+
sample[Fields.stats][stats_key] = sample[field]
113+
del sample[field]
114+
115+
return sample
116+
117+
# applicationFieldFiltering
118+
final_dataset = filtered_dataset.map(move_fields_to_stats)
119+
print(f"[dedup_and_save_deduplicator] Filtered fields {self.fields_to_filter} to stats")
120+
121+
return final_dataset, dup_pairs
93122

94123
@classmethod
95124
@property
@@ -119,7 +148,8 @@ def sample(cls):
119148
@property
120149
def init_params(cls):
121150
return [
122-
Param("similarity_threshold", DataType.FLOAT, {}, 0.95),
151+
Param("similarity_threshold", DataType.FLOAT, {}, 0.5),
123152
Param("nn_indices_key", DataType.STRING, {}, "nn_indices"),
124153
Param("nn_scores_key", DataType.STRING, {}, "nn_scores"),
154+
Param("fields_to_filter", DataType.LIST, {}, ["embedding", "nn_indices", "nn_scores", "text", "instruction", "response"]),
125155
]
