@@ -71,14 +71,27 @@ def __init__(
7171 logger .info ('Setting up data ingester...' )
7272 insert_pipline_job_run_task_log_info (self .job_uid ,
7373 'Setting up data ingester...' )
74- # Only have one embeded ingester: from csghub
75- self .ingester = load_ingester (
76- dataset_path = self .cfg .dataset_path ,
77- repo_id = self .cfg .repo_id ,
78- branch = self .cfg .branch ,
79- user_name = self .user_name ,
80- user_token = self .user_token
81- )
74+
75+ # Check if this is the specific output_only tool by tool name
76+ tool_name = getattr (self .cfg , 'tool_name' , '' )
77+ is_specific_output_only = (tool_name == 'template_executor_06_common_internal' )
78+
79+ # normal_logic
80+ if not is_specific_output_only :
81+ # Only have one embedded ingester: from csghub
82+ self .ingester = load_ingester (
83+ dataset_path = self .cfg .dataset_path ,
84+ repo_id = self .cfg .repo_id ,
85+ branch = self .cfg .branch ,
86+ user_name = self .user_name ,
87+ user_token = self .user_token
88+ )
89+ # skip_create_ingester
90+ else :
91+ logger .info ('Skipping ingester setup for output_only tool' )
92+ insert_pipline_job_run_task_log_info (self .job_uid ,
93+ 'Skipping ingester setup for output_only tool' )
94+ self .ingester = None
8295 # assign src_path as dataset_path to format creation
8396
8497 # whether to use checkpoint mechanism. If it's true, Executor will
@@ -193,44 +206,62 @@ def run(self, load_data_np=None):
193206 :return: processed dataset.
194207 """
195208 # 0. ingest data
196- with TRACE_HELPER .trace_block (
197- "ingest" ,
198- parent = get_telemetry_envelope_metadata (),
199- ):
200- self .src_path = self .ingester .ingest ()
201- logger .info (f'Data ingested from { self .src_path } ' )
209+ # Skip data ingestion for specific output_only tool
210+ if self .ingester is not None :
211+ with TRACE_HELPER .trace_block (
212+ "ingest" ,
213+ parent = get_telemetry_envelope_metadata (),
214+ ):
215+ self .src_path = self .ingester .ingest ()
216+ logger .info (f'Data ingested from { self .src_path } ' )
217+ insert_pipline_job_run_task_log_info (self .job_uid ,
218+ f'Data ingested from { self .src_path } ' )
219+ else :
220+ logger .info ('Skipping data ingestion for output_only tool' )
202221 insert_pipline_job_run_task_log_info (self .job_uid ,
203- f'Data ingested from { self .src_path } ' )
222+ 'Skipping data ingestion for output_only tool' )
223+ self .src_path = None
204224 # set src_path to format, let format continue its job
205225
206- # 1. setup formatter
207- with TRACE_HELPER .trace_block (
208- "format" ,
209- parent = get_telemetry_envelope_metadata (),
210- ):
211- logger .info ('Setting up data formatter...' )
212- insert_pipline_job_run_task_log_info (self .job_uid ,
213- 'Setting up data formatter...' )
214- self .formatter = load_formatter (
215- self .src_path ,
216- self .cfg .generated_dataset_config ,
217- self .cfg .text_keys , self .cfg .suffixes ,
218- self .cfg .add_suffix
219- )
220-
221- # 2. format data
222- if self .cfg .use_checkpoint and self .ckpt_manager .ckpt_available :
223- logger .info ('Loading dataset from checkpoint...' )
224- insert_pipline_job_run_task_log_info (self .job_uid ,
225- 'Loading dataset from checkpoint...' )
226- dataset = self .ckpt_manager .load_ckpt ()
227- else :
228- logger .info ('Loading dataset from data formatter...' )
226+ # 1. setup formatter and load data (skip for output_only tools)
227+ if self .ingester is not None :
228+ with TRACE_HELPER .trace_block (
229+ "format" ,
230+ parent = get_telemetry_envelope_metadata (),
231+ ):
232+ logger .info ('Setting up data formatter...' )
229233 insert_pipline_job_run_task_log_info (self .job_uid ,
230- 'Loading dataset from data formatter...' )
231- if load_data_np is None :
232- load_data_np = self .cfg .np
233- dataset = self .formatter .load_dataset (load_data_np , self .cfg )
234+ 'Setting up data formatter...' )
235+ self .formatter = load_formatter (
236+ self .src_path ,
237+ self .cfg .generated_dataset_config ,
238+ self .cfg .text_keys , self .cfg .suffixes ,
239+ self .cfg .add_suffix
240+ )
241+
242+ # 2. format data
243+ if self .cfg .use_checkpoint and self .ckpt_manager .ckpt_available :
244+ logger .info ('Loading dataset from checkpoint...' )
245+ insert_pipline_job_run_task_log_info (self .job_uid ,
246+ 'Loading dataset from checkpoint...' )
247+ dataset = self .ckpt_manager .load_ckpt ()
248+ else :
249+ logger .info ('Loading dataset from data formatter...' )
250+ insert_pipline_job_run_task_log_info (self .job_uid ,
251+ 'Loading dataset from data formatter...' )
252+ if load_data_np is None :
253+ load_data_np = self .cfg .np
254+ dataset = self .formatter .load_dataset (load_data_np , self .cfg )
255+ else :
256+ logger .info ('Skipping data formatting and loading for output_only tool' )
257+ insert_pipline_job_run_task_log_info (self .job_uid ,
258+ 'Skipping data formatting and loading for output_only tool' )
259+ # Create an empty dataset for output_only tools
260+ import datasets
261+ # from_dict({}) yields an empty Dataset (no columns, no rows), not a raw Arrow table
262+ empty_table = datasets .Dataset .from_dict ({})
263+ from data_engine .core .data import NestedDataset
264+ dataset = NestedDataset (empty_table )
234265
235266 # 3. extract processes
236267 logger .info ('Preparing process operators...' )
@@ -242,10 +273,11 @@ def run(self, load_data_np=None):
242273 # 4. data process
243274 # - If tracer is open, trace each op after it's processed
244275 # - If checkpoint is open, clean the cache files after each process
276+ dataset_count = len (dataset ) if dataset is not None else 0
245277 with TRACE_HELPER .trace_block (
246278 "run" ,
247279 parent = get_telemetry_envelope_metadata (),
248- extraAttributes = {"dataset_count" : len ( dataset ) }
280+ extraAttributes = {"dataset_count" : dataset_count }
249281 ):
250282 logger .info ('Processing data...' )
251283 insert_pipline_job_run_task_log_info (self .job_uid ,
0 commit comments