Skip to content

Commit df15c4f

Browse files
authored
Merge pull request #11 from z275748353/main
add a new operator
2 parents ca22b89 + f41777c commit df15c4f

41 files changed

Lines changed: 2008 additions & 660 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

Dockerfile

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,4 @@ RUN git config --global --add safe.directory '*'
4343

4444
# Start fastapi API Server
4545
EXPOSE 8000
46-
# CMD ["df-server"]
47-
CMD ["uvicorn", "data_server.main:app", "--host", "0.0.0.0", "--port", "8000"]
4846

README.md

Lines changed: 2 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -34,11 +34,6 @@ This project inherits the [Apache License 2.0](LICENSE) from Data Juicer.
3434
docker build -t dataflow . -f Dockerfile
3535
```
3636

37-
## Building data-flow-celery from Source
38-
39-
```
40-
docker build -t dataflow-celery . -f Dockerfile-celery
41-
```
4237

4338
## Prerequisites
4439

@@ -80,6 +75,7 @@ docker run -d --name dataflow-redis \
8075

8176
docker run -d --name dataflow-api -p 8000:8000 \
8277
-v /home/apidata:/data/dataflow_data \
78+
-c "uvicorn data_server.main:app --host 0.0.0.0 --port 8000" \
8379
-e DATA_DIR=/data/dataflow_data \
8480
-e CSGHUB_ENDPOINT=https://hub.opencsg.com \
8581
-e MAX_WORKERS=99 \
@@ -107,6 +103,7 @@ docker run -d --name dataflow-api -p 8000:8000 \
107103

108104
docker run -d --name celery-work -p 8001:8001 \
109105
-v /home/celery-data:/data/dataflow_celery \
106+
-c "celery -A data_celery.main:celery_app worker --loglevel=info --pool=gevent" \
110107
-e DATA_DIR=/data/dataflow_celery \
111108
-e CSGHUB_ENDPOINT=https://hub.opencsg.com \
112109
-e MAX_WORKERS=99 \
@@ -146,14 +143,6 @@ uvicorn data_server.main:app --reload
146143
## Run data-flow-celery server in development mode locally
147144

148145
```bash
149-
# Create virtual python 3.10 environment
150-
conda create -n dataflow python=3.10
151-
152-
# Install dependencies
153-
pip install '.[dist]' -i https://pypi.tuna.tsinghua.edu.cn/simple/
154-
pip install '.[tools]' -i https://pypi.tuna.tsinghua.edu.cn/simple/
155-
pip install '.[sci]' -i https://pypi.tuna.tsinghua.edu.cn/simple/
156-
pip install -r docker/requirements.txt
157146

158147
# Run the celery server locally
159148
celery -A data_celery.main:celery_app worker --loglevel=info --pool=gevent
3.06 KB
Loading
3.35 KB
Loading
2.8 KB
Loading
3.36 KB
Loading
3.01 KB
Loading

data_celery/formatify/tasks.py

Lines changed: 88 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -28,17 +28,19 @@
2828
from data_engine.utils.env import GetHubEndpoint
2929
@celery_app.task
3030
def format_task(task_id: int, user_name: str, user_token: str):
31+
3132
tmp_path: str = None
3233
db_session: Session = None
3334
format_task: DataFormatTask = None
35+
3436
try:
3537
db_session: Session = get_sync_session()
3638
format_task: DataFormatTask = FormatifyManager.get_formatify_task(db_session, task_id)
3739
tmp_path = get_format_folder_path(format_task.task_uid)
38-
insert_formatity_task_log_info(format_task.task_uid, f"Create a temporary directory:{tmp_path}")
40+
insert_formatity_task_log_info(format_task.task_uid, f"Create temporary directory:{tmp_path}")
3941
ensure_directory_exists(tmp_path)
4042

41-
insert_formatity_task_log_info(format_task.task_uid, f"Start downloading the source directory...")
43+
insert_formatity_task_log_info(format_task.task_uid, f"Start downloading directory....")
4244
ingesterCSGHUB = load_ingester(
4345
dataset_path=tmp_path,
4446
repo_id=format_task.from_csg_hub_repo_id,
@@ -47,18 +49,25 @@ def format_task(task_id: int, user_name: str, user_token: str):
4749
user_token=user_token,
4850
)
4951
ingester_result = ingesterCSGHUB.ingest()
50-
insert_formatity_task_log_info(format_task.task_uid, f"Download of the source directory completed... Directory address:{ingester_result}")
52+
insert_formatity_task_log_info(format_task.task_uid, f"Download directory completed... Directory address:{ingester_result}")
5153
work_dir = Path(tmp_path).joinpath('work')
52-
insert_formatity_task_log_info(format_task.task_uid, f"Start converting files...")
54+
file_bool = search_files(tmp_path,[format_task.from_data_type])
55+
56+
if not file_bool:
57+
insert_formatity_task_log_info(format_task.task_uid, f"file not found. task ended....")
58+
format_task.task_status = DataFormatTaskStatusEnum.ERROR.value
59+
db_session.commit()
60+
return
61+
insert_formatity_task_log_info(format_task.task_uid, f"Start converting file...")
5362

5463
format_task_func(
5564
tmp_path=ingester_result,
5665
from_type=format_task.from_data_type,
5766
to_type=format_task.to_data_type,
5867
task_uid=format_task.task_uid,
5968
)
60-
insert_formatity_task_log_info(format_task.task_uid, f"File conversion completed...")
61-
insert_formatity_task_log_info(format_task.task_uid, f"Start uploading the target directory...")
69+
insert_formatity_task_log_info(format_task.task_uid, f"Conversion file complete....")
70+
insert_formatity_task_log_info(format_task.task_uid, f"Start uploading directory...")
6271

6372
exporter = load_exporter(
6473
export_path=ingester_result,
@@ -78,13 +87,13 @@ def format_task(task_id: int, user_name: str, user_token: str):
7887
traceback.print_exc()
7988
format_task.task_status = DataFormatTaskStatusEnum.ERROR.value
8089
db_session.commit()
81-
insert_formatity_task_log_error(format_task.task_uid, f"The conversion task failed.: {str(e)}")
90+
insert_formatity_task_log_error(format_task.task_uid, f"Conversion task failed: {str(e)}")
8291
finally:
8392
pass
8493

8594
if tmp_path:
8695
shutil.rmtree(tmp_path)
87-
insert_formatity_task_log_info(format_task.task_uid, f"Delete the temporary directory:{tmp_path}")
96+
insert_formatity_task_log_info(format_task.task_uid, f"Delete temporary directory:{tmp_path}")
8897

8998

9099
def format_task_func(
@@ -94,7 +103,7 @@ def format_task_func(
94103
task_uid: str
95104
):
96105
insert_formatity_task_log_info(task_uid,
97-
f"Convert directory{tmp_path},Source file type:{getFormatTypeName(from_type)}Target file type:{getFormatTypeName(to_type)}")
106+
f"Change the table of contents{tmp_path},Source file type:{getFormatTypeName(from_type)}Source file type:{getFormatTypeName(to_type)}")
98107
match from_type:
99108
case DataFormatTypeEnum.Excel.value:
100109
match to_type:
@@ -131,28 +140,30 @@ def convert_excel_to_csv(file_path: str, task_uid):
131140
df = pd.read_excel(file_path)
132141
new_file = os.path.splitext(file_path)[0] + '.csv'
133142
df.to_csv(new_file, index=False)
134-
insert_formatity_task_log_info(task_uid, f'The conversion of the file {new_file} was successful.')
143+
insert_formatity_task_log_info(task_uid, f'convert file {new_file} succeed')
135144
os.remove(file_path)
136145
return True
137146
except Exception as e:
138-
insert_formatity_task_log_error(task_uid, f"An error occurred while converting the file {file_path}: {e}")
147+
print(f"convert file {file_path} error: {e}")
148+
insert_formatity_task_log_error(task_uid, f"convert file {file_path} error: {e}")
139149
return False
140150
else:
141151
return True
142152

143153

144154
def convert_excel_to_json(file_path: str, task_uid):
145155
if file_path.lower().endswith(('.xlsx', '.xls')):
146-
insert_formatity_task_log_info(task_uid, f'Source file address: {file_path}')
156+
insert_formatity_task_log_info(task_uid, f'Source file address{file_path}')
147157
try:
148158
df = pd.read_excel(file_path)
149159
new_file = os.path.splitext(file_path)[0] + '.json'
150160
df.to_json(new_file, orient='records', force_ascii=False)
151-
insert_formatity_task_log_info(task_uid, f'The file {new_file} has been converted successfully.')
161+
insert_formatity_task_log_info(task_uid, f'convert file {new_file} succeed')
152162
os.remove(file_path)
153163
return True
154164
except Exception as e:
155-
insert_formatity_task_log_error(task_uid, f"When converting the file {file_path}, an error occurred: {e}")
165+
print(f"convert file {file_path} error: {e}")
166+
insert_formatity_task_log_error(task_uid, f"convert file {file_path} error: {e}")
156167

157168
return False
158169
else:
@@ -162,24 +173,25 @@ def convert_excel_to_json(file_path: str, task_uid):
162173

163174
def convert_excel_to_parquet(file_path: str, task_uid):
    """Convert a single Excel file (.xlsx/.xls) to Parquet in place.

    The source file is deleted after a successful conversion; files with
    any other extension are left untouched and reported as success so the
    caller can iterate over mixed directories.

    Args:
        file_path: Path of the candidate file.
        task_uid: Task identifier used for progress/error logging.

    Returns:
        True on success (or when the file is not an Excel file),
        False when the conversion raised an exception.
    """
    if file_path.lower().endswith(('.xlsx', '.xls')):
        insert_formatity_task_log_info(task_uid, f'Source file address{file_path}')
        try:
            df = pd.read_excel(file_path)
            new_file = os.path.splitext(file_path)[0] + '.parquet'
            # BUG FIX: the previous code wrote to `new_file + '.parquet'`,
            # producing 'name.parquet.parquet' on disk while logging the
            # un-suffixed path. Write to `new_file` so the output matches
            # the logged path and the pattern used by the csv/json siblings.
            df.to_parquet(new_file, index=False)
            insert_formatity_task_log_info(task_uid, f'convert file {new_file} succeed')
            os.remove(file_path)
            return True
        except Exception as e:
            print(f"convert file {file_path} error: {e}")
            insert_formatity_task_log_error(task_uid, f"convert file {file_path} error: {e}")
            return False
    else:
        return True
178190

179191

180192
def convert_word_to_markdown(file_path: str, task_uid):
181193
if file_path.lower().endswith(('.docx', '.doc')):
182-
insert_formatity_task_log_info(task_uid, f'Source file address: {file_path}')
194+
insert_formatity_task_log_info(task_uid, f'Source file address{file_path}')
183195
try:
184196
with open(file_path, "rb") as docx_file:
185197
result = mammoth.convert_to_html(docx_file)
@@ -188,11 +200,12 @@ def convert_word_to_markdown(file_path: str, task_uid):
188200
markdown_file_path = os.path.splitext(file_path)[0] + '.md'
189201
with open(markdown_file_path, 'w', encoding='utf-8') as md_file:
190202
md_file.write(markdown_content)
191-
insert_formatity_task_log_info(task_uid, f'The file {markdown_file_path} has been converted successfully.')
203+
insert_formatity_task_log_info(task_uid, f'convert file {markdown_file_path} succeed')
192204
os.remove(file_path)
193205
return True
194206
except Exception as e:
195-
insert_formatity_task_log_error(task_uid, f"When converting the file {file_path}, an error occurred: {e}")
207+
print(f"convert file {file_path} error: {e}")
208+
insert_formatity_task_log_error(task_uid, f"convert file {file_path} error: {e}")
196209
return False
197210
else:
198211

@@ -201,12 +214,12 @@ def convert_word_to_markdown(file_path: str, task_uid):
201214

202215
def convert_ppt_to_markdown(file_path: str, task_uid):
203216
if file_path.lower().endswith(('.pptx', '.ppt')):
204-
insert_formatity_task_log_info(task_uid, f'Source file address: {file_path}')
217+
insert_formatity_task_log_info(task_uid, f'Source file address{file_path}')
205218
try:
206219
prs = Presentation(file_path)
207220
markdown_content = ""
208221
for i, slide in enumerate(prs.slides):
209-
markdown_content += f" PPT {i + 1}\n\n"
222+
markdown_content += f" lantern slide {i + 1}\n\n"
210223
for shape in slide.shapes:
211224
if hasattr(shape, "text") and shape.text.strip():
212225
text_content = shape.text.strip()
@@ -218,12 +231,63 @@ def convert_ppt_to_markdown(file_path: str, task_uid):
218231
markdown_file_path = os.path.splitext(file_path)[0] + '.md'
219232
with open(markdown_file_path, 'w', encoding='utf-8') as md_file:
220233
md_file.write(markdown_content)
221-
insert_formatity_task_log_info(task_uid, f'The file {markdown_file_path} has been converted successfully.')
234+
insert_formatity_task_log_info(task_uid, f'convert file {markdown_file_path} succeed')
222235
os.remove(file_path)
223236
return True
224237
except Exception as e:
225-
insert_formatity_task_log_error(task_uid, f"When converting the file {file_path}, an error occurred: {e}")
238+
print(f"convert file {file_path} error: {e}")
239+
insert_formatity_task_log_error(task_uid, f"convert file {file_path} error: {e}")
226240
return False
227241
else:
228242

229243
return True
244+
245+
246+
from typing import Dict, List


def search_files(folder_path: str, types: List[int]) -> bool:
    """Return True if *folder_path*, searched recursively, contains at least
    one file whose extension matches any of the requested format types.

    Args:
        folder_path: Root directory to search.
        types: Format-type codes: 0 = PPT, 1 = Word, 3 = Excel.
               Unknown codes are ignored.

    Returns:
        True when at least one matching file exists, False otherwise
        (including when *types* yields no known extensions).

    Note:
        The previous annotation claimed ``Tuple[bool, List[str]]`` but the
        function has always returned a bare bool (and the caller relies on
        that: ``if not file_bool``), so the annotation is corrected here.
    """
    # Map format-type codes to the file extensions they cover.
    type_map: Dict[int, List[str]] = {
        0: ['.ppt', '.pptx'],   # PPT
        1: ['.doc', '.docx'],   # Word
        3: ['.xls', '.xlsx'],   # Excel
    }

    target_extensions = {
        ext.lower()
        for file_type in types
        for ext in type_map.get(file_type, [])
    }
    if not target_extensions:
        return False

    # os.walk replaces the hand-rolled recursive listdir traversal; with the
    # default onerror=None it skips unreadable directories instead of
    # aborting, matching the old PermissionError handling. We can stop at
    # the first hit since only existence matters.
    for _dirpath, _dirnames, filenames in os.walk(folder_path):
        for filename in filenames:
            if os.path.splitext(filename)[1].lower() in target_extensions:
                return True
    return False

data_celery/job/tasks.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ def run_pipline_job(job_uuid,user_id, user_name, user_token):
3737
if job_obj is not None and job_obj.job_celery_uuid is not None and job_obj.job_celery_uuid != "":
3838
job_celery_uuid = job_obj.job_celery_uuid
3939
break
40-
# time.sleep(1)
40+
time.sleep(1)
4141
if job_celery_uuid == "":
4242
insert_pipline_job_run_task_log_error(job_uuid, f"not found job celery uuid : {job_uuid}")
4343
return False

data_celery/main.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,7 @@ def get_process_resource_usage_task(worker_name:str,current_ip:str):
197197

198198
def get_process_resource_usage(redis_celery,job_uuid,process_id):
199199
try:
200-
# print(1)
200+
201201
process = psutil.Process(int(process_id))
202202
if process.is_running():
203203
cpu_usage = process.cpu_percent(interval=1)

0 commit comments

Comments
 (0)