Skip to content

Commit d68cc42

Browse files
JingTYHaiHui886
authored and committed
Download the required files for the flagged_words_filter during the build process.
1 parent 77827f3 commit d68cc42

4 files changed

Lines changed: 115 additions & 15 deletions

File tree

Dockerfile

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,5 +54,26 @@ RUN git config --global user.email "dataflow@opencsg.com" && \
5454
git config --global user.name "dataflow" && \
5555
git config --global --add safe.directory '*'
5656

57+
# Download required resources for offline deployment
58+
# Create default cache directories
59+
RUN mkdir -p /root/.cache/data_engine/assets && \
60+
mkdir -p /root/.cache/data_engine/models
61+
62+
# Download JSON resources (flagged_words and stopwords)
63+
RUN wget -O /root/.cache/data_engine/assets/flagged_words.json \
64+
https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/data_juicer/flagged_words.json && \
65+
wget -O /root/.cache/data_engine/assets/stopwords.json \
66+
https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/data_juicer/stopwords.json
67+
68+
# Download SentencePiece models (Chinese and English)
69+
RUN wget -O /root/.cache/data_engine/models/zh.sp.model \
70+
https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/data_juicer/models/zh.sp.model && \
71+
wget -O /root/.cache/data_engine/models/en.sp.model \
72+
https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/data_juicer/models/en.sp.model
73+
74+
# Verify downloaded files
75+
RUN ls -lh /root/.cache/data_engine/assets/ && \
76+
ls -lh /root/.cache/data_engine/models/
77+
5778
# Start fastapi API Server
5879
EXPOSE 8000

data_engine/ops/filter/flagged_words_filter.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,13 @@
1313
from ..common import (SPECIAL_CHARACTERS, get_words_from_document,
1414
words_refinement)
1515
from ..op_fusion import INTER_WORDS
16+
from loguru import logger
17+
import os
18+
from data_celery.mongo_tools.tools import (
19+
insert_pipline_job_run_task_log_info,
20+
insert_pipline_job_run_task_log_warning,
21+
insert_pipline_job_run_task_log_error
22+
)
1623

1724
OP_NAME = 'flagged_words_filter'
1825

@@ -63,16 +70,58 @@ def __init__(self,
6370
self.words_aug_join_char = words_aug_join_char
6471
self.model_key = None
6572

73+
# Log flagged_words_filter initialization
74+
msg = f"[flagged_words_filter] Initializing with lang='{lang}', tokenization={tokenization}"
75+
logger.info(msg)
76+
insert_pipline_job_run_task_log_info(self.job_uid, msg, operator_name=OP_NAME, operator_index=self.pipline_index)
77+
78+
msg = f"[flagged_words_filter] flagged_words_dir: {flagged_words_dir}"
79+
logger.info(msg)
80+
insert_pipline_job_run_task_log_info(self.job_uid, msg, operator_name=OP_NAME, operator_index=self.pipline_index)
81+
82+
# Check if flagged_words file exists before loading
83+
expected_file = os.path.join(flagged_words_dir, 'flagged_words.json')
84+
if os.path.exists(expected_file):
85+
msg = f"[flagged_words_filter] ✓ Found local flagged_words.json at: {expected_file}"
86+
logger.info(msg)
87+
insert_pipline_job_run_task_log_info(self.job_uid, msg, operator_name=OP_NAME, operator_index=self.pipline_index)
88+
else:
89+
msg = f"[flagged_words_filter] ✗ Local flagged_words.json NOT found at: {expected_file}, will attempt download"
90+
logger.warning(msg)
91+
insert_pipline_job_run_task_log_warning(self.job_uid, msg, operator_name=OP_NAME, operator_index=self.pipline_index)
92+
93+
# Load flagged words
94+
msg = "[flagged_words_filter] Loading flagged words..."
95+
logger.info(msg)
96+
insert_pipline_job_run_task_log_info(self.job_uid, msg, operator_name=OP_NAME, operator_index=self.pipline_index)
97+
6698
self.FLAGGED_WORDS = load_words_asset(words_dir=flagged_words_dir,
6799
words_type='flagged_words')
100+
101+
total_words = sum(len(words) for words in self.FLAGGED_WORDS.values())
102+
msg = f"[flagged_words_filter] ✓ Successfully loaded flagged_words: {len(self.FLAGGED_WORDS)} languages, {total_words} total words"
103+
logger.info(msg)
104+
insert_pipline_job_run_task_log_info(self.job_uid, msg, operator_name=OP_NAME, operator_index=self.pipline_index)
68105

69106
if 'all' not in self.FLAGGED_WORDS:
70107
self.FLAGGED_WORDS['all'] = [
71108
val for vals in self.FLAGGED_WORDS.values() for val in vals
72109
]
110+
73111
if tokenization:
112+
msg = f"[flagged_words_filter] Tokenization enabled, preparing sentencepiece model for lang='{lang}'"
113+
logger.info(msg)
114+
insert_pipline_job_run_task_log_info(self.job_uid, msg, operator_name=OP_NAME, operator_index=self.pipline_index)
115+
116+
msg = f"[flagged_words_filter] Expected model file: {lang}.sp.model"
117+
logger.info(msg)
118+
insert_pipline_job_run_task_log_info(self.job_uid, msg, operator_name=OP_NAME, operator_index=self.pipline_index)
119+
74120
self.model_key = prepare_model(model_type='sentencepiece',
75121
lang=lang)
122+
msg = f"[flagged_words_filter] ✓ Successfully prepared sentencepiece model"
123+
logger.info(msg)
124+
insert_pipline_job_run_task_log_info(self.job_uid, msg, operator_name=OP_NAME, operator_index=self.pipline_index)
76125

77126
def compute_stats(self, sample, context=False):
78127
# check if it's computed already

data_engine/utils/asset_utils.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -33,26 +33,35 @@ def load_words_asset(words_dir: str, words_type: str):
3333
words_dict = {}
3434
os.makedirs(words_dir, exist_ok=True)
3535

36+
logger.info(f'[load_words_asset] Searching for {words_type} in directory: {words_dir}')
37+
3638
# try to load words from `words_type` file
3739
for filename in os.listdir(words_dir):
3840
if filename.endswith('.json') and words_type in filename:
39-
with open(os.path.join(words_dir, filename), 'r') as file:
41+
file_path = os.path.join(words_dir, filename)
42+
logger.info(f'[load_words_asset] ✓ Found local file: {file_path}')
43+
with open(file_path, 'r') as file:
4044
loaded_words = json.load(file)
4145
for key in loaded_words:
4246
if key in words_dict:
4347
words_dict[key] += loaded_words[key]
4448
else:
4549
words_dict[key] = loaded_words[key]
50+
logger.info(f'[load_words_asset] ✓ Successfully loaded from local file (no network access)')
51+
4652
# if the asset file is not found, then download it from ASSET_LINKS
4753
if not bool(words_dict):
48-
logger.info(f'Specified {words_dir} does not contain '
49-
f'any {words_type} files in json format, now '
50-
'download the one cached by data_engine team')
51-
response = requests.get(ASSET_LINKS[words_type])
54+
download_url = ASSET_LINKS[words_type]
55+
logger.warning(f'[load_words_asset] ✗ Local file NOT found in {words_dir}')
56+
logger.info(f'[load_words_asset] ⬇ Attempting to download from: {download_url}')
57+
58+
response = requests.get(download_url)
5259
words_dict = response.json()
60+
5361
# cache the asset file locally
5462
cache_path = os.path.join(words_dir, f'{words_type}.json')
5563
with open(cache_path, 'w') as file:
5664
json.dump(words_dict, file)
65+
logger.info(f'[load_words_asset] ✓ Downloaded and cached to: {cache_path}')
5766

5867
return words_dict

data_engine/utils/model_utils.py

Lines changed: 31 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -65,38 +65,53 @@ def check_model(model_name, force=False):
6565
the model file maybe incomplete for some reason, so need to
6666
download again forcefully.
6767
"""
68+
logger.info(f'[check_model] Checking model: {model_name}, force={force}')
69+
6870
# check for local model
6971
if os.path.exists(model_name):
72+
logger.info(f'[check_model] ✓ Found model at absolute path: {model_name}')
7073
return model_name
7174

7275
if not os.path.exists(DJMC):
76+
logger.info(f'[check_model] Creating models cache directory: {DJMC}')
7377
os.makedirs(DJMC)
7478

7579
# check if the specified model exists. If it does not exist, download it
7680
cached_model_path = os.path.join(DJMC, model_name)
81+
logger.info(f'[check_model] Expected model path: {cached_model_path}')
82+
83+
if os.path.exists(cached_model_path):
84+
logger.info(f'[check_model] ✓ Found cached model (no network access needed)')
85+
if not force:
86+
return cached_model_path
87+
7788
if force:
7889
if os.path.exists(cached_model_path):
7990
os.remove(cached_model_path)
80-
logger.info(
81-
f'Model [{cached_model_path}] invalid, force to downloading...'
91+
logger.warning(
92+
f'[check_model] Model [{cached_model_path}] marked invalid, force downloading...'
8293
)
8394
else:
84-
logger.info(
85-
f'Model [{cached_model_path}] not found. Downloading...')
95+
logger.warning(
96+
f'[check_model] ✗ Model [{cached_model_path}] not found. Attempting download...')
8697

8798
try:
8899
model_link = os.path.join(MODEL_LINKS, model_name)
100+
logger.info(f'[check_model] ⬇ Downloading from primary link: {model_link}')
89101
wget.download(model_link, cached_model_path, bar=None)
102+
logger.info(f'[check_model] ✓ Successfully downloaded to: {cached_model_path}')
90103
except: # noqa: E722
91104
try:
92105
backup_model_link = os.path.join(
93106
get_backup_model_link(model_name), model_name)
107+
logger.warning(f'[check_model] Primary download failed, trying backup: {backup_model_link}')
94108
wget.download(backup_model_link, cached_model_path, bar=None)
109+
logger.info(f'[check_model] ✓ Successfully downloaded from backup')
95110
except: # noqa: E722
96111
logger.error(
97-
f'Downloading model [{model_name}] error. '
98-
f'Please retry later or download it into {DJMC} '
99-
f'manually from {model_link} or {backup_model_link} ')
112+
f'[check_model] ✗ Download failed for [{model_name}]. '
113+
f'Please download it manually into {DJMC} '
114+
f'from {model_link} or {backup_model_link} ')
100115
exit(1)
101116
return cached_model_path
102117

@@ -127,12 +142,18 @@ def prepare_sentencepiece_model(model_path):
127142
"""
128143
import sentencepiece
129144

130-
logger.info('Loading sentencepiece model...')
145+
logger.info(f'[prepare_sentencepiece_model] Preparing sentencepiece model: {model_path}')
131146
sentencepiece_model = sentencepiece.SentencePieceProcessor()
132147
try:
133-
sentencepiece_model.load(check_model(model_path))
148+
model_file = check_model(model_path)
149+
logger.info(f'[prepare_sentencepiece_model] Loading model from: {model_file}')
150+
sentencepiece_model.load(model_file)
151+
logger.info(f'[prepare_sentencepiece_model] ✓ Successfully loaded sentencepiece model (no download needed)')
134152
except: # noqa: E722
135-
sentencepiece_model.load(check_model(model_path, force=True))
153+
logger.warning(f'[prepare_sentencepiece_model] First load attempt failed, retrying with force=True...')
154+
model_file = check_model(model_path, force=True)
155+
sentencepiece_model.load(model_file)
156+
logger.info(f'[prepare_sentencepiece_model] ✓ Successfully loaded sentencepiece model after download')
136157
return sentencepiece_model
137158

138159

0 commit comments

Comments (0)