|
13 | 13 | from ..common import (SPECIAL_CHARACTERS, get_words_from_document, |
14 | 14 | words_refinement) |
15 | 15 | from ..op_fusion import INTER_WORDS |
| 16 | +from loguru import logger |
| 17 | +import os |
| 18 | +from data_celery.mongo_tools.tools import ( |
| 19 | + insert_pipline_job_run_task_log_info, |
| 20 | + insert_pipline_job_run_task_log_warning, |
| 21 | + insert_pipline_job_run_task_log_error |
| 22 | +) |
16 | 23 |
|
# Operator registry key; also passed as `operator_name` on every pipeline
# run-task log entry written by this filter.
OP_NAME = 'flagged_words_filter'
18 | 25 |
|
@@ -63,16 +70,58 @@ def __init__(self, |
63 | 70 | self.words_aug_join_char = words_aug_join_char |
64 | 71 | self.model_key = None |
65 | 72 |
|
| 73 | + # Log flagged_words_filter initialization |
| 74 | + msg = f"[flagged_words_filter] Initializing with lang='{lang}', tokenization={tokenization}" |
| 75 | + logger.info(msg) |
| 76 | + insert_pipline_job_run_task_log_info(self.job_uid, msg, operator_name=OP_NAME, operator_index=self.pipline_index) |
| 77 | + |
| 78 | + msg = f"[flagged_words_filter] flagged_words_dir: {flagged_words_dir}" |
| 79 | + logger.info(msg) |
| 80 | + insert_pipline_job_run_task_log_info(self.job_uid, msg, operator_name=OP_NAME, operator_index=self.pipline_index) |
| 81 | + |
| 82 | + # Check if flagged_words file exists before loading |
| 83 | + expected_file = os.path.join(flagged_words_dir, 'flagged_words.json') |
| 84 | + if os.path.exists(expected_file): |
| 85 | + msg = f"[flagged_words_filter] ✓ Found local flagged_words.json at: {expected_file}" |
| 86 | + logger.info(msg) |
| 87 | + insert_pipline_job_run_task_log_info(self.job_uid, msg, operator_name=OP_NAME, operator_index=self.pipline_index) |
| 88 | + else: |
| 89 | + msg = f"[flagged_words_filter] ✗ Local flagged_words.json NOT found at: {expected_file}, will attempt download" |
| 90 | + logger.warning(msg) |
| 91 | + insert_pipline_job_run_task_log_warning(self.job_uid, msg, operator_name=OP_NAME, operator_index=self.pipline_index) |
| 92 | + |
| 93 | + # Load flagged words |
| 94 | + msg = "[flagged_words_filter] Loading flagged words..." |
| 95 | + logger.info(msg) |
| 96 | + insert_pipline_job_run_task_log_info(self.job_uid, msg, operator_name=OP_NAME, operator_index=self.pipline_index) |
| 97 | + |
66 | 98 | self.FLAGGED_WORDS = load_words_asset(words_dir=flagged_words_dir, |
67 | 99 | words_type='flagged_words') |
| 100 | + |
| 101 | + total_words = sum(len(words) for words in self.FLAGGED_WORDS.values()) |
| 102 | + msg = f"[flagged_words_filter] ✓ Successfully loaded flagged_words: {len(self.FLAGGED_WORDS)} languages, {total_words} total words" |
| 103 | + logger.info(msg) |
| 104 | + insert_pipline_job_run_task_log_info(self.job_uid, msg, operator_name=OP_NAME, operator_index=self.pipline_index) |
68 | 105 |
|
69 | 106 | if 'all' not in self.FLAGGED_WORDS: |
70 | 107 | self.FLAGGED_WORDS['all'] = [ |
71 | 108 | val for vals in self.FLAGGED_WORDS.values() for val in vals |
72 | 109 | ] |
| 110 | + |
73 | 111 | if tokenization: |
| 112 | + msg = f"[flagged_words_filter] Tokenization enabled, preparing sentencepiece model for lang='{lang}'" |
| 113 | + logger.info(msg) |
| 114 | + insert_pipline_job_run_task_log_info(self.job_uid, msg, operator_name=OP_NAME, operator_index=self.pipline_index) |
| 115 | + |
| 116 | + msg = f"[flagged_words_filter] Expected model file: {lang}.sp.model" |
| 117 | + logger.info(msg) |
| 118 | + insert_pipline_job_run_task_log_info(self.job_uid, msg, operator_name=OP_NAME, operator_index=self.pipline_index) |
| 119 | + |
74 | 120 | self.model_key = prepare_model(model_type='sentencepiece', |
75 | 121 | lang=lang) |
| 122 | + msg = f"[flagged_words_filter] ✓ Successfully prepared sentencepiece model" |
| 123 | + logger.info(msg) |
| 124 | + insert_pipline_job_run_task_log_info(self.job_uid, msg, operator_name=OP_NAME, operator_index=self.pipline_index) |
76 | 125 |
|
77 | 126 | def compute_stats(self, sample, context=False): |
78 | 127 | # check if it's computed already |
|
0 commit comments