@@ -73,27 +73,27 @@ def load_cosyvoice3_model(model_dir: str):
7373 """Load CosyVoice3 model for weight extraction."""
7474 from hyperpyyaml import load_hyperpyyaml
7575 from cosyvoice .utils .class_utils import get_model_type
76-
76+
7777 hyper_yaml_path = os .path .join (model_dir , 'cosyvoice3.yaml' )
7878 hf_llm_dir = os .path .join (model_dir , 'CosyVoice-BlankEN' )
79-
79+
8080 if not os .path .exists (hyper_yaml_path ):
8181 raise ValueError (f'{ hyper_yaml_path } not found!' )
82-
82+
8383 with open (hyper_yaml_path , 'r' ) as f :
8484 configs = load_hyperpyyaml (
85- f ,
85+ f ,
8686 overrides = {'qwen_pretrain_path' : hf_llm_dir }
8787 )
88-
88+
8989 # Load LLM only
9090 llm = configs ['llm' ]
9191 llm_weights_path = os .path .join (model_dir , 'llm.pt' )
9292 llm .load_state_dict (torch .load (llm_weights_path , map_location = 'cpu' ), strict = True )
9393 llm .eval ()
94-
94+
9595 logger .info (f"Loaded CosyVoice3 LLM from { model_dir } " )
96-
96+
9797 return llm , hf_llm_dir , configs
9898
9999
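# Sketch of the checkpoint layout load_cosyvoice3_model() expects (only the files
# referenced above; the model_dir value is hypothetical):
#
#   CosyVoice3-0.5B/
#       cosyvoice3.yaml       # hyperpyyaml config that defines the 'llm' component
#       llm.pt                # LLM state_dict, loaded with strict=True
#       CosyVoice-BlankEN/    # HF Qwen checkpoint used as qwen_pretrain_path
#
#   llm, hf_llm_dir, configs = load_cosyvoice3_model("CosyVoice3-0.5B")
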
@@ -125,23 +125,23 @@ def convert_cosyvoice3_to_hf(
        dtype: Data type for saving
    """
    logger.info(f"Loading CosyVoice3 model from {model_dir}")

    # 1. Load CosyVoice3 components
    cosyvoice3_llm, hf_llm_dir, configs = load_cosyvoice3_model(model_dir)

    # Extract key components
    qwen_model = cosyvoice3_llm.llm.model  # Qwen2ForCausalLM
    speech_embedding = cosyvoice3_llm.speech_embedding  # Embedding for speech tokens
    llm_decoder = cosyvoice3_llm.llm_decoder  # Linear for decoding to speech tokens

    speech_token_size = get_speech_token_size(cosyvoice3_llm)
    logger.info(f"Speech token size: {speech_token_size}")

    # 2. Load tokenizer and add CosyVoice3 text special tokens + speech tokens
    tokenizer = AutoTokenizer.from_pretrained(hf_llm_dir, trust_remote_code=True)
    base_vocab_size = len(tokenizer)
    logger.info(f"Base tokenizer vocab size: {base_vocab_size}")

    # IMPORTANT:
    # - In CosyVoice3, LLM speech special tokens (sos/eos/task_id/fill) are INSIDE speech_embedding,
    #   i.e. represented as <|s_6561|>, <|s_6562|>, <|s_6563|>, <|s_6564|>.
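    # Illustrative id layout after conversion (sketch; concrete sizes depend on the
    # checkpoint, e.g. 6561 speech codes + 4 speech specials for CosyVoice3):
    #   [0, base_vocab_size)                      original Qwen text tokens
    #   [base_vocab_size, text_vocab_size)        added CosyVoice3 text special tokens
    #   [text_vocab_size, text_vocab_size + S)    <|s_0|> ... <|s_{S-1}|>, S = speech_token_size
    #   [text_vocab_size + S, padded_vocab_size)  padding rows, never generated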
@@ -185,7 +185,7 @@ def convert_cosyvoice3_to_hf(
    tokenizer.add_special_tokens(special_tokens)
    text_vocab_size = len(tokenizer)
    logger.info(f"Tokenizer vocab after CosyVoice3 text special tokens: {text_vocab_size}")

    # Add speech tokens: <|s_0|>, <|s_1|>, ..., <|s_{embedding_size-1}|>
    # IMPORTANT: This range must match speech_embedding.num_embeddings (includes speech special tokens).
    actual_speech_tokens = speech_token_size  # Full embedding size (with speech special tokens)
@@ -204,37 +204,37 @@ def convert_cosyvoice3_to_hf(
    assert "<s_6563>" not in speech_tokens
    assert "<s_6564>" not in speech_tokens
    tokenizer.add_tokens(speech_tokens)

    new_vocab_size = len(tokenizer)
    logger.info(f"New tokenizer vocab size: {new_vocab_size}")
    logger.info(f"Added {new_vocab_size - base_vocab_size} tokens total (text special + speech tokens)")

    # 3. Resize embeddings in Qwen model
    # Align to 128 for TensorRT efficiency
    padded_vocab_size = ((new_vocab_size + 127) // 128) * 128
    qwen_model.resize_token_embeddings(padded_vocab_size)
    logger.info(f"Resized embeddings to: {padded_vocab_size}")
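    # Worked example of the ceil-to-multiple-of-128 arithmetic above (illustrative
    # numbers, not the real vocab size): ((1000 + 127) // 128) * 128 == 1024, i.e.
    # padded_vocab_size is the smallest multiple of 128 that is >= new_vocab_size.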

    # Speech tokens start after text vocab (base + CosyVoice3 text special tokens)
    speech_token_offset = text_vocab_size

    # 4. Copy speech_embedding into extended embed_tokens
    input_embeddings = qwen_model.get_input_embeddings()
    hidden_size = input_embeddings.weight.shape[1]

    logger.info(f"Hidden size: {hidden_size}")
    logger.info(f"speech_embedding shape: {speech_embedding.weight.shape}")
    logger.info(f"llm_decoder shape: {llm_decoder.weight.shape}")

    with torch.no_grad():
        # Copy speech_embedding weights into embed_tokens
        # Indices: [speech_token_offset, speech_token_offset + speech_token_size)
        src_size = min(speech_embedding.weight.shape[0], actual_speech_tokens)
        input_embeddings.weight[speech_token_offset:speech_token_offset + src_size] = \
            speech_embedding.weight[:src_size].to(input_embeddings.weight.dtype)

    logger.info(f"Copied speech_embedding to embed_tokens[{speech_token_offset}:{speech_token_offset + src_size}]")
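    # Sanity-check sketch (an assumption of this conversion is that speech tokens
    # were appended to the tokenizer in index order), e.g.:
    #   assert tokenizer.convert_tokens_to_ids("<|s_0|>") == speech_token_offset
    # so embed_tokens row speech_token_offset + k holds the embedding of <|s_k|>.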

    # 5. Create new lm_head with extended vocab and copy llm_decoder
    # Original lm_head: hidden_size -> original_vocab_size
    # New lm_head: hidden_size -> padded_vocab_size
@@ -247,7 +247,7 @@ def convert_cosyvoice3_to_hf(
        out_features=padded_vocab_size,
        bias=has_bias
    )

    with torch.no_grad():
        # Initialize weights:
        # - Text part: copy from original lm_head (or zeros)
@@ -258,42 +258,42 @@ def convert_cosyvoice3_to_hf(
        new_lm_head.weight.data.zero_()
        if has_bias:
            new_lm_head.bias.data.fill_(-float('inf'))

        # Copy original lm_head for text tokens (optional)
        original_lm_head = qwen_model.lm_head
        if original_lm_head is not None and original_lm_head.weight.shape[0] >= text_vocab_size:
            new_lm_head.weight[:text_vocab_size] = original_lm_head.weight[:text_vocab_size]
            if has_bias and original_lm_head.bias is not None:
                new_lm_head.bias[:text_vocab_size] = original_lm_head.bias[:text_vocab_size]

        # Copy llm_decoder for speech tokens
        decoder_size = min(llm_decoder.weight.shape[0], actual_speech_tokens)
        new_lm_head.weight[speech_token_offset:speech_token_offset + decoder_size] = \
            llm_decoder.weight[:decoder_size].to(new_lm_head.weight.dtype)

        if has_bias:
            new_lm_head.bias[speech_token_offset:speech_token_offset + decoder_size] = \
                llm_decoder.bias[:decoder_size].to(new_lm_head.bias.dtype)
        else:
            # If llm_decoder has no bias but we want it for text tokens
            pass

    # Replace lm_head
    qwen_model.lm_head = new_lm_head

    logger.info(f"Created new lm_head with shape: {new_lm_head.weight.shape}")
    logger.info(f"Copied llm_decoder to lm_head[{speech_token_offset}:{speech_token_offset + decoder_size}]")
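    # Note on the initialization above: when has_bias is True, rows that were never
    # overwritten (e.g. padding rows in [new_vocab_size, padded_vocab_size)) keep
    # zero weights and a -inf bias, so softmax assigns them probability 0 and they
    # can never be sampled; without a bias those rows would score logit 0 instead.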

    # 6. Update model configuration
    qwen_model.config.vocab_size = padded_vocab_size
    qwen_model.config.tie_word_embeddings = False  # Embeddings and lm_head are now different!

    # Set EOS token for generation (speech EOS lives inside speech_embedding as <|s_{base_speech_token_size+1}|>)
    base_speech_token_size = getattr(cosyvoice3_llm, "speech_token_size", 6561)
    eos_speech_idx = base_speech_token_size + 1
    eos_id = speech_token_offset + eos_speech_idx
    qwen_model.config.eos_token_id = eos_id
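    # Worked example, assuming the default base_speech_token_size of 6561:
    # eos_speech_idx = 6562, so eos_id = speech_token_offset + 6562 -- the HF id
    # of <|s_6562|>, whose lm_head row was copied from llm_decoder above.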

    # Generation settings
    qwen_model.generation_config.eos_token_id = eos_id
    qwen_model.generation_config.pad_token_id = eos_id
@@ -302,7 +302,7 @@ def convert_cosyvoice3_to_hf(
    qwen_model.generation_config.top_k = 25
    qwen_model.generation_config.repetition_penalty = 1.1
    qwen_model.generation_config.max_new_tokens = 2048

    # 7. Convert to target dtype
    dtype_map = {
        "float16": torch.float16,
@@ -311,16 +311,16 @@ def convert_cosyvoice3_to_hf(
    }
    target_dtype = dtype_map[dtype]
    qwen_model.to(target_dtype)

    # 8. Save model and tokenizer
    os.makedirs(output_dir, exist_ok=True)

    qwen_model.save_pretrained(output_dir)

    TEMPLATE = "{%- for message in messages %}{%- if message['role'] == 'user' %}{{- '<|sos|>' + message['content'] + '<|task_id|>' }}{%- elif message['role'] == 'assistant' %}{{- message['content']}}{%- endif %}{%- endfor %}"
    tokenizer.chat_template = TEMPLATE
    tokenizer.save_pretrained(output_dir)
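    # Usage sketch for the template above (illustrative message content):
    #   tokenizer.apply_chat_template([{"role": "user", "content": "Hi"}], tokenize=False)
    # renders '<|sos|>Hi<|task_id|>'; assistant turns are emitted verbatim, so
    # previously generated <|s_k|> tokens can be passed back as assistant content.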

    # Save metadata for TRT-LLM inference
    metadata = {
        "original_vocab_size": base_vocab_size,
@@ -332,30 +332,30 @@ def convert_cosyvoice3_to_hf(
332332 "speech_token_offset" : speech_token_offset ,
333333 "dtype" : dtype ,
334334 }
335-
335+
336336 import json
337337 with open (os .path .join (output_dir , "cosyvoice3_metadata.json" ), "w" ) as f :
338338 json .dump (metadata , f , indent = 2 )
339-
339+
340340 logger .info (f"Saved HuggingFace model to { output_dir } " )
341341 logger .info (f"Metadata: { metadata } " )
342-
342+
343343 return output_dir , metadata
344344
345345
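# Downstream usage sketch (illustrative; paths and the prompt are hypothetical,
# only standard transformers/json APIs are assumed):
#
#   import json
#   from transformers import AutoModelForCausalLM, AutoTokenizer
#   model = AutoModelForCausalLM.from_pretrained("hf_merged", torch_dtype="auto")
#   tok = AutoTokenizer.from_pretrained("hf_merged")
#   meta = json.load(open("hf_merged/cosyvoice3_metadata.json"))
#   ids = tok.apply_chat_template([{"role": "user", "content": "Hello"}],
#                                 return_tensors="pt")
#   out = model.generate(ids)  # stops at the speech EOS configured above
#   codes = [t - meta["speech_token_offset"] for t in out[0, ids.shape[1]:].tolist()]
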
def main():
    args = parse_args()

    output_dir = args.output_dir
    if output_dir is None:
        output_dir = os.path.join(args.model_dir, "hf_merged")

    convert_cosyvoice3_to_hf(
        model_dir=args.model_dir,
        output_dir=output_dir,
        dtype=args.dtype,
    )

    print("\n" + "=" * 70)
    print("✅ Conversion complete!")
    print("=" * 70)