
Commit 914454e

fix lint

1 parent: f4d376e

4 files changed: 43 additions & 43 deletions

All changes are lint fixes: each removed line differs from its added replacement only in trailing whitespace, so paired - and + lines below render identically.

cosyvoice/dataset/processor.py

Lines changed: 1 addition & 1 deletion
@@ -181,7 +181,7 @@ def compute_fbank(data,


 def compute_whisper_fbank(data, num_frames=-1, mode='train'):
-    """ Extract whisper fbank
+    """ Extract whisper fbank

         Args:
             data: Iterable[{key, wav, label, sample_rate}]
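The body of compute_whisper_fbank is elided by the hunk. For orientation, here is a minimal sketch of the processor contract the docstring describes, assuming Whisper-style 80-bin log-mel features (16 kHz, n_fft=400, hop 160). The name compute_whisper_fbank_sketch and the 'feat' key are illustrative, not the repository's code:

import torch
import torchaudio

# Whisper-style front end: 80 mel bins, 25 ms window / 10 ms hop at 16 kHz.
mel = torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_fft=400,
                                           hop_length=160, n_mels=80)

def compute_whisper_fbank_sketch(data, mode='train'):
    """Consume Iterable[{key, wav, label, sample_rate}]; yield the same
    dicts with an illustrative 'feat' tensor attached."""
    for sample in data:
        spec = mel(sample['wav'])                        # (1, 80, frames)
        log_spec = torch.clamp(spec, min=1e-10).log10()
        # Whisper-style dynamic-range compression and scaling.
        log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
        sample['feat'] = ((log_spec + 4.0) / 4.0).squeeze(0).transpose(0, 1)
        yield sample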

cosyvoice/vllm/cosyvoice2.py

Lines changed: 1 addition & 1 deletion
@@ -99,7 +99,7 @@ def compute_logits(
         sampling_metadata: Optional[SamplingMetadata] = None,
     ) -> Optional[torch.Tensor]:
         if VLLM_V1_ENGINE_ONLY:
-            logits = self.logits_processor(self.lm_head, hidden_states,
+            logits = self.logits_processor(self.lm_head, hidden_states,
                                            self.lm_head.bias)
         else:
             logits = self.logits_processor(self.lm_head, hidden_states,
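Conceptually, logits_processor projects the final hidden states onto the (speech-extended) vocabulary; the V1 branch just passes the lm_head bias explicitly. A plain-PyTorch sketch of the equivalent math (naive_logits is a hypothetical helper, not vLLM's kernel):

import torch

def naive_logits(lm_head: torch.nn.Linear, hidden_states: torch.Tensor) -> torch.Tensor:
    # hidden_states: (num_tokens, hidden_size) -> logits: (num_tokens, vocab_size)
    logits = hidden_states @ lm_head.weight.t()
    if lm_head.bias is not None:
        logits = logits + lm_head.bias
    return logits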

example.py

Lines changed: 1 addition & 1 deletion
@@ -99,7 +99,7 @@ def cosyvoice3_example():
     # 歴史的世界においては、過去は単に過ぎ去ったものではない、プラトンのいう如く非有が有である。 -> レキシ テキ セカイ ニ オイ テ ワ、カコ ワ タンニ スギサッ タ モノ デ ワ ナイ、プラトン ノ イウ ゴトク ヒ ユー ガ ユー デ アル。
     for i, j in enumerate(cosyvoice.inference_cross_lingual('You are a helpful assistant.<|endofprompt|>レキシ テキ セカイ ニ オイ テ ワ、カコ ワ タンニ スギサッ タ モノ デ ワ ナイ、プラトン ノ イウ ゴトク ヒ ユー ガ ユー デ アル。',
                                                             './asset/zero_shot_prompt.wav', stream=False)):
-        torchaudio.save('japanese_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)
+        torchaudio.save('japanese_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)


 def main():
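The call above uses stream=False, so each yielded item is a complete utterance. A hedged streaming variant, assuming stream=True yields incremental chunks with the same 'tts_speech' key (text stands for the long prompt string above):

for i, j in enumerate(cosyvoice.inference_cross_lingual(text, './asset/zero_shot_prompt.wav', stream=True)):
    # save each chunk as it arrives instead of waiting for the full utterance
    torchaudio.save('japanese_chunk_{}.wav'.format(i), j['tts_speech'], cosyvoice.sample_rate)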

runtime/triton_trtllm/scripts/convert_cosyvoice3_to_hf.py

Lines changed: 40 additions & 40 deletions
@@ -73,27 +73,27 @@ def load_cosyvoice3_model(model_dir: str):
     """Load CosyVoice3 model for weight extraction."""
     from hyperpyyaml import load_hyperpyyaml
     from cosyvoice.utils.class_utils import get_model_type
-
+
     hyper_yaml_path = os.path.join(model_dir, 'cosyvoice3.yaml')
     hf_llm_dir = os.path.join(model_dir, 'CosyVoice-BlankEN')
-
+
     if not os.path.exists(hyper_yaml_path):
         raise ValueError(f'{hyper_yaml_path} not found!')
-
+
     with open(hyper_yaml_path, 'r') as f:
         configs = load_hyperpyyaml(
-            f,
+            f,
             overrides={'qwen_pretrain_path': hf_llm_dir}
         )
-
+
     # Load LLM only
     llm = configs['llm']
     llm_weights_path = os.path.join(model_dir, 'llm.pt')
     llm.load_state_dict(torch.load(llm_weights_path, map_location='cpu'), strict=True)
     llm.eval()
-
+
     logger.info(f"Loaded CosyVoice3 LLM from {model_dir}")
-
+
     return llm, hf_llm_dir, configs

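The overrides mapping passed to load_hyperpyyaml above substitutes values into the YAML before objects are instantiated, which is how the script points the config at its local CosyVoice-BlankEN checkout. A self-contained sketch with an illustrative YAML string (not the real cosyvoice3.yaml), assuming load_hyperpyyaml accepts a string as well as a file object:

from hyperpyyaml import load_hyperpyyaml

yaml_string = """
qwen_pretrain_path: !PLACEHOLDER
"""
configs = load_hyperpyyaml(yaml_string,
                           overrides={'qwen_pretrain_path': '/models/CosyVoice-BlankEN'})
print(configs['qwen_pretrain_path'])  # /models/CosyVoice-BlankEN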
@@ -125,23 +125,23 @@ def convert_cosyvoice3_to_hf(
         dtype: Data type for saving
     """
     logger.info(f"Loading CosyVoice3 model from {model_dir}")
-
+
     # 1. Load CosyVoice3 components
     cosyvoice3_llm, hf_llm_dir, configs = load_cosyvoice3_model(model_dir)
-
+
     # Extract key components
     qwen_model = cosyvoice3_llm.llm.model # Qwen2ForCausalLM
     speech_embedding = cosyvoice3_llm.speech_embedding # Embedding for speech tokens
     llm_decoder = cosyvoice3_llm.llm_decoder # Linear for decoding to speech tokens
-
+
     speech_token_size = get_speech_token_size(cosyvoice3_llm)
     logger.info(f"Speech token size: {speech_token_size}")
-
+
     # 2. Load tokenizer and add CosyVoice3 text special tokens + speech tokens
     tokenizer = AutoTokenizer.from_pretrained(hf_llm_dir, trust_remote_code=True)
     base_vocab_size = len(tokenizer)
     logger.info(f"Base tokenizer vocab size: {base_vocab_size}")
-
+
     # IMPORTANT:
     # - In CosyVoice3, LLM speech special tokens (sos/eos/task_id/fill) are INSIDE speech_embedding,
     #   i.e. represented as <|s_6561|>, <|s_6562|>, <|s_6563|>, <|s_6564|>.
@@ -185,7 +185,7 @@ def convert_cosyvoice3_to_hf(
     tokenizer.add_special_tokens(special_tokens)
     text_vocab_size = len(tokenizer)
     logger.info(f"Tokenizer vocab after CosyVoice3 text special tokens: {text_vocab_size}")
-
+
     # Add speech tokens: <|s_0|>, <|s_1|>, ..., <|s_{embedding_size-1}|>
     # IMPORTANT: This range must match speech_embedding.num_embeddings (includes speech special tokens).
     actual_speech_tokens = speech_token_size # Full embedding size (with speech special tokens)
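Taken together with the previous hunk, the merged vocabulary ends up as three contiguous ranges: base Qwen text tokens, the added CosyVoice3 text special tokens, then the full speech embedding. A sketch of the resulting id arithmetic (all counts hypothetical except 6561 and the <|s_6561|>..<|s_6564|> speech specials named in the hunks):

base_vocab_size = 151_643                    # hypothetical Qwen tokenizer size
num_text_special = 4                         # hypothetical count of added text specials
text_vocab_size = base_vocab_size + num_text_special
speech_token_size = 6_565                    # 6561 codes + 4 speech specials

def speech_token_id(k: int) -> int:
    # raw CosyVoice3 speech token k -> id of <|s_k|> in the merged vocab
    return text_vocab_size + k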
@@ -204,37 +204,37 @@ def convert_cosyvoice3_to_hf(
     assert "<s_6563>" not in speech_tokens
     assert "<s_6564>" not in speech_tokens
     tokenizer.add_tokens(speech_tokens)
-
+
     new_vocab_size = len(tokenizer)
     logger.info(f"New tokenizer vocab size: {new_vocab_size}")
     logger.info(f"Added {new_vocab_size - base_vocab_size} tokens total (text special + speech tokens)")
-
+
     # 3. Resize embeddings in Qwen model
     # Align to 128 for TensorRT efficiency
     padded_vocab_size = ((new_vocab_size + 127) // 128) * 128
     qwen_model.resize_token_embeddings(padded_vocab_size)
     logger.info(f"Resized embeddings to: {padded_vocab_size}")
-
+
     # Speech tokens start after text vocab (base + CosyVoice3 text special tokens)
     speech_token_offset = text_vocab_size

     # 4. Copy speech_embedding into extended embed_tokens
     input_embeddings = qwen_model.get_input_embeddings()
     hidden_size = input_embeddings.weight.shape[1]
-
+
     logger.info(f"Hidden size: {hidden_size}")
     logger.info(f"speech_embedding shape: {speech_embedding.weight.shape}")
     logger.info(f"llm_decoder shape: {llm_decoder.weight.shape}")
-
+
     with torch.no_grad():
         # Copy speech_embedding weights into embed_tokens
         # Indices: [speech_token_offset, speech_token_offset + speech_token_size)
         src_size = min(speech_embedding.weight.shape[0], actual_speech_tokens)
         input_embeddings.weight[speech_token_offset:speech_token_offset + src_size] = \
             speech_embedding.weight[:src_size].to(input_embeddings.weight.dtype)
-
+
     logger.info(f"Copied speech_embedding to embed_tokens[{speech_token_offset}:{speech_token_offset + src_size}]")
-
+
     # 5. Create new lm_head with extended vocab and copy llm_decoder
     # Original lm_head: hidden_size -> original_vocab_size
     # New lm_head: hidden_size -> padded_vocab_size
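A worked example of the 128-alignment in the hunk above (merged vocab size hypothetical); rounding up to the next multiple of 128 keeps the embedding and lm_head GEMM shapes friendly to TensorRT:

new_vocab_size = 158_300                                  # hypothetical merged vocab
padded_vocab_size = ((new_vocab_size + 127) // 128) * 128
assert padded_vocab_size == 158_336                       # next multiple of 128
assert padded_vocab_size % 128 == 0 and padded_vocab_size >= new_vocab_size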
@@ -247,7 +247,7 @@ def convert_cosyvoice3_to_hf(
         out_features=padded_vocab_size,
         bias=has_bias
     )
-
+
     with torch.no_grad():
         # Initialize weights:
         # - Text part: copy from original lm_head (or zeros)
@@ -258,42 +258,42 @@ def convert_cosyvoice3_to_hf(
         new_lm_head.weight.data.zero_()
         if has_bias:
             new_lm_head.bias.data.fill_(-float('inf'))
-
+
         # Copy original lm_head for text tokens (optional)
         original_lm_head = qwen_model.lm_head
         if original_lm_head is not None and original_lm_head.weight.shape[0] >= text_vocab_size:
             new_lm_head.weight[:text_vocab_size] = original_lm_head.weight[:text_vocab_size]
             if has_bias and original_lm_head.bias is not None:
                 new_lm_head.bias[:text_vocab_size] = original_lm_head.bias[:text_vocab_size]
-
+
         # Copy llm_decoder for speech tokens
         decoder_size = min(llm_decoder.weight.shape[0], actual_speech_tokens)
         new_lm_head.weight[speech_token_offset:speech_token_offset + decoder_size] = \
             llm_decoder.weight[:decoder_size].to(new_lm_head.weight.dtype)
-
+
         if has_bias:
             new_lm_head.bias[speech_token_offset:speech_token_offset + decoder_size] = \
                 llm_decoder.bias[:decoder_size].to(new_lm_head.bias.dtype)
         else:
             # If llm_decoder has no bias but we want it for text tokens
             pass
-
+
     # Replace lm_head
     qwen_model.lm_head = new_lm_head
-
+
     logger.info(f"Created new lm_head with shape: {new_lm_head.weight.shape}")
     logger.info(f"Copied llm_decoder to lm_head[{speech_token_offset}:{speech_token_offset + decoder_size}]")
-
+
     # 6. Update model configuration
     qwen_model.config.vocab_size = padded_vocab_size
     qwen_model.config.tie_word_embeddings = False # Embeddings and lm_head are now different!
-
+
     # Set EOS token for generation (speech EOS lives inside speech_embedding as <|s_{base_speech_token_size+1}|>)
     base_speech_token_size = getattr(cosyvoice3_llm, "speech_token_size", 6561)
     eos_speech_idx = base_speech_token_size + 1
     eos_id = speech_token_offset + eos_speech_idx
     qwen_model.config.eos_token_id = eos_id
-
+
     # Generation settings
     qwen_model.generation_config.eos_token_id = eos_id
     qwen_model.generation_config.pad_token_id = eos_id
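Two details in the hunk above are easy to miss. First, rows of the padded vocabulary that are never overwritten keep the -inf bias, so they can never be sampled (softmax maps -inf to probability 0). Second, the EOS id arithmetic, using the script's visible defaults and a hypothetical offset:

base_speech_token_size = 6561                    # getattr fallback in the script
eos_speech_idx = base_speech_token_size + 1      # speech EOS is <|s_6562|>
speech_token_offset = 151_647                    # hypothetical text_vocab_size
eos_id = speech_token_offset + eos_speech_idx    # 158_209 in this example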
@@ -302,7 +302,7 @@ def convert_cosyvoice3_to_hf(
     qwen_model.generation_config.top_k = 25
     qwen_model.generation_config.repetition_penalty = 1.1
     qwen_model.generation_config.max_new_tokens = 2048
-
+
     # 7. Convert to target dtype
     dtype_map = {
         "float16": torch.float16,
@@ -311,16 +311,16 @@ def convert_cosyvoice3_to_hf(
     }
     target_dtype = dtype_map[dtype]
     qwen_model.to(target_dtype)
-
+
     # 8. Save model and tokenizer
     os.makedirs(output_dir, exist_ok=True)
-
+
     qwen_model.save_pretrained(output_dir)
-
+
     TEMPLATE = "{%- for message in messages %}{%- if message['role'] == 'user' %}{{- '<|sos|>' + message['content'] + '<|task_id|>' }}{%- elif message['role'] == 'assistant' %}{{- message['content']}}{%- endif %}{%- endfor %}"
     tokenizer.chat_template = TEMPLATE
     tokenizer.save_pretrained(output_dir)
-
+
     # Save metadata for TRT-LLM inference
     metadata = {
         "original_vocab_size": base_vocab_size,
@@ -332,30 +332,30 @@ def convert_cosyvoice3_to_hf(
         "speech_token_offset": speech_token_offset,
         "dtype": dtype,
     }
-
+
     import json
     with open(os.path.join(output_dir, "cosyvoice3_metadata.json"), "w") as f:
         json.dump(metadata, f, indent=2)
-
+
     logger.info(f"Saved HuggingFace model to {output_dir}")
     logger.info(f"Metadata: {metadata}")
-
+
     return output_dir, metadata


 def main():
     args = parse_args()
-
+
     output_dir = args.output_dir
     if output_dir is None:
         output_dir = os.path.join(args.model_dir, "hf_merged")
-
+
     convert_cosyvoice3_to_hf(
         model_dir=args.model_dir,
         output_dir=output_dir,
         dtype=args.dtype,
     )
-
+
     print("\n" + "=" * 70)
     print("✅ Conversion complete!")
     print("=" * 70)
