@@ -10,44 +10,38 @@ export PYTHONPATH=${cosyvoice_path}/third_party/Matcha-TTS:$PYTHONPATH
 stage=$1
 stop_stage=$2
 
-huggingface_model_local_dir=./hf_cosyvoice3_llm
-model_scope_model_local_dir=/workspace_yuekai/HF/Fun-CosyVoice3-0.5B-2512
+huggingface_llm_local_dir=$cosyvoice_path/runtime/triton_trtllm/hf_cosyvoice3_llm
+cosyvoice3_official_model_dir=$cosyvoice_path/runtime/triton_trtllm/Fun-CosyVoice3-0.5B-2512
 
 trt_dtype=bfloat16
-trt_weights_dir=./trt_weights_${trt_dtype}
-trt_engines_dir=./trt_engines_${trt_dtype}
+trt_weights_dir=$cosyvoice_path/runtime/triton_trtllm/trt_weights_${trt_dtype}
+trt_engines_dir=$cosyvoice_path/runtime/triton_trtllm/trt_engines_${trt_dtype}
 
-model_repo_src=./model_repo_cosyvoice3
-model_repo=./deploy_cosyvoice3
-bls_instance_num=1
+model_repo_src=$cosyvoice_path/runtime/triton_trtllm/model_repo_cosyvoice3
+model_repo=$cosyvoice_path/runtime/triton_trtllm/model_repo_cosyvoice3_copy
+bls_instance_num=10
 
 if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
 
     echo "Cloning CosyVoice"
+    pip3 install --upgrade x_transformers s3tokenizer
     git clone --recursive https://github.com/FunAudioLLM/CosyVoice.git $cosyvoice_path
     cd $cosyvoice_path
     git submodule update --init --recursive
     cd runtime/triton_trtllm
 fi
 
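The `stage`/`stop_stage` pair gates which blocks below run, so a range of steps can be executed in one invocation. A minimal usage sketch, assuming the script is saved as `run.sh` under `runtime/triton_trtllm` (the file name is an assumption here):

```bash
# Run download (stage 0) through server launch (stage 3) in one go;
# stage and stop_stage arrive as $1 and $2.
cd ${cosyvoice_path}/runtime/triton_trtllm
bash run.sh 0 3
```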
 if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
-    echo " "
-    # see https://github.com/nvidia-china-sae/mair-hub/blob/main/rl-tutorial/cosyvoice_llm/pretrained_to_huggingface.py
-    # huggingface-cli download --local-dir $huggingface_model_local_dir yuekai/cosyvoice2_llm
-    # modelscope download --model iic/CosyVoice2-0.5B --local_dir $model_scope_model_local_dir
-
-    # pip3 install --upgrade x_transformers s3tokenizer
-    # pip install -U nvidia-modelopt[all]
-    python3 scripts/convert_cosyvoice3_to_hf.py \
-        --model-dir $model_scope_model_local_dir \
-        --output-dir $huggingface_model_local_dir || exit 1 # TODO: output dir should be here
-
+    echo "Downloading CosyVoice3 Checkpoints"
+    huggingface-cli download --local-dir $huggingface_llm_local_dir yuekai/Fun-CosyVoice3-0.5B-2512-LLM-HF
+    huggingface-cli download --local-dir $cosyvoice3_official_model_dir yuekai/Fun-CosyVoice3-0.5B-2512-FP16-ONNX
+    huggingface-cli download --local-dir $cosyvoice3_official_model_dir FunAudioLLM/Fun-CosyVoice3-0.5B-2512
 fi
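The ONNX assets and the official checkpoint deliberately share `$cosyvoice3_official_model_dir`. If huggingface.co is unreachable, the same CLI works through a mirror endpoint; a sketch (the mirror URL is an assumption, substitute your own):

```bash
# huggingface_hub honors HF_ENDPOINT for all CLI downloads.
export HF_ENDPOINT=https://hf-mirror.com   # assumed mirror; replace as needed
huggingface-cli download --local-dir $cosyvoice3_official_model_dir FunAudioLLM/Fun-CosyVoice3-0.5B-2512
```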
 
 
 if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
     echo "Converting checkpoint to TensorRT weights"
-    python3 scripts/convert_checkpoint.py --model_dir $huggingface_model_local_dir \
+    python3 scripts/convert_checkpoint.py --model_dir $huggingface_llm_local_dir \
         --output_dir $trt_weights_dir \
         --dtype $trt_dtype || exit 1
 
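The lines elided between these two hunks compile `$trt_weights_dir` into engines at `$trt_engines_dir`. A hypothetical sketch of that step, following the usual TensorRT-LLM recipe (everything beyond `--checkpoint_dir`/`--output_dir` is an assumption, not this repo's actual invocation):

```bash
# Assumed build step: turn the converted weights into serialized TensorRT engines.
trtllm-build --checkpoint_dir $trt_weights_dir \
    --output_dir $trt_engines_dir \
    --max_batch_size 64   # assumption, mirroring the trtllm-serve flag used below
```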
@@ -60,7 +54,7 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
 
     echo "Testing TensorRT engines"
     python3 ./scripts/test_llm.py --input_text "你好,请问你叫什么?" \
-        --tokenizer_dir $huggingface_model_local_dir \
+        --tokenizer_dir $huggingface_llm_local_dir \
         --top_k 50 --top_p 0.95 --temperature 0.8 \
         --engine_dir=$trt_engines_dir || exit 1
 fi
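After a successful stage 1, the engine directory should contain the serialized engine plus its build config; a quick sanity check (file names follow current TensorRT-LLM conventions and may vary by version):

```bash
# For a single-rank build, expect config.json and rank0.engine here.
ls $trt_engines_dir
```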
@@ -78,8 +72,8 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
     cp -r ${model_repo_src}/speaker_embedding $model_repo/
 
     MAX_QUEUE_DELAY_MICROSECONDS=0
-    MODEL_DIR=$model_scope_model_local_dir
-    LLM_TOKENIZER_DIR=$huggingface_model_local_dir
+    MODEL_DIR=$cosyvoice3_official_model_dir
+    LLM_TOKENIZER_DIR=$huggingface_llm_local_dir
     BLS_INSTANCE_NUM=$bls_instance_num
     TRITON_MAX_BATCH_SIZE=1
     DECOUPLED_MODE=True
@@ -92,44 +86,16 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
 
 fi
 
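The elided body of this hunk substitutes `MODEL_DIR`, `LLM_TOKENIZER_DIR`, `BLS_INSTANCE_NUM`, and the other variables above into the copied model repository's `config.pbtxt` templates. Comparable Triton + TensorRT-LLM recipes do this with a `fill_template.py` helper; a hypothetical sketch (script path and template keys are assumptions):

```bash
# Assumed template fill: bake deployment parameters into one model's Triton config.
python3 scripts/fill_template.py -i $model_repo/cosyvoice3/config.pbtxt \
    model_dir:${MODEL_DIR},llm_tokenizer_dir:${LLM_TOKENIZER_DIR},bls_instance_num:${BLS_INSTANCE_NUM},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},decoupled_mode:${DECOUPLED_MODE},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS}
```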
-if [ $stage -le 30 ] && [ $stop_stage -ge 30 ]; then
-    echo "Starting CosyVoice3 Triton server and LLM using trtllm-serve"
-    CUDA_VISIBLE_DEVICES=0 mpirun -np 1 --allow-run-as-root --oversubscribe trtllm-serve serve --tokenizer $huggingface_model_local_dir $trt_engines_dir --max_batch_size 64 --kv_cache_free_gpu_memory_fraction 0.4
-fi
-
-
-if [ $stage -le 40 ] && [ $stop_stage -ge 40 ]; then
-
-    CUDA_VISIBLE_DEVICES=1 tritonserver --model-repository $model_repo --http-port 18000 --grpc-port 18001 --metrics-port 18002 &
-fi
-
-
 if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
     echo "Starting CosyVoice3 Triton server and LLM using trtllm-serve"
-    CUDA_VISIBLE_DEVICES=0 mpirun -np 1 --allow-run-as-root --oversubscribe trtllm-serve serve --tokenizer $huggingface_model_local_dir $trt_engines_dir --max_batch_size 64 --kv_cache_free_gpu_memory_fraction 0.4 &
-    CUDA_VISIBLE_DEVICES=0,1,2,3 tritonserver --model-repository $model_repo --http-port 18000 --grpc-port 18001 --metrics-port 18002 &
+    CUDA_VISIBLE_DEVICES=0 mpirun -np 1 --allow-run-as-root --oversubscribe trtllm-serve serve --tokenizer $huggingface_llm_local_dir $trt_engines_dir --max_batch_size 64 --kv_cache_free_gpu_memory_fraction 0.4 &
+    CUDA_VISIBLE_DEVICES=0 tritonserver --model-repository $model_repo --http-port 18000 --grpc-port 18001 --metrics-port 18002 &
     wait
-    # Test using curl
-    # curl http://localhost:8000/v1/chat/completions \
-    #     -H "Content-Type: application/json" \
-    #     -d '{
-    #         "model": "",
-    #         "messages": [{"role": "user", "content": "Where is New York?"},
-    #                      {"role": "assistant", "content": "<|s_1708|><|s_2050|><|s_2159|>"}],
-    #         "max_tokens": 512,
-    #         "temperature": 0.8,
-    #         "top_p": 0.95,
-    #         "top_k": 50,
-    #         "stop": ["<|eos1|>"],
-    #         "repetition_penalty": 1.2,
-    #         "stream": false
-    #     }'
 fi
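The commented curl smoke test was dropped from the script, but it remains a handy check that the trtllm-serve OpenAI endpoint is up. The payload below is copied from the removed comment; port 8000 is trtllm-serve's default HTTP port:

```bash
curl http://localhost:8000/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "",
        "messages": [{"role": "user", "content": "Where is New York?"},
                     {"role": "assistant", "content": "<|s_1708|><|s_2050|><|s_2159|>"}],
        "max_tokens": 512,
        "temperature": 0.8,
        "top_p": 0.95,
        "top_k": 50,
        "stop": ["<|eos1|>"],
        "repetition_penalty": 1.2,
        "stream": false
    }'
```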
 
 if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
     echo "Running benchmark client for CosyVoice3"
     num_task=4
-    mode=offline
     mode=streaming
     BLS_INSTANCE_NUM=$bls_instance_num
 
@@ -145,102 +111,22 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
 fi
 
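Stage 4's client invocation is elided by this hunk; the removed stage 8 further down shows the shape of the call, so a single-server benchmark presumably looks like the following (model name and dataset are copied from that deleted stage and may differ for CosyVoice3):

```bash
# Assumed benchmark call against the Triton gRPC port started in stage 3.
python3 client_grpc.py \
    --server-addr localhost \
    --server-port 18001 \
    --model-name cosyvoice2_dit \
    --num-tasks $num_task \
    --mode $mode \
    --huggingface-dataset yuekai/seed_tts_cosy2 \
    --log-dir ./log_concurrent_tasks_${num_task}_${mode}
```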
 if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
-    echo "stage 5: Offline TTS (CosyVoice2 LLM + Step-Audio2-mini DiT Token2Wav) inference using a single Python script"
-
-    datasets=(wenetspeech4tts) # wenetspeech4tts, test_zh, zero_shot_zh
-    backend=trtllm # hf, trtllm, vllm, trtllm-serve
-
-    batch_sizes=(16)
-    token2wav_batch_size=1
-
-    for batch_size in ${batch_sizes[@]}; do
-        for dataset in ${datasets[@]}; do
-            output_dir=./${dataset}_${backend}_llm_batch_size_${batch_size}_token2wav_batch_size_${token2wav_batch_size}
-            CUDA_VISIBLE_DEVICES=1 \
-            python3 offline_inference.py \
-                --output-dir $output_dir \
-                --llm-model-name-or-path $huggingface_model_local_dir \
-                --token2wav-path $step_audio_model_dir/token2wav \
-                --backend $backend \
-                --batch-size $batch_size --token2wav-batch-size $token2wav_batch_size \
-                --engine-dir $trt_engines_dir \
-                --split-name ${dataset} || exit 1
-        done
-    done
-fi
-
-
-
-
-if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
-    echo "Disaggregated Server: LLM and Token2wav on different GPUs"
-    echo "Starting LLM server on GPU 0"
-    export CUDA_VISIBLE_DEVICES=0
-    mpirun -np 1 --allow-run-as-root --oversubscribe trtllm-serve serve --tokenizer $huggingface_model_local_dir $trt_engines_dir --max_batch_size 64 --kv_cache_free_gpu_memory_fraction 0.4 &
-    echo "Starting Token2wav server on GPUs 1-3"
-    Token2wav_num_gpus=3
-    http_port=17000
-    grpc_port=18000
-    metrics_port=16000
-    for i in $(seq 0 $(($Token2wav_num_gpus - 1))); do
-        echo "Starting server on GPU $i"
-        http_port=$((http_port + 1))
-        grpc_port=$((grpc_port + 1))
-        metrics_port=$((metrics_port + 1))
-        # Two instances of Token2wav server on the same GPU
-        CUDA_VISIBLE_DEVICES=$(($i + 1)) tritonserver --model-repository $model_repo --http-port $http_port --grpc-port $grpc_port --metrics-port $metrics_port &
-        http_port=$((http_port + 1))
-        grpc_port=$((grpc_port + 1))
-        metrics_port=$((metrics_port + 1))
-        CUDA_VISIBLE_DEVICES=$(($i + 1)) tritonserver --model-repository $model_repo --http-port $http_port --grpc-port $grpc_port --metrics-port $metrics_port &
-    done
-    wait
-fi
-
-if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
-    echo "Running benchmark client for Disaggregated Server"
-    per_gpu_instances=2
-    mode=streaming
-    BLS_INSTANCE_NUM=$bls_instance_num
-    Token2wav_num_gpus=(1 2 3)
-    concurrent_tasks=(1 2 3 4 5 6)
-    for n_gpu in ${Token2wav_num_gpus[@]}; do
-        echo "Test 1 GPU for LLM server and $n_gpu GPUs for Token2wav servers"
-        for concurrent_task in ${concurrent_tasks[@]}; do
-            num_instances=$((per_gpu_instances * n_gpu))
-            for i in $(seq 1 $num_instances); do
-                port=$(($i + 18000))
-                python3 client_grpc.py \
-                    --server-addr localhost \
-                    --server-port $port \
-                    --model-name cosyvoice2_dit \
-                    --num-tasks $concurrent_task \
-                    --mode $mode \
-                    --huggingface-dataset yuekai/seed_tts_cosy2 \
-                    --log-dir ./log_disagg_concurrent_tasks_${concurrent_task}_per_instance_total_token2wav_instances_${num_instances}_port_${port} &
-            done
-            wait
-        done
-    done
-fi
-
-if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then
     echo "stage 10: Python script CosyVoice3 TTS (LLM + CosyVoice3 Token2Wav) inference"
 
     datasets=(wenetspeech4tts) # wenetspeech4tts
     backend=trtllm-serve # hf, trtllm, vllm, trtllm-serve
 
     batch_sizes=(1)
-    token2wav_batch_size=1
+    token2wav_batch_size=1 # only batch size 1 is supported for now
 
     for batch_size in ${batch_sizes[@]}; do
         for dataset in ${datasets[@]}; do
             output_dir=./cosyvoice3_${dataset}_${backend}_llm_batch_size_${batch_size}_token2wav_batch_size_${token2wav_batch_size}_streaming_trt
             CUDA_VISIBLE_DEVICES=0 \
             python3 infer_cosyvoice3.py \
                 --output-dir $output_dir \
-                --llm-model-name-or-path $huggingface_model_local_dir \
-                --token2wav-path $model_scope_model_local_dir \
+                --llm-model-name-or-path $huggingface_llm_local_dir \
+                --token2wav-path $cosyvoice3_official_model_dir \
                 --backend $backend \
                 --batch-size $batch_size --token2wav-batch-size $token2wav_batch_size \
                 --engine-dir $trt_engines_dir \