
Commit c7686fa

Author: root
Commit message: rename files
1 parent 8f0b28b commit c7686fa

7 files changed

Lines changed: 76 additions & 190 deletions

runtime/triton_trtllm/README.DIT.md renamed to runtime/triton_trtllm/README.Cosyvoice2.DiT.md

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@ This document describes how to accelerate CosyVoice with a DiT-based Token2Wav m
 
 Launch the service directly with Docker Compose:
 ```sh
-docker compose -f docker-compose.dit.yml up
+docker compose -f docker-compose.cosyvoice2.dit.yml up
 ```
 
 ### Build the Docker Image
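For reference, a minimal sketch of driving the renamed compose file, assuming it lives in runtime/triton_trtllm as in this repo:

```sh
cd runtime/triton_trtllm
# Start the CosyVoice2 DiT stack in the background.
docker compose -f docker-compose.cosyvoice2.dit.yml up -d
# Tail the logs to confirm the service comes up.
docker compose -f docker-compose.cosyvoice2.dit.yml logs -f
```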

runtime/triton_trtllm/README.md renamed to runtime/triton_trtllm/README.Cosyvoice2.Unet.md

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@ Contributed by Yuekai Zhang (NVIDIA).
 
 Launch the service directly with Docker Compose:
 ```sh
-docker compose up
+docker compose -f docker-compose.cosyvoice2.unet.yml up
 ```
 
 ### Build the Docker Image
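Because the compose file no longer uses the default name, tear-down must name it explicitly as well; a minimal sketch:

```sh
# Stop and remove the CosyVoice2 U-Net stack launched above.
docker compose -f docker-compose.cosyvoice2.unet.yml down
```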
File renamed without changes.
File renamed without changes.

runtime/triton_trtllm/requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -12,3 +12,4 @@ pyworld
 openai-whisper
 tritonclient
 modelscope
+x_transformers
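The new dependency is installed the usual way; for example, from runtime/triton_trtllm:

```sh
# Pull in all runtime dependencies, including the newly added x_transformers.
pip3 install -r requirements.txt
```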

runtime/triton_trtllm/run_cosyvoice3.sh

Lines changed: 21 additions & 135 deletions
@@ -10,44 +10,38 @@ export PYTHONPATH=${cosyvoice_path}/third_party/Matcha-TTS:$PYTHONPATH
 stage=$1
 stop_stage=$2
 
-huggingface_model_local_dir=./hf_cosyvoice3_llm
-model_scope_model_local_dir=/workspace_yuekai/HF/Fun-CosyVoice3-0.5B-2512
+huggingface_llm_local_dir=$cosyvoice_path/runtime/triton_trtllm/hf_cosyvoice3_llm
+cosyvoice3_official_model_dir=$cosyvoice_path/runtime/triton_trtllm/Fun-CosyVoice3-0.5B-2512
 
 trt_dtype=bfloat16
-trt_weights_dir=./trt_weights_${trt_dtype}
-trt_engines_dir=./trt_engines_${trt_dtype}
+trt_weights_dir=$cosyvoice_path/runtime/triton_trtllm/trt_weights_${trt_dtype}
+trt_engines_dir=$cosyvoice_path/runtime/triton_trtllm/trt_engines_${trt_dtype}
 
-model_repo_src=./model_repo_cosyvoice3
-model_repo=./deploy_cosyvoice3
-bls_instance_num=1
+model_repo_src=$cosyvoice_path/runtime/triton_trtllm/model_repo_cosyvoice3
+model_repo=$cosyvoice_path/runtime/triton_trtllm/model_repo_cosyvoice3_copy
+bls_instance_num=10
 
 if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
 
   echo "Cloning CosyVoice"
+  pip3 install --upgrade x_transformers s3tokenizer
   git clone --recursive https://github.com/FunAudioLLM/CosyVoice.git $cosyvoice_path
   cd $cosyvoice_path
   git submodule update --init --recursive
   cd runtime/triton_trtllm
 fi
 
 if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
-  echo ""
-  # see https://github.com/nvidia-china-sae/mair-hub/blob/main/rl-tutorial/cosyvoice_llm/pretrained_to_huggingface.py
-  # huggingface-cli download --local-dir $huggingface_model_local_dir yuekai/cosyvoice2_llm
-  # modelscope download --model iic/CosyVoice2-0.5B --local_dir $model_scope_model_local_dir
-
-  # pip3 install --upgrade x_transformers s3tokenizer
-  # pip install -U nvidia-modelopt[all]
-  python3 scripts/convert_cosyvoice3_to_hf.py \
-    --model-dir $model_scope_model_local_dir \
-    --output-dir $huggingface_model_local_dir || exit 1 # TODO: output dir should be here
-
+  echo "Downloading CosyVoice3 Checkpoints"
+  huggingface-cli download --local-dir $huggingface_llm_local_dir yuekai/Fun-CosyVoice3-0.5B-2512-LLM-HF
+  huggingface-cli download --local-dir $cosyvoice3_official_model_dir yuekai/Fun-CosyVoice3-0.5B-2512-FP16-ONNX
+  huggingface-cli download --local-dir $cosyvoice3_official_model_dir FunAudioLLM/Fun-CosyVoice3-0.5B-2512
 fi
 
 
 if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
   echo "Converting checkpoint to TensorRT weights"
-  python3 scripts/convert_checkpoint.py --model_dir $huggingface_model_local_dir \
+  python3 scripts/convert_checkpoint.py --model_dir $huggingface_llm_local_dir \
    --output_dir $trt_weights_dir \
    --dtype $trt_dtype || exit 1
 
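The script takes a start stage and a stop stage as positional arguments (stage=$1, stop_stage=$2 above), so the setup steps can be chained; a minimal usage sketch:

```sh
# Stages -1 through 1: clone CosyVoice, download the CosyVoice3 checkpoints,
# and convert the HF LLM checkpoint into TensorRT weights (stage 1 also
# builds and tests the engines, per the next hunk).
bash run_cosyvoice3.sh -1 1
```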
@@ -60,7 +54,7 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
 
   echo "Testing TensorRT engines"
   python3 ./scripts/test_llm.py --input_text "你好,请问你叫什么?" \
-    --tokenizer_dir $huggingface_model_local_dir \
+    --tokenizer_dir $huggingface_llm_local_dir \
    --top_k 50 --top_p 0.95 --temperature 0.8 \
    --engine_dir=$trt_engines_dir || exit 1
 fi
@@ -78,8 +72,8 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
   cp -r ${model_repo_src}/speaker_embedding $model_repo/
 
   MAX_QUEUE_DELAY_MICROSECONDS=0
-  MODEL_DIR=$model_scope_model_local_dir
-  LLM_TOKENIZER_DIR=$huggingface_model_local_dir
+  MODEL_DIR=$cosyvoice3_official_model_dir
+  LLM_TOKENIZER_DIR=$huggingface_llm_local_dir
   BLS_INSTANCE_NUM=$bls_instance_num
   TRITON_MAX_BATCH_SIZE=1
   DECOUPLED_MODE=True
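These stage 2 variables are the deployment knobs for the copied Triton model repo; the substitution mechanism is outside this hunk's context, so the annotations in this sketch are inferred rather than confirmed:

```sh
MODEL_DIR=$cosyvoice3_official_model_dir      # official CosyVoice3 assets (ONNX / token2wav)
LLM_TOKENIZER_DIR=$huggingface_llm_local_dir  # HF tokenizer used by the TRT-LLM engine
BLS_INSTANCE_NUM=$bls_instance_num            # now 10 parallel BLS instances (was 1)
TRITON_MAX_BATCH_SIZE=1
DECOUPLED_MODE=True                           # decoupled mode is what allows streaming responses
```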
@@ -92,44 +86,16 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
 
 fi
 
-if [ $stage -le 30 ] && [ $stop_stage -ge 30 ]; then
-  echo "Starting CosyVoice3 Triton server and LLM using trtllm-serve"
-  CUDA_VISIBLE_DEVICES=0 mpirun -np 1 --allow-run-as-root --oversubscribe trtllm-serve serve --tokenizer $huggingface_model_local_dir $trt_engines_dir --max_batch_size 64 --kv_cache_free_gpu_memory_fraction 0.4
-fi
-
-
-if [ $stage -le 40 ] && [ $stop_stage -ge 40 ]; then
-
-  CUDA_VISIBLE_DEVICES=1 tritonserver --model-repository $model_repo --http-port 18000 --grpc-port 18001 --metrics-port 18002 &
-fi
-
-
 if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
   echo "Starting CosyVoice3 Triton server and LLM using trtllm-serve"
-  CUDA_VISIBLE_DEVICES=0 mpirun -np 1 --allow-run-as-root --oversubscribe trtllm-serve serve --tokenizer $huggingface_model_local_dir $trt_engines_dir --max_batch_size 64 --kv_cache_free_gpu_memory_fraction 0.4 &
-  CUDA_VISIBLE_DEVICES=0,1,2,3 tritonserver --model-repository $model_repo --http-port 18000 --grpc-port 18001 --metrics-port 18002 &
+  CUDA_VISIBLE_DEVICES=0 mpirun -np 1 --allow-run-as-root --oversubscribe trtllm-serve serve --tokenizer $huggingface_llm_local_dir $trt_engines_dir --max_batch_size 64 --kv_cache_free_gpu_memory_fraction 0.4 &
+  CUDA_VISIBLE_DEVICES=0 tritonserver --model-repository $model_repo --http-port 18000 --grpc-port 18001 --metrics-port 18002 &
   wait
-  # Test using curl
-  # curl http://localhost:8000/v1/chat/completions \
-  #   -H "Content-Type: application/json" \
-  #   -d '{
-  #     "model": "",
-  #     "messages":[{"role": "user", "content": "Where is New York?"},
-  #                 {"role": "assistant", "content": "<|s_1708|><|s_2050|><|s_2159|>"}],
-  #     "max_tokens": 512,
-  #     "temperature": 0.8,
-  #     "top_p": 0.95,
-  #     "top_k": 50,
-  #     "stop": ["<|eos1|>"],
-  #     "repetition_penalty": 1.2,
-  #     "stream": false
-  #   }'
 fi
 
 if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
   echo "Running benchmark client for CosyVoice3"
   num_task=4
-  mode=offline
   mode=streaming
   BLS_INSTANCE_NUM=$bls_instance_num
 
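The curl smoke test removed from stage 3 above still applies once both servers are up; a sketch assuming trtllm-serve's OpenAI-compatible endpoint on its default port 8000, mirroring the removed comment:

```sh
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "",
    "messages": [{"role": "user", "content": "Where is New York?"},
                 {"role": "assistant", "content": "<|s_1708|><|s_2050|><|s_2159|>"}],
    "max_tokens": 512,
    "temperature": 0.8,
    "top_p": 0.95,
    "top_k": 50,
    "stop": ["<|eos1|>"],
    "repetition_penalty": 1.2,
    "stream": false
  }'
```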
@@ -145,102 +111,22 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
 fi
 
 if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
-  echo "stage 5: Offline TTS (Cosyvoice2 LLM + Step-Audio2-mini DiT Token2Wav) inference using a single python script"
-
-  datasets=(wenetspeech4tts) # wenetspeech4tts, test_zh, zero_shot_zh
-  backend=trtllm # hf, trtllm, vllm, trtllm-serve
-
-  batch_sizes=(16)
-  token2wav_batch_size=1
-
-  for batch_size in ${batch_sizes[@]}; do
-    for dataset in ${datasets[@]}; do
-      output_dir=./${dataset}_${backend}_llm_batch_size_${batch_size}_token2wav_batch_size_${token2wav_batch_size}
-      CUDA_VISIBLE_DEVICES=1 \
-      python3 offline_inference.py \
-        --output-dir $output_dir \
-        --llm-model-name-or-path $huggingface_model_local_dir \
-        --token2wav-path $step_audio_model_dir/token2wav \
-        --backend $backend \
-        --batch-size $batch_size --token2wav-batch-size $token2wav_batch_size \
-        --engine-dir $trt_engines_dir \
-        --split-name ${dataset} || exit 1
-    done
-  done
-fi
-
-
-
-
-if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
-  echo "Disaggregated Server: LLM and Token2wav on different GPUs"
-  echo "Starting LLM server on GPU 0"
-  export CUDA_VISIBLE_DEVICES=0
-  mpirun -np 1 --allow-run-as-root --oversubscribe trtllm-serve serve --tokenizer $huggingface_model_local_dir $trt_engines_dir --max_batch_size 64 --kv_cache_free_gpu_memory_fraction 0.4 &
-  echo "Starting Token2wav server on GPUs 1-3"
-  Token2wav_num_gpus=3
-  http_port=17000
-  grpc_port=18000
-  metrics_port=16000
-  for i in $(seq 0 $(($Token2wav_num_gpus - 1))); do
-    echo "Starting server on GPU $i"
-    http_port=$((http_port + 1))
-    grpc_port=$((grpc_port + 1))
-    metrics_port=$((metrics_port + 1))
-    # Two instances of Token2wav server on the same GPU
-    CUDA_VISIBLE_DEVICES=$(($i + 1)) tritonserver --model-repository $model_repo --http-port $http_port --grpc-port $grpc_port --metrics-port $metrics_port &
-    http_port=$((http_port + 1))
-    grpc_port=$((grpc_port + 1))
-    metrics_port=$((metrics_port + 1))
-    CUDA_VISIBLE_DEVICES=$(($i + 1)) tritonserver --model-repository $model_repo --http-port $http_port --grpc-port $grpc_port --metrics-port $metrics_port &
-  done
-  wait
-fi
-
-if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
-  echo "Running benchmark client for Disaggregated Server"
-  per_gpu_instances=2
-  mode=streaming
-  BLS_INSTANCE_NUM=$bls_instance_num
-  Token2wav_num_gpus=(1 2 3)
-  concurrent_tasks=(1 2 3 4 5 6)
-  for n_gpu in ${Token2wav_num_gpus[@]}; do
-    echo "Test 1 GPU for LLM server and $n_gpu GPUs for Token2wav servers"
-    for concurrent_task in ${concurrent_tasks[@]}; do
-      num_instances=$((per_gpu_instances * n_gpu))
-      for i in $(seq 1 $num_instances); do
-        port=$(($i + 18000))
-        python3 client_grpc.py \
-          --server-addr localhost \
-          --server-port $port \
-          --model-name cosyvoice2_dit \
-          --num-tasks $concurrent_task \
-          --mode $mode \
-          --huggingface-dataset yuekai/seed_tts_cosy2 \
-          --log-dir ./log_disagg_concurrent_tasks_${concurrent_task}_per_instance_total_token2wav_instances_${num_instances}_port_${port} &
-      done
-      wait
-    done
-  done
-fi
-
-if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then
   echo "stage 10: Python script CosyVoice3 TTS (LLM + CosyVoice3 Token2Wav) inference"
 
   datasets=(wenetspeech4tts) # wenetspeech4tts
   backend=trtllm-serve # hf, trtllm, vllm, trtllm-serve
 
   batch_sizes=(1)
-  token2wav_batch_size=1
+  token2wav_batch_size=1 # Only support 1 for now
 
   for batch_size in ${batch_sizes[@]}; do
     for dataset in ${datasets[@]}; do
       output_dir=./cosyvoice3_${dataset}_${backend}_llm_batch_size_${batch_size}_token2wav_batch_size_${token2wav_batch_size}_streaming_trt
       CUDA_VISIBLE_DEVICES=0 \
       python3 infer_cosyvoice3.py \
         --output-dir $output_dir \
-        --llm-model-name-or-path $huggingface_model_local_dir \
-        --token2wav-path $model_scope_model_local_dir \
+        --llm-model-name-or-path $huggingface_llm_local_dir \
+        --token2wav-path $cosyvoice3_official_model_dir \
        --backend $backend \
        --batch-size $batch_size --token2wav-batch-size $token2wav_batch_size \
        --engine-dir $trt_engines_dir \
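Putting the stages together, an end-to-end run with the renamed paths might look like this sketch; stage 3 blocks on `wait`, so the inference stage goes in a second shell:

```sh
# Shell 1: download checkpoints, build engines, prepare the model repo,
# then start trtllm-serve and tritonserver (this call blocks on `wait`).
bash run_cosyvoice3.sh 0 3

# Shell 2: once the servers are up, run the batch inference pipeline.
bash run_cosyvoice3.sh 5 5
```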
