Add example scripts for monologue TTS.

This commit is contained in:
xiehanke
2025-10-30 23:10:48 +08:00
parent 18915ecd68
commit a32b4ec3b8
2 changed files with 107 additions and 0 deletions

83
cli/tts.py Normal file
View File

@@ -0,0 +1,83 @@
import os
import json
import torch
import argparse
import s3tokenizer
import soundfile as sf
from soulxpodcast.config import SamplingParams
from soulxpodcast.utils.parser import podcast_format_parser
from soulxpodcast.utils.infer_utils import initiate_model, process_single_input
def run_inference(
    inputs: dict,
    model_path: str,
    output_path: str,
    llm_engine: str = "hf",
    fp16_flow: bool = False,
    seed: int = 1988,
):
    """Synthesize a monologue wav from parsed podcast-format inputs.

    Args:
        inputs: Dict produced by `podcast_format_parser`; must contain the
            keys 'text', 'prompt_wav', 'prompt_text', 'use_dialect_prompt'
            and 'dialect_prompt_text'.
        model_path: Path to the pretrained SoulX-Podcast model directory.
        output_path: Destination wav file; parent directories are created
            on demand.
        llm_engine: LLM backend, "hf" or "vllm".
        fp16_flow: Run the flow module in FP16 when True.
        seed: Random seed forwarded to model initialization.

    Raises:
        RuntimeError: If the model returns no generated audio segments.
    """
    model, dataset = initiate_model(seed, model_path, llm_engine, fp16_flow)
    data = process_single_input(
        dataset,
        inputs['text'],
        inputs['prompt_wav'],
        inputs['prompt_text'],
        inputs['use_dialect_prompt'],
        inputs['dialect_prompt_text'],
    )
    print("[INFO] Start inference...")
    results_dict = model.forward_longform(**data)
    # Gather all segments and concatenate once: repeated torch.cat in a loop
    # re-copies the accumulated audio on every iteration (quadratic copying).
    wavs = list(results_dict["generated_wavs"])
    if not wavs:
        # Previously this fell through with target_audio=None and crashed
        # inside sf.write with an unhelpful error.
        raise RuntimeError("Model produced no generated audio segments.")
    target_audio = wavs[0] if len(wavs) == 1 else torch.cat(wavs, dim=1)
    # os.makedirs("") raises FileNotFoundError; only create a directory when
    # output_path actually has a directory component.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    sf.write(output_path, target_audio.cpu().squeeze(0).numpy(), 24000)
    print(f"[INFO] Saved synthesized audio to: {output_path}")
if __name__ == "__main__":
    # Single-speaker (monologue) TTS entry point: wrap the CLI arguments into
    # the podcast-format dict expected by podcast_format_parser, then run
    # long-form inference and write the result wav.
    parser = argparse.ArgumentParser()  # was constructed twice; once is enough
    parser.add_argument("--text", required=True, help="The text used for generating real audio")
    parser.add_argument("--prompt_text", required=True, help="The text used for prompt")
    parser.add_argument("--dialect_prompt", default="", help="The prompt dialect text used for prompt")
    # Help text fixed: this is the prompt audio path, not a JSON file.
    parser.add_argument("--prompt_audio", required=True, help="Path to the prompt audio file")
    parser.add_argument("--model_path", required=True, help="Path to the model file")
    parser.add_argument("--output_path", default="outputs/result.wav", help="Path to the output audio file")
    parser.add_argument("--llm_engine", default="hf", choices=["hf", "vllm"], help="Inference engine to use")
    parser.add_argument("--fp16_flow", action="store_true", help="Enable FP16 flow")
    parser.add_argument("--seed", type=int, default=1988, help="Random seed")
    args = parser.parse_args()

    # Monologue = a single speaker "S1" uttering a single line of text.
    data = {
        "speakers": {
            "S1": {
                "prompt_audio": args.prompt_audio,
                "prompt_text": args.prompt_text,
                "dialect_prompt": args.dialect_prompt,
            }
        },
        "text": [
            ["S1", args.text]
        ]
    }
    inputs = podcast_format_parser(data)
    run_inference(
        inputs=inputs,
        model_path=args.model_path,
        output_path=args.output_path,
        llm_engine=args.llm_engine,
        fp16_flow=args.fp16_flow,
        seed=args.seed,
    )

24
example/infer_tts.sh Normal file
View File

@@ -0,0 +1,24 @@
#!/usr/bin/env bash
# Monologue TTS inference example for SoulX-Podcast.

# Make the repository root importable regardless of the caller's CWD.
export PYTHONPATH="$(cd "$(dirname "${BASH_SOURCE[0]}")"/.. && pwd)"
echo "PYTHONPATH set to: $PYTHONPATH"

# Note: To infer Chinese dialects, set model_dir to "pretrained_models/SoulX-Podcast-1.7B-dialect"
# and follow example/podcast_script/script_mandarin to set dialect_prompt and pass it to
# entrance tts.py.
# Example: infer chinese sichuan dialects with mandarin prompt:
# dialect_prompt="<|Sichuan|>要得要得!前头几个耍洋盘,我后脚就背起铺盖卷去景德镇耍泥巴,巴适得喊老天爷!"
# text="<|Sichuan|>一个着迷于在岩壁与雪原间捕捉语言灵感的旅人,即将奔赴景德镇将朝露炊烟揉进陶胚的造梦者。"
model_dir="pretrained_models/SoulX-Podcast-1.7B"
prompt_text="喜欢攀岩、徒步、滑雪的语言爱好者,以及过两天要带着全部家当去景德镇做陶瓷的白日梦想家。"
dialect_prompt=""
prompt_audio="example/audios/female_mandarin.wav"
text="一个着迷于在岩壁与雪原间捕捉语言灵感的旅人,即将奔赴景德镇将朝露炊烟揉进陶胚的造梦者。"

# Quote every expansion: unquoted ${text}/${prompt_text} would be word-split
# and glob-expanded the moment a value contains whitespace or metacharacters.
python cli/tts.py \
    --prompt_text "${prompt_text}" \
    --dialect_prompt "${dialect_prompt:-}" \
    --prompt_audio "${prompt_audio}" \
    --text "${text}" \
    --model_path "${model_dir}" \
    --output_path outputs/mandarin_tts.wav \
    --seed 7