目标
这里只做 serving(推理服务),不涉及训练
模型直接通过 Hugging Face 的 transformers 包加载
huggingface
可以理解为模型超市,可以根据业务需求,挑选可用模型
当然,你可以自己去训练一个模型
我选了微软的 TTS 模型(microsoft/speecht5_tts)
代码
# Text-to-speech demo: run Microsoft's SpeechT5 model through the
# Hugging Face `pipeline` API and write the synthesized audio to a WAV file.
#
# Fixes vs. the original snippet: `pipeline` and `torch` were used
# without being imported, so the script crashed with a NameError.
import torch
from datasets import load_dataset
from transformers import pipeline
import soundfile as sf

# The "text-to-speech" pipeline downloads microsoft/speecht5_tts from the Hub.
synthesiser = pipeline("text-to-speech", "microsoft/speecht5_tts")

# SpeechT5 conditions the voice on a speaker x-vector embedding; the
# CMU ARCTIC x-vector dataset provides ready-made ones.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
# Index 7306 is an arbitrary speaker choice; unsqueeze adds the batch dim.
speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
# You can replace this embedding with your own as well.
speech = synthesiser("Hello, my dog is cooler than you!", forward_params={"speaker_embeddings": speaker_embedding})

# The pipeline returns {"audio": ndarray, "sampling_rate": int} — use the
# model-reported rate rather than hard-coding one.
sf.write("speech.wav", speech["audio"], samplerate=speech["sampling_rate"])
改用本地模型的代码
我有预训练好的模型,放在本地也可以直接用
# Same synthesis, but loading a locally saved SpeechT5 checkpoint instead
# of pulling it from the Hub.
#
# Fixes vs. the original snippet:
#   * `torch` and `load_dataset` were used/needed but never imported, and
#     `embeddings_dataset` was referenced without being defined.
#   * `SpeechT5ForConditionalGeneration` / `Speech2TextProcessor` are not the
#     SpeechT5 TTS classes — the public API is `SpeechT5ForTextToSpeech` and
#     `SpeechT5Processor` (`Speech2TextProcessor` belongs to the Speech2Text
#     ASR model family).
#   * `from_pretrained` expects a checkpoint directory (created with
#     `save_pretrained`), not a bare `.pt` file.
#   * SpeechT5's TTS entry point is `generate_speech(...)` with a HiFi-GAN
#     vocoder; `model.generate(input_ids, speaker_embeds=...)` is not a
#     supported call.
#   * SpeechT5 produces audio at 16 kHz, not 22050 Hz.
import torch
import soundfile as sf
from datasets import load_dataset
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor

# Directory produced by model.save_pretrained(...) / processor.save_pretrained(...).
model_dir = "path/to/speecht5_base"
model = SpeechT5ForTextToSpeech.from_pretrained(model_dir)
processor = SpeechT5Processor.from_pretrained(model_dir)
# HiFi-GAN vocoder turns the model's output spectrogram into a waveform.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

text = "Hello, my dog is cooler than you!"
speaker_id = 7306

# Speaker x-vector embeddings; this load was missing from the original snippet.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")

input_ids = processor(text=text, return_tensors="pt").input_ids.to(model.device)
speaker_embedding = torch.tensor(embeddings_dataset[speaker_id]["xvector"]).unsqueeze(0).to(model.device)

# With a vocoder supplied, generate_speech returns the waveform tensor directly.
speech = model.generate_speech(input_ids, speaker_embedding, vocoder=vocoder)

audio = speech.detach().cpu().numpy()
# SpeechT5 generates 16 kHz audio.
sampling_rate = 16000
sf.write("speech.wav", audio, samplerate=sampling_rate)