# Create the host directory that is mounted into the container below.
# NOTE(review): the mount source is /home/root/tensorflowtts, so this
# presumably has to be run from /home/root — confirm.
mkdir tensorflowtts
# Start an interactive container named "tensorflowtts" through nvidia-docker,
# publishing ports 7000 and 7001 and mounting the host directory at /workspace.
nvidia-docker run --name tensorflowtts -p 7000:7000 -p 7001:7001 -v /home/root/tensorflowtts:/workspace -it langjingxiang/tensorflowtts:latest
# Open an additional shell inside the running container. "containerID" is a
# placeholder: substitute the real container ID, or the name assigned with
# --name above (tensorflowtts).
docker exec -it containerID bash
import tensorflow as tf
import yaml
import numpy as np
import matplotlib.pyplot as plt
import IPython.display as ipd
import librosa
import soundfile
from tensorflow_tts.inference import AutoConfig
from tensorflow_tts.inference import TFAutoModel
from tensorflow_tts.inference import AutoProcessor
import os
# CUDA_VISIBLE_DEVICES takes a comma-separated list of GPU indices (or GPU
# UUIDs), not a TensorFlow device string such as '/device:GPU:0'. An entry
# CUDA cannot parse hides that device and every one after it, which leaves
# the process with no GPU at all. Expose the first GPU by index instead.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
def _load_pretrained(config_path, weights_path, model_name):
    """Load a config file and build the matching model from saved weights.

    Args:
        config_path: Path handed to ``AutoConfig.from_pretrained``.
        weights_path: Path handed to ``TFAutoModel.from_pretrained`` as
            ``pretrained_path``.
        model_name: Name handed to ``TFAutoModel.from_pretrained``.

    Returns:
        The ``(config, model)`` pair.
    """
    config = AutoConfig.from_pretrained(config_path)
    model = TFAutoModel.from_pretrained(
        config=config,
        pretrained_path=weights_path,
        name=model_name,
    )
    return config, model


# The two text2mel models that do_synthesis can choose between.
tacotron2_config, tacotron2 = _load_pretrained(
    './examples/tacotron2/conf/tacotron2.baker.v1.yaml',
    "tacotron2-100k.h5",
    "tacotron2",
)
fastspeech2_config, fastspeech2 = _load_pretrained(
    './examples/fastspeech2/conf/fastspeech2.baker.v2.yaml',
    "fastspeech2-200k.h5",
    "fastspeech2",
)

# The model do_synthesis uses to turn mel outputs into audio.
mb_melgan_config, mb_melgan = _load_pretrained(
    './examples/multiband_melgan/conf/multiband_melgan.baker.v1.yaml',
    "mb.melgan-920k.h5",
    "mb_melgan",
)

# Converts input text into the id sequence fed to the text2mel models.
processor = AutoProcessor.from_pretrained(
    pretrained_path="./baker_mapper.json",
)
def do_synthesis(input_text, text2mel_name):
    """Synthesize audio for ``input_text`` with the selected text2mel model.

    Args:
        input_text: Text to synthesize; converted to input ids by the
            module-level ``processor``.
        text2mel_name: ``"tacotron2"`` or ``"fastspeech2"``, matched
            case-insensitively.

    Returns:
        The output of ``mb_melgan.inference`` indexed as
        ``[0, :-remove_end, 0]`` and converted with ``.numpy()``.

    Raises:
        ValueError: If ``text2mel_name`` is not one of the supported names.
    """
    # Validate up front so an unsupported name fails before any text
    # processing or inference, and accept any casing (e.g. "FASTSPEECH2").
    model_name = text2mel_name.lower() if isinstance(text2mel_name, str) else None
    if model_name not in ("tacotron2", "fastspeech2"):
        raise ValueError(
            "Only 'tacotron2' and 'fastspeech2' are supported on text2mel_name, "
            "got {!r}".format(text2mel_name)
        )

    input_ids = processor.text_to_sequence(input_text, inference=True)
    # Add a leading axis of size 1 so the ids form a single-item batch.
    batched_ids = tf.expand_dims(
        tf.convert_to_tensor(input_ids, dtype=tf.int32), 0
    )
    speaker_ids = tf.convert_to_tensor([0], dtype=tf.int32)

    if model_name == "tacotron2":
        _, mel_outputs, _, _ = tacotron2.inference(
            batched_ids,
            tf.convert_to_tensor([len(input_ids)], tf.int32),
            speaker_ids,
        )
        # Number of trailing samples dropped from the generated audio.
        # NOTE(review): presumably trims end-of-utterance artefacts — confirm.
        remove_end = 1024
    else:
        _, mel_outputs, _, _, _ = fastspeech2.inference(
            batched_ids,
            speaker_ids=speaker_ids,
            speed_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
            f0_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
            energy_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
        )
        remove_end = 1

    # assumes the output is (batch, samples, channels) — TODO confirm
    audio = mb_melgan.inference(mel_outputs)[0, :-remove_end, 0]
    return audio.numpy()
from flask import Flask,request,jsonify
import json
import uuid
# Flask application object; the @app.route decorators register the HTTP
# handlers on it and app.run() serves it when the file is run as a script.
app = Flask(__name__)
@app.route('/')
def hello_world():
    """Answer requests for the root URL with a fixed greeting."""
    greeting = 'Hello, World!'
    return greeting
@app.route('/api', methods=['POST'])
def text2wav():
    """Synthesize speech for a JSON request and return the WAV file's URL.

    The request body is parsed as JSON regardless of its content type and
    must be an object containing ``text`` and ``mode_name``. ``mode_name``
    is passed to ``do_synthesis`` as the text2mel model name.

    Returns:
        On success, the URL (plain string) of the file written under
        ``./static``. If the body is not a JSON object, a required field is
        missing, or ``do_synthesis`` raises ``ValueError``, a JSON error
        message with HTTP status 400.
    """
    try:
        data = json.loads(request.get_data(as_text=True))
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass.
        return jsonify(error="request body must be valid JSON"), 400

    if not isinstance(data, dict):
        return jsonify(error="request body must be a JSON object"), 400
    missing = [key for key in ('text', 'mode_name') if key not in data]
    if missing:
        return jsonify(
            error="missing field(s): {0}".format(", ".join(missing))
        ), 400

    try:
        audios = do_synthesis(data['text'], data['mode_name'])
    except ValueError as err:
        # do_synthesis raises ValueError for an unsupported mode_name; report
        # that as a client error instead of an unhandled 500.
        return jsonify(error=str(err)), 400

    # NOTE(review): sample rate is hard-coded; assumed to match the rate the
    # loaded models generate audio at — confirm against the model configs.
    sample_rate = 24000
    file_id = uuid.uuid4()
    # Make sure the output directory exists before writing into it.
    os.makedirs("./static", exist_ok=True)
    soundfile.write("./static/{0}.wav".format(file_id), audios, sample_rate)
    # NOTE(review): the host below is hard-coded; clients that reach the
    # server through another address will get a URL they may not be able to
    # open.
    return "http://172.31.18.90:7001/static/{0}.wav".format(file_id)
if __name__ == '__main__':
    # Serve on every network interface, port 7001.
    server_options = {'host': '0.0.0.0', 'port': 7001}
    app.run(**server_options)
{
"text":"你好",
"mode_name":"fastspeech2"
}