tts

482 阅读1分钟
# Create the host directory that is bind-mounted into the container as /workspace.
# The absolute path matches the -v argument below; -p makes the command re-runnable.
mkdir -p /home/root/tensorflowtts
# Start the container named "tensorflowtts", publishing ports 7000 and 7001.
nvidia-docker run --name tensorflowtts -p 7000:7000 -p 7001:7001 -v /home/root/tensorflowtts:/workspace -it langjingxiang/tensorflowtts:latest
# Open an additional shell in the running container, addressed by the name set with --name.
docker exec -it tensorflowtts bash
import tensorflow as tf

import yaml
import numpy as np
import matplotlib.pyplot as plt

import IPython.display as ipd
import librosa
import soundfile

from tensorflow_tts.inference import AutoConfig
from tensorflow_tts.inference import TFAutoModel
from tensorflow_tts.inference import AutoProcessor

import os
# CUDA_VISIBLE_DEVICES expects a comma-separated list of GPU indices (or GPU
# UUIDs), not a TensorFlow device string such as '/device:GPU:0'. An invalid
# entry leaves no usable GPU visible, so select the first GPU by its index.
# NOTE(review): this runs after `import tensorflow`; it presumably still takes
# effect because it precedes the first GPU use - confirm, or move it above the
# TensorFlow import.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

# Load the three model configurations from the YAML files of the examples tree.
tacotron2_config = AutoConfig.from_pretrained(
    "./examples/tacotron2/conf/tacotron2.baker.v1.yaml"
)
fastspeech2_config = AutoConfig.from_pretrained(
    "./examples/fastspeech2/conf/fastspeech2.baker.v2.yaml"
)
mb_melgan_config = AutoConfig.from_pretrained(
    "./examples/multiband_melgan/conf/multiband_melgan.baker.v1.yaml"
)

# Build each network from its config and restore the weights from the
# corresponding .h5 checkpoint (paths are relative to the working directory).
tacotron2 = TFAutoModel.from_pretrained(
    name="tacotron2",
    pretrained_path="tacotron2-100k.h5",
    config=tacotron2_config,
)
fastspeech2 = TFAutoModel.from_pretrained(
    name="fastspeech2",
    pretrained_path="fastspeech2-200k.h5",
    config=fastspeech2_config,
)
mb_melgan = TFAutoModel.from_pretrained(
    name="mb_melgan",
    pretrained_path="mb.melgan-920k.h5",
    config=mb_melgan_config,
)

# Text front-end used by do_synthesis() to turn input text into id sequences.
processor = AutoProcessor.from_pretrained(pretrained_path="./baker_mapper.json")


def do_synthesis(input_text, text2mel_name):
    """Synthesize a waveform for ``input_text`` with the chosen text2mel model.

    The text is converted to an id sequence by the module-level ``processor``,
    turned into a mel spectrogram by either ``tacotron2`` or ``fastspeech2``,
    and finally vocoded by ``mb_melgan``.

    Args:
        input_text: Text to synthesize; passed to
            ``processor.text_to_sequence(..., inference=True)``.
        text2mel_name: Which text2mel model to use: ``"tacotron2"`` or
            ``"fastspeech2"``. Matching ignores case and surrounding
            whitespace, so ``"FASTSPEECH2"`` is accepted as well.

    Returns:
        The result of ``.numpy()`` on the vocoder output for the single batch
        item, with the trailing ``remove_end`` samples cut off.

    Raises:
        ValueError: If ``text2mel_name`` does not name a supported model.
    """
    # Validate first so that an unsupported name fails before any model work.
    model_key = (
        text2mel_name.strip().lower() if isinstance(text2mel_name, str) else None
    )
    if model_key not in ("tacotron2", "fastspeech2"):
        raise ValueError(
            "Only 'tacotron2' and 'fastspeech2' are supported on text2mel_name, "
            "got {!r}".format(text2mel_name)
        )

    input_ids = processor.text_to_sequence(input_text, inference=True)
    # Both models take the ids with a leading batch axis of size 1.
    batched_ids = tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32), 0)

    if model_key == "tacotron2":
        _, mel_outputs, _, _ = tacotron2.inference(
            batched_ids,
            tf.convert_to_tensor([len(input_ids)], tf.int32),
            tf.convert_to_tensor([0], dtype=tf.int32),
        )
        # NOTE(review): presumably trims trailing padding/noise produced by
        # Tacotron2; the value is carried over unchanged.
        remove_end = 1024
    else:
        _, mel_outputs, _, _, _ = fastspeech2.inference(
            batched_ids,
            speaker_ids=tf.convert_to_tensor([0], dtype=tf.int32),
            speed_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
            f0_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
            energy_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
        )
        remove_end = 1

    # Take batch item 0 and channel 0, dropping the last `remove_end` samples.
    audio = mb_melgan.inference(mel_outputs)[0, :-remove_end, 0]
    return audio.numpy()
from flask import Flask,request,jsonify
import json
import uuid
# Flask application object; the @app.route handlers below are registered on it.
app = Flask(__name__)

@app.route('/')
def hello_world():
    """Answer requests to the root URL with a fixed greeting string."""
    greeting = 'Hello, World!'
    return greeting

@app.route('/api',methods=['POST'])
def text2wav():
    """Synthesize speech for a JSON request and return the URL of the WAV file.

    The request body must be a JSON object with the keys ``text`` (the text to
    synthesize) and ``mode_name`` (the text2mel model name handed to
    ``do_synthesis``). The audio is written to ``./static/<uuid>.wav`` at a
    sample rate of 24000.

    Returns:
        On success, the URL of the generated file as a plain string.
        On a malformed body, missing key, or a ``ValueError`` from
        ``do_synthesis`` (e.g. unsupported model name), a JSON error object
        with HTTP status 400.
    """
    try:
        data = json.loads(request.get_data(as_text=True))
        text = data['text']
        mode_name = data['mode_name']
    except (ValueError, KeyError, TypeError):
        # ValueError covers invalid JSON; KeyError/TypeError cover a body that
        # is not an object containing both required keys.
        return jsonify(
            error="request body must be a JSON object with 'text' and 'mode_name'"
        ), 400

    try:
        audios = do_synthesis(text, mode_name)
    except ValueError as exc:
        # do_synthesis raises ValueError for an unsupported model name; report
        # it to the client instead of failing with a server error.
        return jsonify(error=str(exc)), 400

    # Renamed from `id` so the builtin is not shadowed.
    file_id = uuid.uuid4()
    sr = 24000
    # Make sure the output directory exists before writing into it.
    os.makedirs("./static", exist_ok=True)
    soundfile.write("./static/{0}.wav".format(file_id), audios, sr)
    # NOTE(review): host and port are hard-coded; clients must be able to
    # reach this address for the returned link to work.
    return "http://172.31.18.90:7001/static/{0}.wav".format(file_id)
if __name__ == "__main__":
    # Listen on every network interface, port 7001, when run as a script.
    app.run(host='0.0.0.0', port=7001)
{
  "text": "你好",
  "mode_name": "fastspeech2"
}