前端AI智能-阿里云智能语音-语音识别

77 阅读2分钟

需求: 实时语音识别

实现步骤

-前置条件

-官方文档

主要逻辑代码


  // Shared module-level state for one real-time transcription session.
  // NOTE(review): `ref` is not imported in this snippet — presumably Vue 3's
  // Composition API `ref`; confirm the import exists in the full file.
  const audioContext = ref(null)     // Web Audio AudioContext, created in startCaptureAudio
  const audioProcessor = ref(null)   // ScriptProcessorNode that feeds PCM chunks to the socket
  
  let pcmBuffer = new Int16Array()   // rolling buffer of 16-bit PCM samples awaiting send
  const recordingStatus = ref(null)  // human-readable status string for the UI
  const recordings = ref([])         // per-sentence transcription results (last entry is in progress)
  let websocket = null               // WebSocket to the Aliyun transcription endpoint
  const taskId = ref('')             // 32-hex-char task id, reused across reconnects

  // Build a 32-character lowercase hex identifier (message/task ids for the
  // Aliyun protocol). Backed by Math.random(), so not cryptographically secure.
const generateUniqueID = () => {
    let id = '';
    while (id.length < 32) {
      id += Math.floor(Math.random() * 16).toString(16);
    }
    return id;
  }
  // Overwrite the in-progress (last) sentence with an intermediate
  // recognition result; on an empty list this writes slot 0.
  const onTextChanged = (text) => {
    const sentences = recordings.value || [];
    const slot = Math.max(sentences.length - 1, 0);
    sentences[slot] = text;
    recordings.value = sentences;
  }
  
  // Store the final recognition result into the last sentence slot once the
  // server marks the sentence finished; on an empty list this writes slot 0.
  const onTextFinished = (text) => {
    const sentences = recordings.value || [];
    const slot = Math.max(sentences.length - 1, 0);
    sentences[slot] = text;
    recordings.value = sentences;
  }
  
  // Resample an AudioBuffer to targetSampleRate (Hz) by rendering it through
  // an OfflineAudioContext.
  // @param {AudioBuffer} audioBuffer - source audio
  // @param {number} targetSampleRate - desired rate, e.g. 16000
  // @returns {Promise<AudioBuffer>} resampled audio
  const resampleAudioBuffer = async (audioBuffer, targetSampleRate) => {
    const numberOfChannels = audioBuffer.numberOfChannels;
    // FIX: the context length must be a whole number of frames. The original
    // passed duration * rate directly, and the unsigned-long conversion
    // truncates any fractional part, silently dropping the final sample.
    const offlineContext = new OfflineAudioContext(
      numberOfChannels,
      Math.ceil(audioBuffer.duration * targetSampleRate),
      targetSampleRate
    );
  
    const bufferSource = offlineContext.createBufferSource();
    bufferSource.buffer = audioBuffer;
    bufferSource.connect(offlineContext.destination);
    bufferSource.start();
  
    // Offline rendering runs faster than real time and resolves with the
    // fully resampled buffer.
    return offlineContext.startRendering();
  }
  // Start microphone capture: resample to 16 kHz when needed, convert to
  // 16-bit PCM, and stream to the WebSocket in 3200-byte (100 ms) chunks.
  const startCaptureAudio = async () => {
    recordingStatus.value = '可以开始录音了!'
    audioContext.value = new AudioContext();
    const sampleRate = audioContext.value.sampleRate;
  
    // FIX: getUserMedia rejects when permission is denied or no device
    // exists; the original let that surface as an unhandled rejection.
    let stream;
    try {
      stream = await navigator.mediaDevices.getUserMedia({ audio: true });
    } catch (err) {
      recordingStatus.value = '无法访问麦克风';
      console.error('getUserMedia failed: ', err);
      return;
    }
    const source = audioContext.value.createMediaStreamSource(stream);
  
    // NOTE(review): ScriptProcessorNode is deprecated; consider migrating to
    // AudioWorklet when convenient.
    audioProcessor.value = audioContext.value.createScriptProcessor(1024, 1, 1);
    audioProcessor.value.onaudioprocess = async (event) => {
      const inputBuffer = event.inputBuffer;
  
      // Resample only when the device rate differs from the 16 kHz the
      // transcription service expects.
      let resampledBuffer = inputBuffer;
      if (sampleRate !== 16000) {
        resampledBuffer = await resampleAudioBuffer(inputBuffer, 16000);
      }
  
      const pcmData = convertToPCM(resampledBuffer);
      pcmBuffer = concatenateTypedArrays(pcmBuffer, new Int16Array(pcmData));
  
      // Flush complete 3200-byte chunks (1600 Int16 samples).
      while (pcmBuffer.length * 2 >= 3200) {
        const chunk = pcmBuffer.slice(0, 3200 / 2);
        // FIX: stopRecording() may null/close the socket while a callback is
        // still in flight; the original called websocket.send unguarded.
        if (websocket && websocket.readyState === WebSocket.OPEN) {
          websocket.send(chunk.buffer);
        }
        pcmBuffer = pcmBuffer.slice(3200 / 2);
      }
    };
  
    source.connect(audioProcessor.value);
    audioProcessor.value.connect(audioContext.value.destination);
  }
  
  // Connect to the Aliyun real-time transcription WebSocket endpoint and send
  // a StartTranscription instruction. Server events drive the rest of the
  // flow: capture starts on 'TranscriptionStarted', intermediate text arrives
  // as 'TranscriptionResultChanged', final text as 'SentenceEnd'.
  const prepareRecording = async () => {
    recordingStatus.value = '正在连接阿里云...'
    const APPKEY = import.meta.env.VITE_ALI_APP_KEY
    const TOKEN = import.meta.env.VITE_ALI_TOKEN
    const URL = `${import.meta.env.VITE_ALI_PATH}?token=${TOKEN}`
    // Reuse the same task id across reconnects so the server sees one task.
    taskId.value = taskId.value == '' ? generateUniqueID() : taskId.value
    websocket = new WebSocket(URL);
  
    websocket.onopen = () => {
      const instruction = {
        "header": {
          "appkey": APPKEY,
          "message_id": generateUniqueID(),
          "task_id": taskId.value,
          "namespace": "SpeechTranscriber",
          "name": "StartTranscription"
        },
        "payload": {
          "format": "PCM",
          "sample_rate": 16000,
          "enable_intermediate_result": true,
          "enable_punctuation_prediction": true,
          "enable_inverse_text_normalization": true,
          "max_sentence_silence": 1000,
          "disfluency": true,
          "speech_noise_threshold": 0.8,
          "customization_id": "029df699e89344a19baf8d2d4bd6d903",
          "vocabulary_id": "6ca11494525a43fe9b337e8db8ed522f"
        }
      }
      if (websocket.readyState === WebSocket.OPEN) {
        websocket.send(JSON.stringify(instruction));
      }
    };
  
    // FIX: surface transport errors instead of failing silently.
    websocket.onerror = (e) => {
      console.error('WebSocket error: ', e);
    };
  
    websocket.onmessage = (e) => {
      const ret = JSON.parse(e.data);
      if (ret.header.name === 'TranscriptionResultChanged') {
        // Intermediate result for the sentence currently in progress.
        onTextChanged(ret.payload.result);
      } else if (ret.header.name === 'SentenceBegin') {
        // Open a fresh slot for the new sentence.
        const beginText = recordings.value || [];
        beginText.push("")
        recordings.value = beginText;
      } else if (ret.header.name === 'TranscriptionStarted') {
        startCaptureAudio();
      } else if (ret.header.name === 'SentenceEnd') {
        // Final recognition result for the sentence.
        onTextFinished(ret.payload.result);
        stopRecording()
      } else if (ret.header.name === 'TranscriptionCompleted') {
        console.log('服务端已停止了语音转写', ret);
      } else {
        console.log("response: ", e)
        // FIX: unknown/failure responses (e.g. TaskFailed) retried without
        // closing the current socket, leaking one open connection per
        // unexpected message. Close before reconnecting.
        websocket.close();
        prepareRecording()
      }
    };
  }
  
  // Convert the first channel of an AudioBuffer (Float32 samples in [-1, 1])
  // into 16-bit signed PCM, returned as an ArrayBuffer.
  const convertToPCM = (audioBuffer) => {
    const samples = audioBuffer.getChannelData(0);
    const pcm = new Int16Array(samples.length);
  
    samples.forEach((sample, i) => {
      // Clamp, then scale: negatives map onto [-32768, 0), positives onto [0, 32767].
      const clamped = Math.min(1, Math.max(-1, sample));
      pcm[i] = clamped < 0 ? clamped * 0x8000 : clamped * 0x7FFF;
    });
  
    return pcm.buffer;
  }
  
  // Join two typed arrays that share a constructor into one new array of the
  // same type: `first` occupies the front, `second` the back.
  const concatenateTypedArrays = (first, second) => {
    const joined = new first.constructor(first.length + second.length);
    joined.set(first, 0);
    joined.set(second, first.length);
    return joined;
  }
  
  // Tear down the audio capture pipeline and the transcription WebSocket.
  const stopRecording = () => {
    recordingStatus.value = '已停止录音'
    console.log('停止录音')
    if (audioProcessor.value) {
      audioProcessor.value.disconnect();
      audioProcessor.value = null;
    }
    // FIX: the original never closed the AudioContext, leaking the capture
    // graph — browsers cap the number of live contexts per page.
    if (audioContext.value) {
      audioContext.value.close().catch(() => {});
      audioContext.value = null;
    }
    if (websocket) {
      console.log('关闭长链接')
      websocket.close();
      websocket = null;
    }
  }

  // Public API: start/stop the transcription session; taskId is exposed so
  // callers can persist or reset the session id.
  export { prepareRecording, stopRecording, taskId }

如在实现过程中遇到问题,欢迎私聊交流。
需要注意的是:周边环境嘈杂时,识别准确率会下降,可通过以下方式改善:

1. 通过 speech_noise_threshold 参数进行简单降噪。

2. 在管控台创建热词,以及定制训练语言模型。