Integrating Baidu Real-Time Speech Recognition with React


My company recently needed to build an AI chat site with voice input, so I integrated Baidu's real-time speech recognition. Here is a record of the implementation.

Straight to the code.

Since the project uses React, hooks were the natural choice. I wrote a custom useBaiduAsr hook that exposes text along with start, send, and stop, where text is the recognized text and the rest are methods you can call as your scenario requires.

import { useCallback, useEffect, useRef, useState } from 'react'

export const useBaiduAsr = (apiKey, appId) => {
  const [text, setText] = useState('')
  const [resultObjs, setResultObjs] = useState([])
  const wsRef = useRef(null)

  // Start speech recognition
  const startStreamingRecognition = useCallback(async () => {
    try {
      const uuid = crypto.randomUUID()
      const wsUrl = `wss://vop.baidu.com/realtime_asr?sn=${uuid}`
      wsRef.current = new WebSocket(wsUrl)

      wsRef.current.onmessage = (event) => {
        try {
          const data = JSON.parse(event.data)
          // MID_TEXT: partial hypothesis; FIN_TEXT: final result for an utterance
          if (data.type === "MID_TEXT" || data.type === "FIN_TEXT") {
            const { result, start_time } = data
            setResultObjs(pre => [...pre, { result, start_time }])
          }
        } catch (error) {
          console.error('Error while handling message:', error)
        }
      }

      return new Promise((resolve, reject) => {
        wsRef.current.onopen = () => {
          console.log('WebSocket connection established')
          // The START frame must be sent once after the connection opens
          const startParams = {
            type: 'START',
            data: {
              appid: appId,
              appkey: apiKey,
              dev_pid: 1537, // recognition model; 1537 is Mandarin
              format: "pcm",
              sample: 16000, // sample rate in Hz
              cuid: 'react-app' // client identifier
            }
          }
          wsRef.current.send(JSON.stringify(startParams))
          resolve(wsRef.current)
        }

        wsRef.current.onerror = (error) => {
          console.error('WebSocket error:', error)
          reject(error)
        }

        wsRef.current.onclose = () => {
          console.log('WebSocket connection closed')
        }
      })
    } catch (error) {
      console.error('Failed to create WebSocket connection:', error)
      throw error
    }
  }, [apiKey, appId])

  // Send a chunk of audio data
  const sendAudioData = (audioData) => {
    if (!wsRef.current) {
      console.error('WebSocket is not connected')
      return
    }
    if (wsRef.current.readyState === WebSocket.OPEN) {
      wsRef.current.send(audioData)
    }
    // Otherwise the socket is not ready yet; drop this chunk silently
  }

  // Stop speech recognition
  const stopStreamingRecognition = useCallback(() => {
    if (!wsRef.current) {
      return
    }
    try {
      // The FINISH frame tells the server that no more audio is coming
      const finishParams = {
        type: 'FINISH'
      }
      wsRef.current.send(JSON.stringify(finishParams))
      wsRef.current.close()
      setText('')
      setResultObjs([])
    } catch (error) {
      console.error('Error while closing WebSocket:', error)
    }
  }, [])

  useEffect(() => {
    // Group consecutive results that share the same start_time: each group is
    // one utterance, and its last entry is the most complete hypothesis
    const result = []
    let tempGroup = []
    resultObjs.forEach((item, index) => {
      if (index === 0 || item.start_time === resultObjs[index - 1].start_time) {
        tempGroup.push(item.result)
      } else {
        result.push(tempGroup)
        tempGroup = [item.result]
      }
    })
    if (tempGroup.length) {
      result.push(tempGroup)
    }
    if (result.length) {
      const text = result.map(item => item[item.length - 1]).join('')
      setText(text)
    }
  }, [resultObjs])

  return {
    text,
    start: startStreamingRecognition,
    send: sendAudioData,
    stop: stopStreamingRecognition,
  }
} 

A quick walkthrough of the code: Baidu's real-time recognition runs over a WebSocket. After the connection opens you have to manually send a START frame once, and the server then returns a result object whenever it recognizes something. The content can repeat, because Baidu's strategy is to accumulate text as it refines each utterance. So how do we consume the generated text?
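
To make that concrete, here is roughly what the incoming messages look like for a single utterance. This is only a sketch based on the fields the hook reads (type, result, start_time); the values are illustrative and Baidu's other fields are omitted:

// Illustrative MID_TEXT / FIN_TEXT messages for one utterance
{ "type": "MID_TEXT", "result": "你", "start_time": 0 }
{ "type": "MID_TEXT", "result": "你好", "start_time": 0 }
{ "type": "MID_TEXT", "result": "你好啊", "start_time": 0 }
{ "type": "FIN_TEXT", "result": "你好啊!", "start_time": 0 }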

My approach is to collect the returned objects in a resultObjs array. When the audio is recognized as one continuous utterance, the results share the same start_time, so results with the same start_time can be grouped into a two-dimensional array: each inner array represents one utterance, and its elements are the successive hypotheses produced during recognition. We simply take the last one of each. A nice bonus is that the final result effectively corrects the intermediate real-time ones.

[
  ['你', '你好', '你好啊', '你好啊!' ],
  ['我', '我可', '我可以', '我可以和你', '我可以和你一起吗?']
]
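
Taking the last hypothesis of each group and joining them gives the transcript; a quick sketch with the sample above:

const groups = [
  ['你', '你好', '你好啊', '你好啊!'],
  ['我', '我可', '我可以', '我可以和你', '我可以和你一起吗?']
]

// The most complete hypothesis of each utterance, joined together
const text = groups.map(group => group[group.length - 1]).join('')
console.log(text) // 你好啊!我可以和你一起吗?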

Now let's look at how a component consumes this hook.

import { Button } from 'antd'
import React, { useEffect, useRef, useState } from 'react'

import { useBaiduAsr } from '@/hooks/useBaiduAsr'

import IconFont from '../IconFont'
import css from './index.module.scss'

const VoiceRecognition = ({ onStop, onTextChange }) => {
  const [isRecording, setIsRecording] = useState(false)
  const mediaStreamRef = useRef(null)
  const audioProcessorRef = useRef(null)
  // Replace YOUR_API_KEY / YOUR_APP_ID with your Baidu credentials
  const { text, start, send, stop } = useBaiduAsr(YOUR_API_KEY, YOUR_APP_ID)

  const startRecording = async () => {
    try {
      const stream = await navigator.mediaDevices.getUserMedia({ audio: true })
      // 16 kHz to match the sample rate declared in the START frame
      const audioContext = new AudioContext({ sampleRate: 16000 })
      const source = audioContext.createMediaStreamSource(stream)
      // ScriptProcessorNode is deprecated but still widely supported;
      // AudioWorklet is the modern replacement
      const processor = audioContext.createScriptProcessor(4096, 1, 1)

      source.connect(processor)
      processor.connect(audioContext.destination)
      // Initialize Baidu streaming recognition
      await start()

      processor.onaudioprocess = (e) => {
        try {
          // Each callback delivers a buffer of Float32 samples;
          // convert them to 16-bit PCM before streaming
          const audioData = e.inputBuffer.getChannelData(0)
          const pcmData = float32ToPCM(audioData)
          send(pcmData)
        } catch (e) {
          console.log('send error', e)
          stopRecording()
        }
      }

      mediaStreamRef.current = stream
      audioProcessorRef.current = processor
      setIsRecording(true)
    } catch (err) {
      console.log(err)
    }
  }

  const stopRecording = async () => {
    onStop?.()
    stop()
    if (audioProcessorRef.current) {
      audioProcessorRef.current.disconnect()
      audioProcessorRef.current = null
    }

    if (mediaStreamRef.current) {
      mediaStreamRef.current.getTracks().forEach((track) => track.stop())
      mediaStreamRef.current = null
    }

    setIsRecording(false)
  }

  // Convert Float32 samples in [-1, 1] to 16-bit signed PCM,
  // clamping to the Int16 range to avoid overflow
  const float32ToPCM = (float32Array) => {
    const pcm16Array = new Int16Array(float32Array.length)
    for (let i = 0; i < float32Array.length; i++) {
      pcm16Array[i] = Math.max(-32768, Math.min(32767, float32Array[i] * 32768))
    }
    return pcm16Array.buffer
  }

  // Stop recording and release the microphone on unmount
  useEffect(() => {
    return () => {
      stopRecording()
    }
  }, [])

  useEffect(() => {
    onTextChange(text)
  }, [text, onTextChange])

  return (
    <>
      {isRecording ? (
        <Button type="text" size="middle" className={css['voice-btn']} onClick={stopRecording}>
          <IconFont className={css['search-voice-listen']} type="icon-mic-off" />
        </Button>
      ) : (
        <Button type="text" size="middle" className={css['voice-btn']} onClick={startRecording}>
          <IconFont className={css['search-voice']} type="icon-mic-on" />
        </Button>
      )}
    </>
  )
}

export default VoiceRecognition

The key points here are the audio-chunking logic, the API calls, and the audio format conversion; I won't belabor them, the code speaks for itself. When consuming this component, note that the onStop callback you pass in must commit text into the input's value.

import { useState } from 'react'
import { Input } from 'antd'

// Adjust the import path to wherever VoiceRecognition lives in your project
import VoiceRecognition from '@/components/VoiceRecognition'

export default function Chat() {
  const [prompt, setPrompt] = useState('')
  const [voiceText, setVoiceText] = useState('')

  // When recording stops, commit the recognized text into the prompt
  const handleVoiceStop = () => {
    setPrompt(pre => pre + voiceText)
    setVoiceText('')
  }

  const handleText = (text) => {
    setVoiceText(text)
  }

  return (
    <Input
      value={prompt + voiceText}
      suffix={<VoiceRecognition onStop={handleVoiceStop} onTextChange={handleText} />}
    />
  )
}

That's all for now. There are still requirements for WebSocket reconnection and resuming interrupted speech, which I'll cover in a future update.