需求:实时语音识别
实现步骤:
- 前置条件
- 官方文档
主要逻辑代码如下:
// --- Shared module state for the recording pipeline ---
// Web Audio context used for microphone capture (AudioContext | null).
const audioContext = ref(null)
// ScriptProcessorNode that pushes PCM chunks to the server (null when idle).
const audioProcessor = ref(null)
// 16-bit PCM samples accumulated between audio callbacks, not yet sent.
let pcmBuffer = new Int16Array()
// Human-readable status message updated on connect/start/stop.
const recordingStatus = ref(null)
// Recognized sentences; the last entry is the sentence still in progress.
const recordings = ref([])
// WebSocket to the Alibaba Cloud transcription service (null when closed).
let websocket = null
// Transcription task id, generated once and reused across reconnects.
const taskId = ref('')
// Generate a 32-character lowercase hex identifier (used for the Ali Cloud
// message_id / task_id fields).
// Uses crypto.getRandomValues when available — Math.random() alone is a weak
// uniqueness source for protocol IDs — and falls back to Math.random()
// in environments without Web Crypto.
// @returns {string} 32 hex characters
const generateUniqueID = () => {
  const bytes = new Uint8Array(16);
  if (typeof crypto !== 'undefined' && typeof crypto.getRandomValues === 'function') {
    crypto.getRandomValues(bytes);
  } else {
    for (let i = 0; i < bytes.length; i++) {
      bytes[i] = Math.floor(Math.random() * 256);
    }
  }
  return Array.from(bytes, (b) => b.toString(16).padStart(2, '0')).join('');
}
// Replace the last (in-progress) sentence with the latest intermediate
// recognition result; creates entry 0 when the list is still empty.
// @param {string} text - intermediate transcription text from the server
const onTextChanged = (text) => {
  const sentences = recordings.value || []
  // Overwrite the last element, or element 0 when empty — this collapses the
  // original duplicated if/else branches into one indexed write.
  sentences[Math.max(sentences.length - 1, 0)] = text
  recordings.value = sentences
}
// Commit the final recognition result for the last sentence; creates entry 0
// when the list is still empty.
// @param {string} text - final transcription text from the server
const onTextFinished = (text) => {
  const sentences = recordings.value || []
  // Overwrite the last element, or element 0 when empty — this collapses the
  // original duplicated if/else branches into one indexed write.
  sentences[Math.max(sentences.length - 1, 0)] = text
  recordings.value = sentences
}
// Resample an AudioBuffer to `targetSampleRate` (e.g. 16000 Hz) via an
// OfflineAudioContext render pass.
// @param {AudioBuffer} audioBuffer - source audio
// @param {number} targetSampleRate - desired sample rate in Hz
// @returns {Promise<AudioBuffer>} the resampled buffer
const resampleAudioBuffer = async (audioBuffer, targetSampleRate) => {
  const numberOfChannels = audioBuffer.numberOfChannels;
  // The context length must be a whole number of frames; duration * rate can
  // be fractional, and implicit truncation would drop the tail of the audio —
  // round up instead.
  const frameCount = Math.ceil(audioBuffer.duration * targetSampleRate);
  const offlineContext = new OfflineAudioContext(
    numberOfChannels,
    frameCount,
    targetSampleRate
  );
  const bufferSource = offlineContext.createBufferSource();
  bufferSource.buffer = audioBuffer;
  bufferSource.connect(offlineContext.destination);
  bufferSource.start();
  return offlineContext.startRendering();
}
// Start capturing microphone audio and stream it to the transcription
// WebSocket as 16 kHz, 16-bit mono PCM in fixed-size chunks.
const startCaptureAudio = async () => {
  recordingStatus.value = '可以开始录音了!'
  audioContext.value = new AudioContext();
  const sampleRate = audioContext.value.sampleRate;
  const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
  const source = audioContext.value.createMediaStreamSource(stream);
  // NOTE(review): createScriptProcessor is deprecated; consider migrating to
  // AudioWorkletNode.
  audioProcessor.value = audioContext.value.createScriptProcessor(1024, 1, 1);
  const BYTES_PER_CHUNK = 3200;                   // chunk size sent to the server
  const SAMPLES_PER_CHUNK = BYTES_PER_CHUNK / 2;  // Int16 = 2 bytes per sample
  audioProcessor.value.onaudioprocess = async (event) => {
    const inputBuffer = event.inputBuffer;
    // Resample to 16 kHz if the hardware runs at a different rate.
    let resampledBuffer = inputBuffer;
    if (sampleRate !== 16000) {
      resampledBuffer = await resampleAudioBuffer(inputBuffer, 16000);
    }
    const pcmData = convertToPCM(resampledBuffer);
    pcmBuffer = concatenateTypedArrays(pcmBuffer, new Int16Array(pcmData));
    // Flush every complete chunk; the remainder stays buffered for the next
    // callback.
    while (pcmBuffer.length >= SAMPLES_PER_CHUNK) {
      const chunk = pcmBuffer.slice(0, SAMPLES_PER_CHUNK);
      // Guard: this async callback can still fire after stopRecording() has
      // nulled/closed the socket — the original `websocket.send(...)` would
      // then throw a TypeError on null.
      if (websocket && websocket.readyState === WebSocket.OPEN) {
        websocket.send(chunk.buffer);
      }
      pcmBuffer = pcmBuffer.slice(SAMPLES_PER_CHUNK);
    }
  };
  source.connect(audioProcessor.value);
  // A ScriptProcessorNode must be connected to a destination for its
  // onaudioprocess events to fire.
  audioProcessor.value.connect(audioContext.value.destination);
}
// Open a WebSocket to the Alibaba Cloud real-time transcription service,
// send the StartTranscription instruction on open, and dispatch server
// events. Audio capture only starts once the server answers with
// TranscriptionStarted.
const prepareRecording = async () => {
  recordingStatus.value = '正在连接阿里云...'
  const APPKEY = import.meta.env.VITE_ALI_APP_KEY
  const TOKEN = import.meta.env.VITE_ALI_TOKEN
  const URL = `${import.meta.env.VITE_ALI_PATH}?token=${TOKEN}`
  // Reuse the task id across reconnects so the server sees one logical task.
  taskId.value = taskId.value === '' ? generateUniqueID() : taskId.value
  websocket = new WebSocket(URL);
  websocket.onopen = () => {
    const instruction = {
      "header": {
        "appkey": APPKEY,
        "message_id": generateUniqueID(),
        "task_id": taskId.value,
        "namespace": "SpeechTranscriber",
        "name": "StartTranscription"
      },
      "payload": {
        "format": "PCM",
        "sample_rate": 16000,
        "enable_intermediate_result": true,
        "enable_punctuation_prediction": true,
        "enable_inverse_text_normalization": true,
        "max_sentence_silence": 1000,
        "disfluency": true,
        // Simple noise suppression for noisy environments.
        "speech_noise_threshold": 0.8,
        "customization_id": "029df699e89344a19baf8d2d4bd6d903",
        "vocabulary_id": "6ca11494525a43fe9b337e8db8ed522f"
      }
    }
    if (websocket && websocket.readyState === WebSocket.OPEN) {
      websocket.send(JSON.stringify(instruction));
    }
  };
  websocket.onmessage = (e) => {
    const ret = JSON.parse(e.data);
    if (ret.header.name === 'TranscriptionResultChanged') {
      // Intermediate result for the sentence in progress.
      onTextChanged(ret.payload.result);
    } else if (ret.header.name === 'SentenceBegin') {
      // Open a new empty entry that intermediate results will fill in.
      const sentences = recordings.value || [];
      sentences.push("")
      recordings.value = sentences;
    } else if (ret.header.name === 'TranscriptionStarted') {
      startCaptureAudio();
    } else if (ret.header.name === 'SentenceEnd') {
      // 识别结果 — final result for this sentence.
      onTextFinished(ret.payload.result);
      stopRecording()
    } else if (ret.header.name === 'TranscriptionCompleted') {
      console.log('服务端已停止了语音转写', ret);
    } else {
      // Unknown / error event (e.g. TaskFailed): close the current socket
      // before reconnecting — the original reconnected without closing it,
      // leaking a live connection on every unexpected message.
      console.log("response: ", e)
      if (websocket) {
        websocket.close();
        websocket = null;
      }
      prepareRecording()
    }
  };
}
// Convert the first channel of an AudioBuffer (Float32 samples in [-1, 1])
// into 16-bit signed PCM.
// @param {AudioBuffer} audioBuffer - source buffer; only channel 0 is used
// @returns {ArrayBuffer} raw little-endian Int16 PCM bytes
const convertToPCM = (audioBuffer) => {
  const samples = audioBuffer.getChannelData(0);
  // Clamp each sample to [-1, 1], then scale: negatives map onto the full
  // -32768 range, positives onto +32767 (asymmetric Int16 range).
  const pcm = Int16Array.from(samples, (sample) => {
    const clamped = Math.min(1, Math.max(-1, sample));
    return clamped < 0 ? clamped * 0x8000 : clamped * 0x7FFF;
  });
  return pcm.buffer;
}
// Append typed array `b` after `a`, returning a fresh array of the same
// concrete type as `a`. Neither input is mutated.
// @param {TypedArray} a - first segment (its constructor decides the result type)
// @param {TypedArray} b - second segment
// @returns {TypedArray} new array of length a.length + b.length
const concatenateTypedArrays = (a, b) => {
  const joined = new a.constructor(a.length + b.length);
  joined.set(a, 0);
  joined.set(b, a.length);
  return joined;
}
// Tear down the capture pipeline and the transcription connection.
const stopRecording = () => {
  recordingStatus.value = '已停止录音'
  console.log('停止录音')
  if (audioProcessor.value) {
    audioProcessor.value.disconnect();
    audioProcessor.value = null;
  }
  // Close the AudioContext created in startCaptureAudio so the browser
  // releases the audio graph — the original leaked one context per capture.
  // NOTE(review): the getUserMedia stream is not stored anywhere, so its
  // tracks are never stop()-ed and the mic indicator may stay on; consider
  // keeping a reference to the stream and stopping its tracks here.
  if (audioContext.value) {
    void audioContext.value.close(); // fire-and-forget; nothing to do on completion
    audioContext.value = null;
  }
  if (websocket) {
    console.log('关闭长链接')
    websocket.close();
    websocket = null;
  }
}
// Public API consumed by the recording UI component.
export { prepareRecording, stopRecording, taskId }
实现过程中如有问题,欢迎联系作者交流,此处不再逐行解释。
需要注意的是:周边环境嘈杂时识别会不准,可通过 speech_noise_threshold 参数做简单降噪。