语音输入转文字

73 阅读3分钟
  1. 获取音频流
const mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true });

MediaDevices.getUserMedia() 会提示用户给予使用媒体输入的许可,媒体输入会产生一个MediaStream,里面包含了请求的媒体类型的轨道。此流可以包含一个视频轨道(来自硬件或者虚拟视频源,比如相机、视频采集设备和屏幕共享服务等等)、一个音频轨道(同样来自硬件或虚拟音频源,比如麦克风、A/D 转换器等等),也可能是其他轨道类型。

它返回一个 Promise 对象,成功后会 resolve 回调一个 MediaStream 对象。若用户拒绝了使用权限,或者需要的媒体源不可用,promise 会 reject 回调一个 PermissionDeniedError 或者 NotFoundError。

  1. 创建音频上下文
const audioCtx = new AudioContext();

AudioContext接口表示由链接在一起的音频模块构建的音频处理图,每个模块由一个AudioNode表示。音频上下文控制它包含的节点的创建和音频处理或解码的执行。在做任何其他操作之前,你需要创建一个AudioContext对象,因为所有事情都是在上下文中发生的。建议创建一个AudioContext对象并复用它,而不是每次初始化一个新的AudioContext对象,并且可以对多个不同的音频源和管道同时使用一个AudioContext对象。

  • 创建MediaStreamAudioSourceNode实例并传入音频流
const source = audioCtx.createMediaStreamSource(mediaStream)
  • 创建AudioWorkletNode实例,处理音频流
await audioCtx.audioWorklet.addModule('./record-processor.js')
const processor = new AudioWorkletNode(audioCtx, 'record-processor')
class RecordProcessor extends AudioWorkletProcessor {
  // Set to false when the main thread asks us to stop; process() then
  // returns false so the audio engine can release this node.
  isRunning = true;

  constructor() {
    super();
    // The main thread posts { close: true } to terminate processing.
    this.port.onmessage = (event) => {
      if (event.data.close) {
        this.isRunning = false;
      }
    };
  }

  static get parameterDescriptors() {
    return []; // no AudioParams needed
  }

  /**
   * Called once per render quantum (128 frames). Interleaves the first
   * input's left/right channels into a single Float32Array and posts it
   * (together with the raw planar channels) back to the main thread.
   *
   * @param {Float32Array[][]} inputs - inputs[input][channel] sample data
   * @returns {boolean} true to keep the processor alive, false to end it
   */
  process(inputs, outputs, parameters) {
    if (!this.isRunning) {
      return false; // signal the engine that this node is finished
    }
    const input = inputs[0] || [];
    const left = input[0] || [];
    // BUGFIX: a mono source has no second channel; the original fell back
    // to an empty array, so right[i] was undefined and NaN was written
    // into the Float32Array. Duplicate the left channel instead.
    const right = input[1] || left;
    const interleaved = new Float32Array(left.length * 2);
    for (let i = 0; i < left.length; i++) {
      interleaved[2 * i] = left[i];
      interleaved[2 * i + 1] = right[i];
    }
    this.port.postMessage({
      pcmData: interleaved,
      left,
      right,
    });
    return true;
  }
}

// BUGFIX: removed the stray mojibake character ("Ï") that followed the
// semicolon in the original and broke the worklet module script.
registerProcessor("record-processor", RecordProcessor);
  • 实例连接到audio graph
source.connect(processor) // processor 即上文创建的 AudioWorkletNode 实例(.current 是 React ref 的写法,此处不适用)
processor.connect(audioCtx.destination)
  1. 与AudioWorkletNode实例通信,拿到pcm数据
processor.port.onmessage = async (e) => {
    const { pcmData } = e.data // 拿到pcm数据
}
  1. 将pcm数据转成wav
// 转换为 wav

// Write an ASCII string into the DataView one byte per character,
// starting at `offset` (used for the literal tags in the WAV header).
const writeString = (view, offset, string) => {
  string.split('').forEach((ch, i) => {
    view.setUint8(offset + i, ch.charCodeAt(0));
  });
};

/**
 * Build a complete WAV file (44-byte canonical RIFF header + PCM data)
 * in an ArrayBuffer.
 *
 * Two input modes:
 *  - `samples` is a Float32Array and `bitsPerSample === 16`: each float
 *    in [-1, 1] is converted to a signed 16-bit PCM sample.
 *  - otherwise `samples` is treated as already-encoded PCM bytes
 *    (ArrayBuffer or TypedArray) and copied verbatim.
 *
 * @param {Float32Array|ArrayBuffer|ArrayBufferView} samples - PCM input
 * @param {number} sampleRate - e.g. 44100
 * @param {number} numChannels - e.g. 2 for interleaved stereo
 * @param {number} bitsPerSample - e.g. 16
 * @returns {ArrayBuffer} the finished WAV file contents
 */
const encodeWAV = (samples, sampleRate, numChannels, bitsPerSample) => {
  // Local ASCII writer so this function is fully self-contained.
  const putAscii = (view, offset, text) => {
    for (let i = 0; i < text.length; i++) {
      view.setUint8(offset + i, text.charCodeAt(i));
    }
  };

  const blockAlign = (numChannels * bitsPerSample) / 8;
  const byteRate = sampleRate * blockAlign;

  const floatTo16 = samples instanceof Float32Array && bitsPerSample === 16;
  // BUGFIX: in the fallback path `samples` is already raw PCM bytes, so the
  // data size must be its byte length. The original multiplied by
  // bytes-per-sample again, allocating an oversized buffer and writing a
  // wrong `data` chunk length into the header.
  const rawBytes = floatTo16
    ? null
    : samples instanceof ArrayBuffer
      ? new Uint8Array(samples)
      : new Uint8Array(samples.buffer, samples.byteOffset, samples.byteLength);
  const dataSize = floatTo16 ? samples.length * 2 : rawBytes.length;

  const buffer = new ArrayBuffer(44 + dataSize);
  const view = new DataView(buffer);

  // --- canonical 44-byte WAV/RIFF header; all multi-byte fields little-endian ---
  putAscii(view, 0, 'RIFF');
  view.setUint32(4, 36 + dataSize, true); // file size minus the 8-byte RIFF preamble
  putAscii(view, 8, 'WAVE');
  putAscii(view, 12, 'fmt ');
  view.setUint32(16, 16, true); // fmt chunk length for PCM
  view.setUint16(20, 1, true); // audio format: 1 = uncompressed PCM
  view.setUint16(22, numChannels, true); // channel count
  view.setUint32(24, sampleRate, true); // sample rate
  view.setUint32(28, byteRate, true); // byte rate = sampleRate * blockAlign
  view.setUint16(32, blockAlign, true); // block align = channels * bytes per sample
  view.setUint16(34, bitsPerSample, true); // bits per sample
  putAscii(view, 36, 'data');
  view.setUint32(40, dataSize, true); // data chunk length

  if (floatTo16) {
    // Convert each float sample to a signed 16-bit integer.
    for (let i = 0; i < samples.length; i++) {
      // Clamp to [-1, 1] before scaling.
      const s = Math.max(-1, Math.min(1, samples[i]));
      // Asymmetric scaling: -1 -> -32768, +1 -> 32767.
      view.setInt16(44 + i * 2, s < 0 ? s * 0x8000 : s * 0x7fff, true);
    }
  } else {
    // Samples are already encoded PCM bytes; copy them verbatim.
    for (let i = 0; i < rawBytes.length; i++) {
      view.setUint8(44 + i, rawBytes[i]);
    }
  }

  return buffer;
};

/**
 * Wrap PCM samples in a WAV container and return them as a Blob.
 *
 * Generalized: the format fields are now parameters with the original
 * hard-coded values as defaults, so existing single-argument callers
 * are unaffected.
 *
 * @param {Float32Array|ArrayBuffer|ArrayBufferView} pcmData - PCM samples
 * @param {number} [sampleRate=44100] - samples per second
 * @param {number} [numChannels=2] - interleaved channel count
 * @param {number} [bitsPerSample=16] - bits per sample
 * @returns {Blob} a Blob of MIME type audio/wav
 */
function convertToWav(pcmData, sampleRate = 44100, numChannels = 2, bitsPerSample = 16) {
  const buffer = encodeWAV(pcmData, sampleRate, numChannels, bitsPerSample);
  return new Blob([buffer], { type: 'audio/wav' });
}

5.停止录音

source.disconnect() // 停止source

mediaStream.getAudioTracks().forEach((track) => track.stop()); // 停止所有轨道

processor.port.postMessage({ close: true })

processor.disconnect() // 停止音频线程

audioCtx.close() // 停止上下文