linux下lame&alsa进行音频流操作(八)用ffmpeg将mp3转为wav

360 阅读3分钟

1. 利用ffmpeg将mp3解码为pcm,再在pcm数据前面加上wav头,就得到一个完整的wav文件

2. 代码

#include "utils.h"
#include <libavutil/avutil.h>
#include <libavutil/attributes.h>
#include <libavutil/opt.h>
#include <libavutil/mathematics.h>
#include <libavutil/imgutils.h>
#include <libavutil/samplefmt.h>
#include <libavutil/timestamp.h>
#include <libavformat/avformat.h>
#include <libavcodec/avcodec.h>
#include <libswscale/swscale.h>
#include <libavutil/mathematics.h>
#include <libswresample/swresample.h>
#include <libavutil/channel_layout.h>
#include <libavutil/common.h>
#include <libavformat/avio.h>
#include <libavutil/file.h>
#include <libswresample/swresample.h>

#define AVCODEC_MAX_AUDIO_FRAME_SIZE 192000
// The following three structures describe the layout of the WAV file header
/*
 * RIFF container header: the first 12 bytes of a WAV file.
 *
 * Fixed-width types are used because this structure mirrors an on-disk
 * layout; every field is exactly 4 bytes and no padding is introduced.
 */
typedef struct {
    uint32_t magic;      /* 'RIFF' tag */
    uint32_t length;     /* number of bytes that follow this field in the file */
    uint32_t type;       /* 'WAVE' tag */
} WaveHeader;

/*
 * Payload of the 'fmt ' chunk (16 bytes) describing the PCM stream.
 *
 * Fixed-width types are used because this structure mirrors an on-disk
 * layout: 2 + 2 + 4 + 4 + 2 + 2 bytes with no padding.
 */
typedef struct {
    uint16_t format;       /* encoding tag; 0x0001 means PCM */
    uint16_t channels;     /* number of channels */
    uint32_t sample_fq;    /* sample rate in Hz */
    uint32_t byte_p_sec;   /* bytes per second = sample rate * channels * bytes per sample */
    uint16_t byte_p_spl;   /* bytes per sample frame (all channels), i.e. block alignment */
    uint16_t bit_p_spl;    /* bits per single-channel sample, e.g. 8 or 16 */
} WaveFmtBody;

/*
 * Generic RIFF sub-chunk header (8 bytes), used for both the 'fmt ' and
 * the 'data' chunk.
 */
typedef struct {
    uint32_t type;        /* chunk tag, e.g. 'fmt ' or 'data' */
    uint32_t length;      /* size in bytes of the chunk payload that follows */
} WaveChunkHeader;

/*
 * Pack four characters into a 32-bit tag with the first character in the
 * least-significant byte, so that storing the value on a little-endian host
 * yields the characters in reading order.
 *
 * Each argument is narrowed to a byte and widened to uint32_t before the
 * shift: shifting a signed int into the sign bit is undefined behaviour, and
 * a negative char must not sign-extend into the other bytes.
 */
#define COMPOSE_ID(a,b,c,d) ((uint32_t)(uint8_t)(a) | \
                             ((uint32_t)(uint8_t)(b) << 8) | \
                             ((uint32_t)(uint8_t)(c) << 16) | \
                             ((uint32_t)(uint8_t)(d) << 24))
#define WAV_RIFF COMPOSE_ID('R','I','F','F')   /* RIFF container tag */
#define WAV_WAVE COMPOSE_ID('W','A','V','E')   /* WAVE form type */
#define WAV_FMT COMPOSE_ID('f','m','t',' ')    /* format chunk tag */
#define WAV_DATA COMPOSE_ID('d','a','t','a')   /* sample data chunk tag */
/* Store a 16-bit value at p in little-endian byte order. */
static void wav_put_le16(uint8_t *p, uint16_t v)
{
    p[0] = (uint8_t)(v & 0xFFu);
    p[1] = (uint8_t)((v >> 8) & 0xFFu);
}

/* Store a 32-bit value at p in little-endian byte order. */
static void wav_put_le32(uint8_t *p, uint32_t v)
{
    p[0] = (uint8_t)(v & 0xFFu);
    p[1] = (uint8_t)((v >> 8) & 0xFFu);
    p[2] = (uint8_t)((v >> 16) & 0xFFu);
    p[3] = (uint8_t)((v >> 24) & 0xFFu);
}

/*
 * Write the 44-byte canonical WAV header at the start of fp.
 *
 * The header describes 44.1 kHz, stereo, signed 16-bit PCM and has the same
 * byte layout as WaveHeader + WaveChunkHeader('fmt ') + WaveFmtBody +
 * WaveChunkHeader('data'). Fields are serialized byte by byte so the result
 * is little-endian (as the WAV format requires) on any host.
 *
 * fp        stream opened for writing; the file position is moved to just
 *           past the header.
 * data_len  size in bytes of the PCM payload that follows the header.
 *
 * Returns 0 on success, -1 if fp is NULL, data_len is negative or does not
 * fit the 32-bit size fields, or if seeking/writing fails.
 */
int insert_wave_header(FILE* fp, long data_len)
{
    enum {
        HDR_TOTAL_BYTES = 44,     /* 12 (RIFF) + 8 + 16 ('fmt ') + 8 ('data') */
        HDR_AFTER_RIFF_LEN = 36,  /* header bytes counted by the RIFF length field */
        FMT_BODY_BYTES = 16,
        PCM_FORMAT_TAG = 1,       /* uncompressed PCM */
        PCM_CHANNELS = 2,
        PCM_SAMPLE_RATE = 44100,
        PCM_BITS_PER_SAMPLE = 16,
        PCM_BLOCK_ALIGN = PCM_CHANNELS * PCM_BITS_PER_SAMPLE / 8
    };
    uint8_t hdr[HDR_TOTAL_BYTES];

    if (fp == NULL || data_len < 0)
        return -1;
    /* The RIFF length field is 32 bits wide and also counts 36 header bytes. */
    if ((unsigned long)data_len > 0xFFFFFFFFUL - HDR_AFTER_RIFF_LEN)
        return -1;

    /* RIFF container header */
    hdr[0] = 'R'; hdr[1] = 'I'; hdr[2] = 'F'; hdr[3] = 'F';
    wav_put_le32(hdr + 4, (uint32_t)data_len + HDR_AFTER_RIFF_LEN);
    hdr[8] = 'W'; hdr[9] = 'A'; hdr[10] = 'V'; hdr[11] = 'E';

    /* 'fmt ' chunk */
    hdr[12] = 'f'; hdr[13] = 'm'; hdr[14] = 't'; hdr[15] = ' ';
    wav_put_le32(hdr + 16, FMT_BODY_BYTES);
    wav_put_le16(hdr + 20, PCM_FORMAT_TAG);
    wav_put_le16(hdr + 22, PCM_CHANNELS);
    wav_put_le32(hdr + 24, PCM_SAMPLE_RATE);
    wav_put_le32(hdr + 28, (uint32_t)PCM_SAMPLE_RATE * PCM_BLOCK_ALIGN); /* bytes per second */
    wav_put_le16(hdr + 32, PCM_BLOCK_ALIGN);
    wav_put_le16(hdr + 34, PCM_BITS_PER_SAMPLE);

    /* 'data' chunk header; the PCM payload follows it in the file */
    hdr[36] = 'd'; hdr[37] = 'a'; hdr[38] = 't'; hdr[39] = 'a';
    wav_put_le32(hdr + 40, (uint32_t)data_len);

    /* The header lives at the very beginning of the file. */
    if (fseek(fp, 0, SEEK_SET) != 0)
        return -1;
    if (fwrite(hdr, 1, sizeof hdr, fp) != sizeof hdr)
        return -1;
    return 0;
}

/*
 * Decoder state shared between init_ffmpeg() and main().
 */
typedef struct {
    int videoindex;                 /* result of av_find_best_stream() for AVMEDIA_TYPE_VIDEO */
    int sndindex;                   /* result of av_find_best_stream() for AVMEDIA_TYPE_AUDIO; init_ffmpeg() starts it at -1 */
    AVFormatContext* pFormatCtx;    /* demuxer context opened by init_ffmpeg() */
    AVCodecContext* sndCodecCtx;    /* codec context taken from the selected audio stream */
    AVCodec* sndCodec;              /* decoder looked up from sndCodecCtx->codec_id */
    SwrContext *swr_ctx;            /* resampler; created in main() when the first frame is decoded */
    /* 16-byte aligned buffer that main() hands to swr_convert() as its output */
    DECLARE_ALIGNED(16,uint8_t,audio_buf) [AVCODEC_MAX_AUDIO_FRAME_SIZE * 4];
}AudioState;

/*
 * Open the input file, pick its audio stream and open a decoder for it.
 *
 * is        state to fill in; on success pFormatCtx, sndindex, sndCodecCtx
 *           and sndCodec are valid. videoindex holds the video stream index
 *           or a negative error code when the file has no video stream.
 * filepath  path of the media file to open.
 *
 * Returns 0 on success. Returns -1 if an argument is NULL, the file cannot
 * be opened or probed, it contains no audio stream, or no decoder can be
 * opened. On failure the format context is closed again and is->pFormatCtx
 * is NULL, so the caller has nothing to release.
 */
int init_ffmpeg(AudioState* is, char* filepath)
{
    if(NULL == is)
        return -1;

    is->videoindex = -1;
    is->sndindex = -1;
    is->sndCodecCtx = NULL;
    is->sndCodec = NULL;
    if(NULL == filepath)
    {
        dbmsg("input file is NULL");
        return -1;
    }
    avcodec_register_all();
    av_register_all();

    is->pFormatCtx = avformat_alloc_context();
    if(NULL == is->pFormatCtx)
        return -1;

    /* On failure avformat_open_input() frees the context and NULLs the pointer. */
    if(avformat_open_input(&is->pFormatCtx, filepath, NULL, NULL)!=0)
        return -1;

    if(avformat_find_stream_info(is->pFormatCtx, NULL)<0)
        goto fail;
    av_dump_format(is->pFormatCtx, 0, filepath, 0);

    /* -1 as wanted_stream_nb asks for automatic selection of the best stream. */
    is->videoindex = av_find_best_stream(is->pFormatCtx, AVMEDIA_TYPE_VIDEO, -1, -1, NULL, 0);
    is->sndindex = av_find_best_stream(is->pFormatCtx, AVMEDIA_TYPE_AUDIO, -1, is->videoindex, NULL, 0);
    dbmsg("videoindex=%d, sndindex=%d", is->videoindex, is->sndindex);

    /* av_find_best_stream() reports failure with a negative AVERROR code, not just -1. */
    if(is->sndindex < 0)
    {
        dbmsg("no audio stream found");
        goto fail;
    }

    is->sndCodecCtx = is->pFormatCtx->streams[is->sndindex]->codec;
    is->sndCodec = avcodec_find_decoder(is->sndCodecCtx->codec_id);
    if(is->sndCodec == NULL)
    {
        dbmsg("Codec not found");
        goto fail;
    }
    if(avcodec_open2(is->sndCodecCtx, is->sndCodec, NULL) < 0)
        goto fail;
    return 0;

fail:
    /* The codec context belongs to the stream, so it dies with the format context. */
    avformat_close_input(&is->pFormatCtx);
    is->sndCodecCtx = NULL;
    is->sndCodec = NULL;
    is->sndindex = -1;
    return -1;
}

/* Output PCM format produced by the resampler: 44.1 kHz, stereo, signed 16-bit. */
enum {
    OUT_SAMPLE_RATE = 44100,
    OUT_BYTES_PER_FRAME = 4,                          /* 2 channels * 2 bytes */
    OUT_MAX_FRAMES = 0x7FFFFFFF / OUT_BYTES_PER_FRAME /* keeps the byte count within a 32-bit long */
};

/*
 * Append `frames` converted sample frames from is->audio_buf to fp and add
 * them to *total_frames. Returns 0 on success, -1 on a short write or when
 * the output would grow beyond OUT_MAX_FRAMES.
 */
static int write_pcm(AudioState *is, FILE *fp, int frames, long *total_frames)
{
    if (frames <= 0)
        return 0;
    if (*total_frames > OUT_MAX_FRAMES - frames) {
        dbmsg("output too large for a WAV file");
        return -1;
    }
    if (fwrite(is->audio_buf, OUT_BYTES_PER_FRAME, (size_t)frames, fp) != (size_t)frames) {
        dbmsg("fwrite failed");
        return -1;
    }
    *total_frames += frames;
    return 0;
}

/*
 * Create the resampler on first use, configured from the decoded frame's
 * channel count, sample format and sample rate. Returns 0 on success, -1 if
 * the context cannot be allocated or initialized.
 */
static int ensure_resampler(AudioState *is, const AVFrame *frame)
{
    if (is->swr_ctx != NULL)
        return 0;

    dbmsg("frame: channnels=%d,format=%d, sample_rate=%d", frame->channels, frame->format, frame->sample_rate);
    is->swr_ctx = swr_alloc_set_opts(NULL, AV_CH_LAYOUT_STEREO, AV_SAMPLE_FMT_S16, OUT_SAMPLE_RATE,
                                     av_get_default_channel_layout(frame->channels),
                                     frame->format, frame->sample_rate, 0, NULL);
    if (is->swr_ctx == NULL) {
        dbmsg("swr_ctx == NULL");
        return -1;
    }
    if (swr_init(is->swr_ctx) < 0) {
        dbmsg("swr_init failed");
        swr_free(&is->swr_ctx);
        return -1;
    }
    return 0;
}

/*
 * Decode one audio packet, convert the result to the output PCM format and
 * append it to fp.
 *
 * Returns 1 to keep reading, 0 when the decoder reported an error (reading
 * stops but the data written so far is kept), -1 on a fatal error.
 */
static int decode_and_write(AudioState *is, AVPacket *packet, AVFrame *frame, FILE *fp, long *total_frames)
{
    uint8_t *out[] = { is->audio_buf };
    int got_frame = 0;
    int converted;

    if (avcodec_decode_audio4(is->sndCodecCtx, frame, &got_frame, packet) < 0) {
        dbmsg("file eof");
        return 0;
    }
    if (got_frame <= 0)     /* no data yet, the decoder needs more packets */
        return 1;

    if (ensure_resampler(is, frame) < 0)
        return -1;

    /* OUT_SAMPLE_RATE frames * 4 bytes fits well inside audio_buf. */
    converted = swr_convert(is->swr_ctx, out, OUT_SAMPLE_RATE,
                            (const uint8_t **)frame->extended_data, frame->nb_samples);
    if (converted < 0) {
        dbmsg("swr_convert failed");
        return -1;
    }
    return write_pcm(is, fp, converted, total_frames) < 0 ? -1 : 1;
}

/*
 * Decode the audio stream of the file named by argv[1] and write it to
 * ./test.wav as 44.1 kHz stereo 16-bit PCM.
 *
 * Space for the WAV header is reserved first, the PCM data is appended as it
 * is decoded, and the header is filled in at the end once the data size is
 * known. Returns 0 on success, -1 on any failure.
 */
int main(int argc, char **argv)
{
    int rc = -1;
    int len1;
    int status = 1;
    long total_frames = 0;
    long file_data_size;
    FILE *fp = NULL;
    AVPacket *packet = NULL;
    AVFrame *frame = NULL;
    AudioState *is = NULL;

    if (argc < 2 || argv[1] == NULL) {
        dbmsg("usage: mp3towav <input file>");
        return -1;
    }

    packet = av_mallocz(sizeof(AVPacket));
    frame = av_frame_alloc();
    is = av_mallocz(sizeof(AudioState));
    if (packet == NULL || frame == NULL || is == NULL) {
        dbmsg("out of memory");
        goto cleanup;
    }

    fp = fopen("./test.wav", "wb+");
    if (fp == NULL) {
        dbmsg("cannot open ./test.wav");
        goto cleanup;
    }

    /* Reserve room for the WAV header (44 bytes); it is written last. */
    len1 = (int)(sizeof(WaveHeader)+sizeof(WaveFmtBody)+sizeof(WaveChunkHeader)*2);
    if (fseek(fp, len1, SEEK_SET) != 0) {
        dbmsg("fseek failed");
        goto cleanup;
    }
    dbmsg("len1=%d",len1);

    /* Step 1: open the input and its audio decoder. */
    if (init_ffmpeg(is, argv[1]) != 0) {
        dbmsg("init_ffmpeg error");
        goto cleanup;
    }
    if (is->sndindex < 0 || is->sndCodecCtx == NULL) {
        dbmsg("no audio stream to decode");
        goto cleanup;
    }

    /* Step 2: read, decode, convert and write every audio packet. */
    while (status > 0 && av_read_frame(is->pFormatCtx, packet) >= 0) {
        if (packet->stream_index == is->sndindex)
            status = decode_and_write(is, packet, frame, fp, &total_frames);
        /* Every packet returned by av_read_frame() must be released. */
        av_free_packet(packet);
    }
    if (status < 0)
        goto cleanup;

    /* Step 3: drain samples still buffered inside the resampler. */
    if (is->swr_ctx != NULL) {
        uint8_t *out[] = { is->audio_buf };
        int drained;

        while ((drained = swr_convert(is->swr_ctx, out, OUT_SAMPLE_RATE, NULL, 0)) > 0) {
            if (write_pcm(is, fp, drained, &total_frames) < 0)
                goto cleanup;
        }
    }

    file_data_size = total_frames * OUT_BYTES_PER_FRAME;
    dbmsg("file_data_size=%ld", file_data_size);

    /* Step 4: go back and fill in the WAV header. */
    if (insert_wave_header(fp, file_data_size) != 0) {
        dbmsg("insert_wave_header error");
        goto cleanup;
    }
    rc = 0;

cleanup:
    /* fclose() flushes buffered data, so its failure means an incomplete file. */
    if (fp != NULL && fclose(fp) != 0)
        rc = -1;
    if (is != NULL) {
        swr_free(&is->swr_ctx);
        if (is->pFormatCtx != NULL) {
            if (is->sndCodecCtx != NULL)
                avcodec_close(is->sndCodecCtx);
            avformat_close_input(&is->pFormatCtx);
        }
        av_free(is);
    }
    av_frame_free(&frame);
    av_free(packet);
    return rc;
}

3.运行结果

cong@msi:/work/ffmpeg/test/alsa/testalsa/5mp3towav$ make run
export LD_LIBRARY_PATH=/work/ffmpeg/out/lib/ \
    && ./mp3towav /work/ffmpeg/test/resource//test.mp3
mp3towav.c:main[150]: len1=44
[mp3 @ 0x14d3620] Skipping 0 bytes of junk at 197687.
libavutil/crc.c:av_crc_init[313]:
[mp3 @ 0x14d3620] Estimating duration from bitrate, this may be inaccurate
Input #0, mp3, from '(null)':
  Metadata:
    artist : 佚名
    title : 法国国歌 马赛曲
    TYER : 2013-10-26
  Duration: 00:03:28.20, start: 0.000000, bitrate: 199 kb/s
    Stream #0:0: Audio: mp3, 44100 Hz, stereo, s16p, 192 kb/s
    Stream #0:1: Video: mjpeg, yuvj420p(pc, bt470bg/unknown/unknown), 600x600 [SAR 1:1 DAR 1:1], 90k tbr, 90k tbn, 90k tbc
    Metadata:
      title : e
      comment : Cover (front)
mp3towav.c:init_ffmpeg[120]: videoindex=-1381258232, sndindex=0
mp3towav.c:main[173]: frame: channnels=2,format=6, sample_rate=44100
mp3towav.c:main[186]: file_data_size=36725760

ls查看

cong@msi:/work/ffmpeg/test/alsa/testalsa/5mp3towav$ ls -l
total 36064
-rw-rw-r-- 1 cong cong 885 Sep 11 11:25 Makefile
-rwxrwxr-x 1 cong cong 64126 Sep 11 11:44 mp3towav
-rw-rw-r-- 1 cong cong 6183 Sep 11 11:24 mp3towav.c
-rw-rw-r-- 1 cong cong 115344 Sep 11 11:44 mp3towav.o
-rw-rw-r-- 1 cong cong 36725804 Sep 11 11:44 test.wav
-rw-rw-r-- 1 cong cong 333 Sep 9 11:31 utils.h

4. 说明

mp3towav.c:main[173]: AV_CH_LAYOUT_STEREO=3, AV_SAMPLE_FMT_S16=1, freq=44100
mp3towav.c:main[174]: frame: channnels=2, default_layout=3, format=6, sample_rate=44100

ffmpeg中:include/libavutil/samplefmt.h
enum AVSampleFormat {
    AV_SAMPLE_FMT_NONE = -1,
    AV_SAMPLE_FMT_U8, ///< unsigned 8 bits
    AV_SAMPLE_FMT_S16, ///< signed 16 bits    --> 1 这个是pcm的数据格式
    AV_SAMPLE_FMT_S32, ///< signed 32 bits
    AV_SAMPLE_FMT_FLT, ///< float
    AV_SAMPLE_FMT_DBL, ///< double

    AV_SAMPLE_FMT_U8P, ///< unsigned 8 bits, planar
    AV_SAMPLE_FMT_S16P, ///< signed 16 bits, planar  -->6 这个是ffmpeg解码之后的数据格式
    AV_SAMPLE_FMT_S32P, ///< signed 32 bits, planar
    AV_SAMPLE_FMT_FLTP, ///< float, planar
    AV_SAMPLE_FMT_DBLP, ///< double, planar

    AV_SAMPLE_FMT_NB ///< Number of sample formats. DO NOT USE if linking dynamically
};

interleaved -->理解为交叉存取 --> AV_SAMPLE_FMT_S16是两个声道的声音是交叉存储的
planar --> 理解为平面存取 --> AV_SAMPLE_FMT_S16P是先存1个声道的数据再存另一个声道的数据

AV_SAMPLE_FMT_S16P is planar signed 16 bit audio, i.e. 2 bytes for each sample which is same for AV_SAMPLE_FMT_S16.

The only difference is in AV_SAMPLE_FMT_S16 samples of each channel are interleaved i.e. if you have two channel audio then the samples buffer will look like

c1 c1 c2 c2 c1 c1 c2 c2… -->AV_SAMPLE_FMT_S16的数据组织方式

where c1 is a sample for channel1 and c2 is sample for channel2.

while for one frame of planar audio you will have something like

c1 c1 c1 c1 … c2 c2 c2 c2 … -->AV_SAMPLE_FMT_S16P的数据组织方式

now how is it stored in AVFrame:

for planar audio:

data[i] will contain the data of channel i (assuming channel 0 is first channel).

however, if you have more than 8 channels, the data for the remaining channels can be found in the extended_data attribute of AVFrame.

for non-planar audio

data[0] will contain the data for all channels in an interleaved manner.

参考文章:
What is the difference between AV_SAMPLE_FMT_S16P and AV_SAMPLE_FMT_S16?