FFmpeg音频重采样基本流程

作者：从前慢现在也慢 | 2024-08-06 01:42:32

踩

FFmpeg音频重采样基本流程

流程概述

音频重采样的基本流程为：

申请重采样器上下文
设置重采样器上下文的参数
初始化重采样器
申请数据存放的缓冲区空间
进行重采样

注意，要先设置参数再对重采样器初始化

用到的API

SwrContext：重采样器上下文的结构体。此结构是不透明的，这意味着如果要设置选项，必须使用诸如av_opt_set等函数来设置，而不能直接访问其字段。
struct SwrContext *swr_alloc();，申请重采样器上下文。
int av_opt_set(void *obj, const char *name, const char *val, int search_flags);
int av_opt_set_int(void *obj, const char *name, int64_t val, int search_flags);
int av_opt_set_chlayout(void *obj, const char *name, const AVChannelLayout *layout, int search_flags);
av_opt_set* 函数簇，这里仅列举几个。以av_opt_set为例，用于将obj中名为name的字段设置为指定的val。第一个void* 的obj参数表示要设置的对象，第二个name参数表示要设置的字段名称，以字符串形式传入。例如obj为SwrContext* 对象，name为"in_sample_rate"就对应着SwrContext中的同名字段。中间的参数为要设置的值，最后的search_flags表示搜索标志，一般设为0即可。
int swr_alloc_set_opts2(struct SwrContext **ps, const AVChannelLayout *out_ch_layout, enum AVSampleFormat out_sample_fmt, int out_sample_rate, const AVChannelLayout *in_ch_layout, enum AVSampleFormat in_sample_fmt, int in_sample_rate, int log_offset, void *log_ctx);如果还未分配则分配SwrContext，并设置/重置公共参数。就相当于alloc + set。
int swr_init(struct SwrContext *s);重采样器初始化。必须在设置过SwrContext 参数之后再初始化。
int64_t av_rescale_rnd(int64_t a, int64_t b, int64_t c, enum AVRounding rnd)和int64_t av_rescale(int64_t a, int64_t b, int64_t c)都用于计算 a*b/c，唯一的区别在于前者可以通过rnd指定取整方式（向上取整、向下取整等）。
int av_samples_alloc_array_and_samples(uint8_t ***audio_data, int *linesize, int nb_channels, int nb_samples, enum AVSampleFormat sample_fmt, int align);
申请一个 data[nb_channels][ch_data] 的二维数组，所以audio_data要作为一个三级指针传进去。
void av_freep(void *ptr);释放av_samples_alloc_array_and_samples申请的data。av_freep即使传入null也是安全的。用法示例：
```
uint8_t *buf = av_malloc(16);
av_freep(&buf);
```
int64_t swr_get_delay(struct SwrContext *s, int64_t base);获取下一个输入样本相对于下一个输出样本所经历的延迟帧数。
int swr_convert(struct SwrContext *s, uint8_t * const *out, int out_count, const uint8_t * const *in , int in_count);
音频重采样，in和out是由av_samples_alloc_array_and_samples生成的data缓冲区。in_count和out_count则是对应的缓冲区大小的样本数。
int av_samples_get_buffer_size(int *linesize, int nb_channels, int nb_samples, enum AVSampleFormat sample_fmt, int align);
获取给定音频参数所需的缓冲区大小。

tips

swr是software resample的缩写
nb_samples样本数，表示每帧的每个通道中的采样点数。
重采样的三个关键参数：采样率、采样格式、声道布局。
音频的planar（平面）格式的数据是分在多个数组中的，例如左右声道的data[0]中存放L声道的数据，data[1]中存放R声道的数据。而交错（packed/interleaved）模式的数据则是按照LRLR…的顺序统一放到data[0]中的。
av_freep要取地址的原因，是因为要将指针置空，仅此而已。
老版本的FFmpeg，例如在ffmpeg-4.2下，声道布局只是一个单一的整型字段。而新版本的FFmpeg，以ffmpeg-7.0为例，则是将声道布局封装为一个AVChannelLayout结构体了。所以在设置声道布局字段时，不能再用av_opt_set_int接口，而是要用av_opt_set_chlayout，name参数也要使用"in_chlayout"（输出侧为"out_chlayout"）才行。

demo样例

重采样样例，参考：Examples - resample_audio.c

#include <iostream>
#include <fstream>
#include <string>
#include <cmath>
using namespace std;

extern "C"
{
#include <libavutil/opt.h>
#include <libavutil/channel_layout.h>
#include <libavutil/samplefmt.h>
#include <libswresample/swresample.h>
}

/**
 * Convert a sample format to its format-name string.
 *
 * Only the five interleaved formats listed in the table below are known.
 * For each one the table stores a big-endian and a little-endian name, and
 * AV_NE picks the one matching the byte order of the build.
 *
 * @param sample_fmt  sample format to look up
 * @return the matching name, or an empty string when sample_fmt is not in
 *         the table (the previous version fell off the end of the function
 *         in that case, which is undefined behaviour)
 */
string string_sample_fmt(enum AVSampleFormat sample_fmt)
{
    struct sample_fmt_entry
    {
        enum AVSampleFormat sample_fmt; const char *fmt_be, *fmt_le;
    };
    // Built once; the table never changes between calls.
    static const sample_fmt_entry sample_fmt_entries[] = {
            { AV_SAMPLE_FMT_U8,  "u8",    "u8"    },
            { AV_SAMPLE_FMT_S16, "s16be", "s16le" },
            { AV_SAMPLE_FMT_S32, "s32be", "s32le" },
            { AV_SAMPLE_FMT_FLT, "f32be", "f32le" },
            { AV_SAMPLE_FMT_DBL, "f64be", "f64le" },};

    for (const auto &entry : sample_fmt_entries)
    {
        if (sample_fmt == entry.sample_fmt)
        {
            return AV_NE(entry.fmt_be, entry.fmt_le);
        }
    }

    // Unknown format (e.g. a planar one): report "no name" explicitly.
    return string();
}

/**
 * Fill dst with nb_samples interleaved frames of a 440 Hz sine tone.
 * Adapted from https://ffmpeg.org/doxygen/7.0/resample_audio_8c-example.html
 *
 * Every channel of a frame carries the same value. *t is the running time
 * position: it is read as the time of the first generated sample and is
 * advanced by 1/sample_rate per sample, so consecutive calls continue the
 * same waveform.
 *
 * @param dst          output buffer, at least nb_samples * nb_channels doubles
 * @param nb_samples   number of frames (samples per channel) to generate
 * @param nb_channels  number of interleaved channels per frame
 * @param sample_rate  sample rate used to derive the per-sample time step
 * @param t            in/out time position
 */
void fill_samples(double *dst, int nb_samples, int nb_channels, int sample_rate, double *t)
{
    const double step = 1.0 / sample_rate;
    const double omega = 2 * M_PI * 440.0;
    double *frame = dst;

    for (int n = 0; n < nb_samples; ++n, frame += nb_channels) {
        /* first channel gets the tone, the remaining ones are copies of it */
        frame[0] = sin(omega * *t);
        for (int ch = 1; ch < nb_channels; ++ch)
            frame[ch] = frame[0];
        *t += step;
    }
}

int main()
{
    /* 采样参数定义 */
    // 输入参数
    int src_sample_rate = 48000;
    enum AVSampleFormat src_sample_fmt = AV_SAMPLE_FMT_DBL;
    AVChannelLayout src_ch_layout = AV_CHANNEL_LAYOUT_STEREO; // 立体声
    // 输出参数
    int dst_sample_rate = 44100;
    enum AVSampleFormat dst_sample_fmt = AV_SAMPLE_FMT_S16;
    AVChannelLayout dst_ch_layout = AV_CHANNEL_LAYOUT_STEREO; // 立体声

    // 创建重采样器上下文（暂且认为不会失败）
    SwrContext *swr_ctx = swr_alloc();

    /* 参数设置（SwrContext字段设置） */
    // 输入参数
    check_optset(av_opt_set_int(swr_ctx, "in_sample_rate", src_sample_rate, 0), __LINE__);
    check_optset(av_opt_set_sample_fmt(swr_ctx, "in_sample_fmt", src_sample_fmt, 0), __LINE__);
    check_optset(av_opt_set_chlayout(swr_ctx, "in_chlayout", &src_ch_layout, 0), __LINE__);
    // 输出参数
    check_optset(av_opt_set_int(swr_ctx, "out_sample_rate", dst_sample_rate, 0), __LINE__);
    check_optset(av_opt_set_sample_fmt(swr_ctx, "out_sample_fmt", dst_sample_fmt, 0), __LINE__);
    check_optset(av_opt_set_chlayout(swr_ctx, "out_chlayout", &dst_ch_layout, 0), __LINE__);

    // 参数设置完成后，初始化上下文
    swr_init(swr_ctx);

    // 给输入源分配内存空间
    uint8_t **src_data = nullptr;
    int src_linesize;
    int src_nb_samples = 1024; // 每个通道的样本数
    av_samples_alloc_array_and_samples(&src_data, &src_linesize, src_ch_layout.nb_channels,
                                       src_nb_samples, src_sample_fmt, 0);

    // 给输出源分配内存空间
    uint8_t **dst_data;
    int dst_linesize;
    // 计算输出的信道样本数：a * b / c，AV_ROUND_UP表示向上取整
    int dst_nb_samples = av_rescale_rnd(src_nb_samples, dst_sample_rate, src_sample_rate, AV_ROUND_UP);
    // 分配空间
    av_samples_alloc_array_and_samples(&dst_data, &dst_linesize, dst_ch_layout.nb_channels,
                                       dst_nb_samples, dst_sample_fmt, 0);

    // 采样转换
    double t = 0; // 时间，以输入源的时间为基准
    int max_nb_samples = dst_nb_samples;
    string dst_file_name = "out.pcm";
    ofstream dst_file(dst_file_name, ios_base::out | ios_base::binary);
    while(t < 10)
    {
        // 生成输入源（模拟）
        fill_samples((double*)src_data[0], src_nb_samples, src_ch_layout.nb_channels, src_sample_rate, &t);
        // 获取延迟（dst音频相对src音频延迟的帧数）
        int64_t delay = swr_get_delay(swr_ctx, src_sample_rate);
        // 输出的信道样本数，a * b / c
        dst_nb_samples = av_rescale(delay + src_nb_samples, dst_sample_rate, src_sample_rate);
        // 如果输出缓冲区大小不够，重新申请空间
        if(dst_nb_samples > max_nb_samples)
        {
            // 重新申请空间
            av_freep(&dst_data[0]);
            av_samples_alloc(dst_data, &dst_linesize, dst_ch_layout.nb_channels,
                                   dst_nb_samples, dst_sample_fmt, 1);
            max_nb_samples = dst_nb_samples;
        }
        // 音频重采样
        int ret = swr_convert(swr_ctx, dst_data, dst_nb_samples,
                              (const uint8_t **)src_data, src_nb_samples);
        // 获取给定音频参数所需的缓冲区大小。
        int dst_buf_size = av_samples_get_buffer_size(&dst_linesize, dst_ch_layout.nb_channels,
                                                      ret, dst_sample_fmt, 1);
        // write
        dst_file.write((char*)dst_data[0], dst_buf_size);
    }

    // clear and exit
    // TODO
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141

附录 - SwrContext结构体字段

版本：ffmpeg-7.0

/*
 * Private state of the libswresample resampler context, quoted for
 * reference from the version named above the listing (ffmpeg-7.0).
 *
 * The type is opaque to API users: a context is obtained with swr_alloc()
 * (or swr_alloc_set_opts2()), configured through the av_opt_set* family
 * using option names, and then initialized with swr_init(). The fields
 * below are shown only to illustrate what the resampler tracks.
 */
struct SwrContext {
    const AVClass *av_class;                        ///< AVClass used for AVOption and av_log()
    int log_level_offset;                           ///< logging level offset
    void *log_ctx;                                  ///< parent logging context
    enum AVSampleFormat  in_sample_fmt;             ///< input sample format
    enum AVSampleFormat int_sample_fmt;             ///< internal sample format (AV_SAMPLE_FMT_FLTP or AV_SAMPLE_FMT_S16P)
    enum AVSampleFormat out_sample_fmt;             ///< output sample format
    AVChannelLayout used_ch_layout;                 ///< number of used input channels (mapped channel count if channel_map, otherwise in.ch_count) -- NOTE(review): the field is a full AVChannelLayout, this wording predates that; confirm upstream
    AVChannelLayout  in_ch_layout;                  ///< input channel layout
    AVChannelLayout out_ch_layout;                  ///< output channel layout
    int      in_sample_rate;                        ///< input sample rate
    int     out_sample_rate;                        ///< output sample rate
    int flags;                                      ///< miscellaneous flags such as SWR_FLAG_RESAMPLE
    float slev;                                     ///< surround mixing level
    float clev;                                     ///< center mixing level
    float lfe_mix_level;                            ///< LFE mixing level
    float rematrix_volume;                          ///< rematrixing volume coefficient
    float rematrix_maxval;                          ///< maximum value for rematrixing output
    int matrix_encoding;                            /**< matrixed stereo encoding */
    const int *channel_map;                         ///< channel index (or -1 if muted channel) map
    int engine;                                     ///< presumably selects the resampling engine (swr vs. soxr, both named below) -- TODO confirm

    AVChannelLayout user_used_chlayout;             ///< User set used channel layout
    AVChannelLayout user_in_chlayout;               ///< User set input channel layout
    AVChannelLayout user_out_chlayout;              ///< User set output channel layout
    enum AVSampleFormat user_int_sample_fmt;        ///< User set internal sample format
    int user_dither_method;                         ///< User set dither method

    struct DitherContext dither;

    int filter_size;                                /**< length of each FIR filter in the resampling filterbank relative to the cutoff frequency */
    int phase_shift;                                /**< log2 of the number of entries in the resampling polyphase filterbank */
    int linear_interp;                              /**< if 1 then the resampling FIR filter will be linearly interpolated */
    int exact_rational;                             /**< if 1 then enable non power of 2 phase_count */
    double cutoff;                                  /**< resampling cutoff frequency (swr: 6dB point; soxr: 0dB point). 1.0 corresponds to half the output sample rate */
    int filter_type;                                /**< swr resampling filter type */
    double kaiser_beta;                                /**< swr beta value for Kaiser window (only applicable if filter_type == AV_FILTER_TYPE_KAISER) */
    double precision;                               /**< soxr resampling precision (in bits) */
    int cheby;                                      /**< soxr: if 1 then passband rolloff will be none (Chebyshev) & irrational ratio approximation precision will be higher */

    float min_compensation;                         ///< swr minimum below which no compensation will happen
    float min_hard_compensation;                    ///< swr minimum below which no silence inject / sample drop will happen
    float soft_compensation_duration;               ///< swr duration over which soft compensation is applied
    float max_soft_compensation;                    ///< swr maximum soft compensation in seconds over soft_compensation_duration
    float async;                                    ///< swr simple 1 parameter async, similar to ffmpegs -async
    int64_t firstpts_in_samples;                    ///< swr first pts in samples

    int resample_first;                             ///< 1 if resampling must come first, 0 if rematrixing
    int rematrix;                                   ///< flag to indicate if rematrixing is needed (basically if input and output layouts mismatch)
    int rematrix_custom;                            ///< flag to indicate that a custom matrix has been defined

    AudioData in;                                   ///< input audio data
    AudioData postin;                               ///< post-input audio data: used for rematrix/resample
    AudioData midbuf;                               ///< intermediate audio data (postin/preout)
    AudioData preout;                               ///< pre-output audio data: used for rematrix/resample
    AudioData out;                                  ///< converted output audio data
    AudioData in_buffer;                            ///< cached audio data (convert and resample purpose)
    AudioData silence;                              ///< temporary with silence
    AudioData drop_temp;                            ///< temporary used to discard output
    int in_buffer_index;                            ///< cached buffer position
    int in_buffer_count;                            ///< cached buffer length
    int resample_in_constraint;                     ///< 1 if the input end was reach before the output end, 0 otherwise
    int flushed;                                    ///< 1 if data is to be flushed and no further input is expected
    int64_t outpts;                                 ///< output PTS
    int64_t firstpts;                               ///< first PTS
    int drop_output;                                ///< number of output samples to drop
    double delayed_samples_fixup;                   ///< soxr 0.1.1: needed to fixup delayed_samples after flush has been called.

    struct AudioConvert *in_convert;                ///< input conversion context
    struct AudioConvert *out_convert;               ///< output conversion context
    struct AudioConvert *full_convert;              ///< full conversion context (single conversion for input and output)
    struct ResampleContext *resample;               ///< resampling context
    struct Resampler const *resampler;              ///< resampler virtual function table

    double matrix[SWR_CH_MAX][SWR_CH_MAX];          ///< floating point rematrixing coefficients
    float matrix_flt[SWR_CH_MAX][SWR_CH_MAX];       ///< single precision floating point rematrixing coefficients
    uint8_t *native_matrix;
    uint8_t *native_one;
    uint8_t *native_simd_one;
    uint8_t *native_simd_matrix;
    int32_t matrix32[SWR_CH_MAX][SWR_CH_MAX];       ///< 17.15 fixed point rematrixing coefficients
    uint8_t matrix_ch[SWR_CH_MAX][SWR_CH_MAX+1];    ///< Lists of input channels per output channel that have non zero rematrixing coefficients
    mix_1_1_func_type *mix_1_1_f;
    mix_1_1_func_type *mix_1_1_simd;

    mix_2_1_func_type *mix_2_1_f;
    mix_2_1_func_type *mix_2_1_simd;

    mix_any_func_type *mix_any_f;

    /* TODO: callbacks for ASM optimizations */
};
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92

本文内容由网友自发贡献，转载请注明出处：【wpsshop博客】

FFmpeg音频重采样基本流程

目录

流程概述

用到的API

tips

demo样例

附录 - SwrContext结构体字段