/* FFmpeg/libavcodec/libaac_nextenc.c */

/*
 * Libaac-next encoder (libxaac based)
 * Copyright (c) 2025 Wrapper
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file
 * Interface to libaac-next encoder.
 */

#include <libaac.h>

#include "libavutil/channel_layout.h"
#include "libavutil/internal.h"
#include "libavutil/intreadwrite.h"
#include "libavutil/log.h"
#include "libavutil/opt.h"
#include "libavutil/mem.h"
#include "avcodec.h"
#include "defs.h"
#include "audio_frame_queue.h"
#include "codec_internal.h"
#include "encode.h"
#include "libavutil/samplefmt.h"
#include "profiles.h"

/**
 * Private codec context of the libaac wrapper.
 *
 * The AVClass pointer comes first so the AVOption table can address the
 * option-backed members below through offsetof().
 */
typedef struct libaacEncodeCTX {
    const AVClass *class;

    /* Encoder handle returned by aac_encode_open(); NULL until init succeeds. */
    AACContext *encoder;

    /* Non-zero once the initial padding has been written into a packet's
     * skip-samples side data. */
    int delay_sent;
    /* Delay samples still to be drained at end of stream; seeded from the
     * encoder's inputDelay and reduced by frame_size per flushed packet. */
    int flush_delay;

    /* Values filled in through the AVOption table. */
    int eld_v2;       /* NOTE(review): not read by any code in this file */
    int esbr;
    int frame_length;
    int iq;
    int tns;

    /* Queue used to derive pts/duration for output packets. */
    AudioFrameQueue afq;
} libaacEncodeCTX;

/**
 * Private options of the libaac encoder.
 *
 * Each entry stores its value in the matching libaacEncodeCTX member. The
 * "iq" entry must point at the iq member: when it aliased frame_length, its
 * default of 2 overwrote the frame_length default and iq could never be set.
 */
static const AVOption aac_enc_options[] = {
    { "eld_v2", "Enable ELDv2 (LD-MPS extension for ELD stereo signals)", offsetof(libaacEncodeCTX, eld_v2), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_ENCODING_PARAM },
    { "esbr", "Enable the use of Enhanced SBR", offsetof(libaacEncodeCTX, esbr), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_ENCODING_PARAM },
    { "frame_length", "The desired frame length", offsetof(libaacEncodeCTX, frame_length), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 1024, AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_ENCODING_PARAM },
    { "iq", "Inverse quantization", offsetof(libaacEncodeCTX, iq), AV_OPT_TYPE_INT, { .i64 = 2 }, 0, 2, AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_ENCODING_PARAM },
    { "tns", "Temporal Noise Shaping", offsetof(libaacEncodeCTX, tns), AV_OPT_TYPE_INT, { .i64 = 1 }, 0, 1, AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_ENCODING_PARAM },
    FF_AAC_PROFILE_OPTS
    { NULL }
};

/* AVClass tying the private option table to the encoder's log/option context. */
static const AVClass aac_enc_class = {
    .class_name = "libaac",
    .option     = aac_enc_options,
    .item_name  = av_default_item_name,
    .version    = LIBAVUTIL_VERSION_INT,
};

/**
 * Error callback registered with the encoder settings.
 *
 * Logs the message at AV_LOG_ERROR regardless of isFatal.
 *
 * @param errorCode library error code, printed as 8 hex digits
 * @param section   text printed before the message
 * @param errorMsg  message text
 * @param isFatal   not used by this handler
 * @param handle    used as the av_log() context (the original code treats it
 *                  as an AVCodecContext)
 */
static void aac_enc_error_handler(uint32_t errorCode, const char *section, const char *errorMsg, bool isFatal, void *handle)
{
    av_log(handle, AV_LOG_ERROR, "%s: %s (0x%08X)\n", section, errorMsg, errorCode);
}

/**
 * Open the libaac encoder for a codec context.
 *
 * Checks the channel count (1..6), maps avctx->profile onto the library's
 * profile value, opens the encoder and derives frame_size, initial_padding
 * and the end-of-stream flush budget from the opened encoder. When
 * AV_CODEC_FLAG_GLOBAL_HEADER is set, the encoder's asc buffer (decoder
 * specific info) is exported through avctx->extradata / extradata_size;
 * otherwise the encoder is configured for ADTS output.
 *
 * @param avctx codec context; priv_data is a libaacEncodeCTX
 * @return 0 on success; AVERROR(EINVAL) for an unsupported channel count or
 *         profile, or when aac_encode_open() fails; AVERROR(ENOMEM) when the
 *         extradata buffer cannot be allocated
 */
static av_cold int libaac_encode_init(AVCodecContext *avctx)
{
    libaacEncodeCTX *s = avctx->priv_data;
    AACSettings cfg = {0};

    /* number of channels */
    if (avctx->ch_layout.nb_channels < 1 || avctx->ch_layout.nb_channels > 6)
    {
        av_log(avctx, AV_LOG_ERROR, "encoding %d channel(s) is not allowed\n", avctx->ch_layout.nb_channels);
        return AVERROR(EINVAL);
    }

    cfg.sampleRate = avctx->sample_rate;
    cfg.noChannels = avctx->ch_layout.nb_channels;
    cfg.bitsPerSamples = avctx->sample_fmt == AV_SAMPLE_FMT_FLT ? 32 : 16;
    cfg.bitrate = avctx->bit_rate;
    /* ADTS framing is requested only when no global header is wanted. */
    cfg.adts = !(avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER);
    cfg.cutoff = avctx->cutoff;
    switch (avctx->profile) {
        case AV_PROFILE_AAC_LOW:
        case AV_PROFILE_UNKNOWN:
            /* An unset profile falls back to LC. */
            cfg.profile = AAC_LC;
            break;

        case AV_PROFILE_AAC_HE:
            cfg.profile = AAC_HE;
            break;

        case AV_PROFILE_AAC_HE_V2:
            cfg.profile = AAC_HEV2;
            break;

        case AV_PROFILE_AAC_LD:
            cfg.profile = AAC_LD;
            break;

        case AV_PROFILE_AAC_ELD:
            cfg.profile = AAC_ELD;
            break;

        default:
            av_log(avctx, AV_LOG_ERROR, "unsupported profile, supported profiles are LC, HE, HEv2, LD and ELD\n");
            return AVERROR(EINVAL);
    }
    cfg.tns = s->tns;
    cfg.frameSize = s->frame_length;
    cfg.eSBR = s->esbr;
    cfg.iq = s->iq;
    /* NOTE(review): s->eld_v2 is not forwarded to the encoder settings here;
     * confirm whether AACSettings has a matching field. */

    cfg.errorHandleCtx = avctx;
    cfg.errorHandler = aac_enc_error_handler;

    s->encoder = aac_encode_open(cfg);

    if (!s->encoder)
    {
        return AVERROR(EINVAL);
    }

    /* no_samples is divided by the channel count to get the per-channel
     * frame size. */
    avctx->frame_size = s->encoder->no_samples / avctx->ch_layout.nb_channels;
    avctx->initial_padding = s->encoder->inputDelay;
    s->flush_delay = s->encoder->inputDelay;

    av_log(avctx, AV_LOG_TRACE, "frame size: %d, initial delay: %d\n", avctx->frame_size, avctx->initial_padding);

    ff_af_queue_init(avctx, &s->afq);

    /* Set decoder specific info */
    avctx->extradata_size = 0;
    if (avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER)
    {
        avctx->extradata = av_mallocz(s->encoder->ascSize + AV_INPUT_BUFFER_PADDING_SIZE);

        if (!avctx->extradata)
        {
            /* Release the encoder here instead of relying on the close
             * callback being run after a failed init. Clearing the pointer
             * keeps a later close call harmless. */
            aac_encode_close(s->encoder);
            s->encoder = NULL;
            return AVERROR(ENOMEM);
        }

        memcpy(avctx->extradata, s->encoder->asc, s->encoder->ascSize);
        /* Publish the payload length; with a size of 0 the header would be
         * invisible to anything reading avctx->extradata. */
        avctx->extradata_size = s->encoder->ascSize;
    }
    return 0;
}

/**
 * Encode one frame, or drain the encoder when frame is NULL.
 *
 * A packet of max_out_bytes is allocated first. With a frame, the frame is
 * queued for timestamp tracking and its first data plane is handed to the
 * encoder. Without a frame, the encoder is called with no input until the
 * flush budget (flush_delay) is used up. The packet's pts/duration come from
 * the audio frame queue, and skip-samples side data carries the initial
 * padding (first packet only) and any trailing discard padding.
 *
 * @param avctx      codec context
 * @param pkt        output packet
 * @param frame      input frame, or NULL to drain
 * @param got_packet set to 1 when pkt holds output
 * @return 0 on success (also when draining is finished and no packet is
 *         produced), a negative AVERROR code otherwise
 */
static int libaac_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
                                const AVFrame *frame, int *got_packet)
{
    libaacEncodeCTX *s = avctx->priv_data;
    uint8_t *input = NULL;
    int input_bytes = 0;
    int padding;
    int shrank, has_duration;
    int err;

    err = ff_alloc_packet(avctx, pkt, s->encoder->max_out_bytes);
    if (err < 0)
        return err;

    if (frame) {
        /* Encoding */
        err = ff_af_queue_add(&s->afq, frame);
        if (err < 0)
            return err;

        input       = frame->data[0];
        input_bytes = av_get_bytes_per_sample(avctx->sample_fmt) * avctx->ch_layout.nb_channels * frame->nb_samples;
        av_log(avctx, AV_LOG_TRACE, "encode size: %d\n", input_bytes);
    } else {
        /* Flushing */
        av_log(avctx, AV_LOG_TRACE, "flush_delay: %d\n", s->flush_delay);

        if (s->flush_delay <= 0)
            return 0;
    }

    /* pkt->size is passed by address so the encoder can store the number of
     * bytes it produced. */
    err = aac_encode(s->encoder, input, input_bytes, pkt->data, (unsigned int *)&pkt->size);
    if (err < 0)
        return AVERROR(EINVAL);

    if (!frame)
        s->flush_delay -= avctx->frame_size;

    ff_af_queue_remove(&s->afq, avctx->frame_size, &pkt->pts, &pkt->duration);

    /* discard padding copied from fdkaac encoder */
    padding = avctx->frame_size - pkt->duration;

    /* Check if subtraction resulted in an overflow: the padding must be
     * smaller than a full frame exactly when the duration is positive. */
    shrank       = padding < avctx->frame_size;
    has_duration = pkt->duration > 0;
    if (shrank != has_duration) {
        av_log(avctx, AV_LOG_ERROR, "discard padding overflow\n");
        return AVERROR(EINVAL);
    }

    if (padding > 0 || (!s->delay_sent && avctx->initial_padding > 0)) {
        uint8_t *skip = av_packet_new_side_data(pkt, AV_PKT_DATA_SKIP_SAMPLES, 10);

        if (!skip)
            return AVERROR(ENOMEM);

        /* Offset 0: initial padding, written once; offset 4: discard padding. */
        if (!s->delay_sent) {
            AV_WL32(skip, avctx->initial_padding);
            s->delay_sent = 1;
        }
        AV_WL32(skip + 4, padding);
    }

    pkt->flags |= AV_PKT_FLAG_KEY;
    *got_packet = 1;
    return 0;
}

/**
 * Flush callback: drop all queued timestamps and push one empty encode call
 * through the library, discarding its output.
 *
 * @param avctx codec context; priv_data is a libaacEncodeCTX
 */
static void libaac_encode_flush(AVCodecContext *avctx)
{
    libaacEncodeCTX *s = avctx->priv_data;
    uint8_t sink_null[32768];
    int64_t pts, duration;
    /* The encode path hands the encoder the buffer capacity through this
     * argument (pkt->size after allocation), so do the same here instead of
     * passing an indeterminate value. */
    uint32_t out_bytes = sizeof(sink_null);

    av_log(avctx, AV_LOG_TRACE, "encoder flush\n");

    /* ff_af_queue_remove() takes a number of samples, not a number of queue
     * entries; remaining_samples covers everything still queued. An empty
     * queue is left alone. */
    if (s->afq.frame_count)
        ff_af_queue_remove(&s->afq, s->afq.remaining_samples, &pts, &duration);

    /* NOTE(review): sink_null is a fixed 32768-byte scratch buffer; confirm
     * it is at least the encoder's max_out_bytes. */
    aac_encode(s->encoder, NULL, 0, sink_null, &out_bytes);
}

/**
 * Close callback: release the encoder handle, if one was opened, and the
 * audio frame queue.
 *
 * @param avctx codec context; priv_data is a libaacEncodeCTX
 * @return always 0
 */
static av_cold int libaac_encode_close(AVCodecContext *avctx)
{
    libaacEncodeCTX *priv = avctx->priv_data;

    if (priv->encoder != NULL) {
        aac_encode_close(priv->encoder);
    }
    ff_af_queue_close(&priv->afq);

    return 0;
}

/* Codec-level option defaults: "b" (bitrate) is set to 128000. */
static const FFCodecDefault defaults[] = {
    { "b", "128000" },
    { NULL },
};

static const AVProfile libaac_profiles[] = {
    { AV_PROFILE_AAC_LOW,   "LC"       },
    { AV_PROFILE_AAC_HE,    "HE-AAC"   },
    { AV_PROFILE_AAC_HE_V2, "HE-AACv2" },
    { AV_PROFILE_AAC_LD,    "LD"       },
    { AV_PROFILE_AAC_ELD,   "ELD"      },
    {AV_PROFILE_UNKNOWN},
};

/* Supported sample rates in Hz, highest first; the trailing 0 ends the list. */
static const int aac_sample_rates[] = {
    96000,
    88200,
    64000,
    48000,
    44100,
    32000,
    24000,
    22050,
    16000,
    12000,
    11025,
    8000,
    0,
};

/*
 * Channel layouts accepted by the encoder (1 to 6 channels).
 *
 * The list is consumed through a bare pointer, so it has to end with a
 * zeroed layout. The array size is left to the compiler so the terminator
 * can never be squeezed out by a fixed bound.
 */
static const AVChannelLayout aac_ch_layouts[] = {
    AV_CHANNEL_LAYOUT_MONO,
    AV_CHANNEL_LAYOUT_STEREO,
    AV_CHANNEL_LAYOUT_SURROUND,
    AV_CHANNEL_LAYOUT_4POINT0,
    AV_CHANNEL_LAYOUT_5POINT0_BACK,
    AV_CHANNEL_LAYOUT_5POINT1_BACK,
    { 0 },
};

/* Codec registration entry for the libaac wrapper. */
const FFCodec ff_libaac_next_encoder = {
    /* Identity */
    .p.name         = "libaac",
    CODEC_LONG_NAME("custom libxaac-based AAC encoder"),
    .p.wrapper_name = "libaac",
    .p.type         = AVMEDIA_TYPE_AUDIO,
    .p.id           = AV_CODEC_ID_AAC,

    /* Capabilities */
    .p.capabilities = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_ENCODER_FLUSH,
    .caps_internal  = FF_CODEC_CAP_NOT_INIT_THREADSAFE,

    /* Private context, options and defaults */
    .priv_data_size = sizeof(libaacEncodeCTX),
    .p.priv_class   = &aac_enc_class,
    .defaults       = defaults,

    /* Callbacks */
    .init           = libaac_encode_init,
    FF_CODEC_ENCODE_CB(libaac_encode_frame),
    .flush          = libaac_encode_flush,
    .close          = libaac_encode_close,

    /* Supported input formats */
    CODEC_SAMPLEFMTS(AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_S16),
    CODEC_SAMPLERATES_ARRAY(aac_sample_rates),
    CODEC_CH_LAYOUTS_ARRAY(aac_ch_layouts),
    .p.profiles     = libaac_profiles,
};