libavc/encoder/ih264e_intra_modes_eval.c
Martin Storsjo 9113f5615c Use a separate field for the chroma stride
When both luma and chroma are copied to the local buffer (either
due to yuv format conversion, or due to padding), they have got
the same stride, but if chroma is copied while luma is used directly
from the input buffer, they might have different strides.

Therefore add a separate field for chroma stride.

This commit only adds the field, while it still has got the same
value as before.

Change-Id: I0dce97ad4d91cd1d9aba4b4472c6a0de45a314bc
2015-06-25 08:25:49 -07:00

2299 lines
84 KiB
C

/******************************************************************************
*
* Copyright (C) 2015 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*****************************************************************************
* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/**
*******************************************************************************
* @file
* ih264e_intra_modes_eval.c
*
* @brief
* This file contains definitions of routines that perform rate distortion
* analysis on a macroblock if they are to be coded as intra.
*
* @author
* ittiam
*
* @par List of Functions:
* - ih264e_derive_nghbr_avbl_of_mbs()
* - ih264e_derive_ngbr_avbl_of_mb_partitions()
* - ih264e_evaluate_intra16x16_modes_for_least_cost_rdoptoff()
* - ih264e_evaluate_intra8x8_modes_for_least_cost_rdoptoff()
* - ih264e_evaluate_intra4x4_modes_for_least_cost_rdoptoff()
* - ih264e_evaluate_intra4x4_modes_for_least_cost_rdopton()
* - ih264e_evaluate_chroma_intra8x8_modes_for_least_cost_rdoptoff()
* - ih264e_evaluate_intra16x16_modes()
* - ih264e_evaluate_intra4x4_modes()
* - ih264e_evaluate_intra_chroma_modes()
*
* @remarks
* None
*
*******************************************************************************
*/
/*****************************************************************************/
/* File Includes */
/*****************************************************************************/
/* System include files */
#include <stdio.h>
#include <string.h>
#include <limits.h>
#include <assert.h>
/* User include files */
#include "ih264e_config.h"
#include "ih264_typedefs.h"
#include "ih264e_defs.h"
#include "iv2.h"
#include "ive2.h"
#include "ih264_debug.h"
#include "ih264_defs.h"
#include "ih264_macros.h"
#include "ih264_intra_pred_filters.h"
#include "ih264_structs.h"
#include "ih264_common_tables.h"
#include "ih264_trans_quant_itrans_iquant.h"
#include "ih264_inter_pred_filters.h"
#include "ih264_mem_fns.h"
#include "ih264_padding.h"
#include "ih264_deblk_edge_filters.h"
#include "ih264_cabac_tables.h"
#include "ime_distortion_metrics.h"
#include "ih264e_error.h"
#include "ih264e_bitstream.h"
#include "ime_defs.h"
#include "ime_structs.h"
#include "irc_cntrl_param.h"
#include "irc_frame_info_collector.h"
#include "ih264e_rate_control.h"
#include "ih264e_cabac_structs.h"
#include "ih264e_structs.h"
#include "ih264e_intra_modes_eval.h"
#include "ih264e_globals.h"
#include "ime_platform_macros.h"
/*****************************************************************************/
/* Function Definitions */
/*****************************************************************************/
/**
******************************************************************************
*
* @brief
* derivation process for macroblock availability
*
* @par Description
* Calculates the availability of the left, top, topright and topleft macroblocks.
*
* @param[in] ps_proc
* pointer to proc context (handle)
*
* @remarks Based on section 6.4.5 in H264 spec
*
* @return none
*
******************************************************************************
*/
void ih264e_derive_nghbr_avbl_of_mbs(process_ctxt_t *ps_proc)
{
    /* position of the current mb (in mb units) and picture width in mbs */
    const WORD32 i4_mb_x = ps_proc->i4_mb_x;
    const WORD32 i4_mb_y = ps_proc->i4_mb_y;
    const WORD32 i4_wd_mbs = ps_proc->i4_wd_mbs;

    /* entry of the current mb in the raster ordered slice index map */
    const UWORD8 *pu1_slice_idx_curr = ps_proc->pu1_slice_idx + (i4_mb_y * i4_wd_mbs) + i4_mb_x;

    /* availability flags that this routine fills in */
    block_neighbors_t *ps_ngbr_avbl = ps_proc->ps_ngbr_avbl;

    /* picture boundary tests */
    const WORD32 i4_not_first_col = (i4_mb_x != 0);
    const WORD32 i4_not_first_row = (i4_mb_y != 0);
    const WORD32 i4_not_last_col = (i4_mb_x != i4_wd_mbs - 1);

    /**********************************************************************/
    /* The macroblock is marked as available, unless one of the following */
    /* conditions is true in which case the macroblock shall be marked as */
    /* not available.                                                     */
    /* 1. mbAddr < 0                                                      */
    /* 2. mbAddr > CurrMbAddr                                             */
    /* 3. the macroblock with address mbAddr belongs to a different slice */
    /*    than the macroblock with address CurrMbAddr                     */
    /**********************************************************************/

    /* NOTE: a neighbor entry is indexed only after the matching boundary   */
    /* test has passed (short circuit evaluation). No pointer to a neighbor */
    /* is formed up front, so for mbs on the first row / column no address  */
    /* in front of the current row of the map is ever computed.             */

    /* left macroblock: not on the first column and in the same slice */
    ps_ngbr_avbl->u1_mb_a =
        (i4_not_first_col &&
         (pu1_slice_idx_curr[-1] == pu1_slice_idx_curr[0])) ? 1 : 0;

    /* top macroblock: not on the first row and in the same slice */
    ps_ngbr_avbl->u1_mb_b =
        (i4_not_first_row &&
         (pu1_slice_idx_curr[-i4_wd_mbs] == pu1_slice_idx_curr[0])) ? 1 : 0;

    /* top right macroblock: not on the last column, not on the first row */
    /* and in the same slice */
    ps_ngbr_avbl->u1_mb_c =
        (i4_not_last_col && i4_not_first_row &&
         (pu1_slice_idx_curr[1 - i4_wd_mbs] == pu1_slice_idx_curr[0])) ? 1 : 0;

    /* top left macroblock: not on the first column, not on the first row */
    /* and in the same slice */
    ps_ngbr_avbl->u1_mb_d =
        (i4_not_first_col && i4_not_first_row &&
         (pu1_slice_idx_curr[-i4_wd_mbs - 1] == pu1_slice_idx_curr[0])) ? 1 : 0;
}
/**
******************************************************************************
*
* @brief
* derivation process for subblock/partition availability
*
* @par Description
* Calculates the availability of the left, top, topright and topleft subblock
* or partitions.
*
* @param[in] ps_ngbr_avbl
* pointer to the neighbor availability flags of the current macroblock
*
* @param[in] i1_pel_pos_x
* column position of the pel wrt the current block
*
* @param[in] i1_pel_pos_y
* row position of the pel wrt the current block
*
* @remarks Assumptions: before calling this function it is assumed that
* the neighbor availability of the current macroblock is already derived.
* Based on table 6-3 of H264 specification
*
* @return availability status (yes or no)
*
******************************************************************************
*/
UWORD8 ih264e_derive_ngbr_avbl_of_mb_partitions(block_neighbors_t *ps_ngbr_avbl,
                                                WORD8 i1_pel_pos_x,
                                                WORD8 i1_pel_pos_y)
{
    /**********************************************************************/
    /* A coordinate in the range 0-15 inclusive lies inside the current   */
    /* macroblock. A negative coordinate lies in the column to the left / */
    /* the row above, a coordinate >= 16 lies to the right / below.       */
    /**********************************************************************/
    const WORD32 i4_row_above = (i1_pel_pos_y < 0);
    const WORD32 i4_row_inside = (i1_pel_pos_y >= 0) && (i1_pel_pos_y < 16);

    /* column(-1): top left, left or bottom left neighbor */
    if (i1_pel_pos_x < 0)
    {
        if (i4_row_above)
        {
            return ps_ngbr_avbl->u1_mb_d;
        }
        /* bottom left (rows >= 16) is never available */
        return i4_row_inside ? ps_ngbr_avbl->u1_mb_a : 0;
    }

    /* columns 0-15: top neighbor, the current mb itself or the bottom neighbor */
    if (i1_pel_pos_x < 16)
    {
        if (i4_row_above)
        {
            return ps_ngbr_avbl->u1_mb_b;
        }
        /* A position inside the current mb is always reported as available; */
        /* the dependency on the position of the partition inside the mb is  */
        /* not handled here and has to be corrected by the caller.           */
        /* The bottom neighbor (rows >= 16) is never available.              */
        return i4_row_inside ? 1 : 0;
    }

    /* column(+16): only the top right neighbor can be available, the right */
    /* and bottom right neighbors never are */
    return i4_row_above ? ps_ngbr_avbl->u1_mb_c : 0;
}
/**
******************************************************************************
*
* @brief
* evaluate best intra 16x16 mode (rate distortion opt off)
*
* @par Description
* This function evaluates all the possible intra 16x16 modes and finds the mode
* that best represents the macro-block (least distortion) and occupies fewer
* bits in the bit-stream.
*
* @param[in] ps_proc
* pointer to process context (handle)
*
* @remarks
* Ideally the cost of encoding a macroblock is calculated as
* (distortion + lambda*rate). Where distortion is SAD/SATD,... between the
* input block and the reconstructed block and rate is the number of bits taken
* to place the macroblock in the bit-stream. In this routine the rate does not
* exactly point to the total number of bits it takes, rather it points to header
* bits necessary for encoding the macroblock. Assuming the deltaQP, cbp bits
* and residual bits fall in to texture bits the number of bits taken to encoding
* mbtype is considered as rate, we compute cost. Further we will approximate
* the distortion as the deviation b/w input and the predicted block as opposed
* to input and reconstructed block.
*
* NOTE: As per the Document JVT-O079, for intra 16x16 macroblock,
* the SAD and cost are one and the same.
*
* @return none
*
******************************************************************************
*/
void ih264e_evaluate_intra16x16_modes_for_least_cost_rdoptoff(process_ctxt_t *ps_proc)
{
    /* Codec Context */
    codec_t *ps_codec = ps_proc->ps_codec;

    /* SAD(distortion metric) of the 16x16 macroblock */
    WORD32 i4_mb_distortion = INT_MAX, i4_mb_distortion_least = INT_MAX;

    /* lambda */
    UWORD32 u4_lambda = ps_proc->u4_lambda;

    /* cost = distortion + lambda*rate */
    WORD32 i4_mb_cost = INT_MAX, i4_mb_cost_least = INT_MAX;

    /* intra mode */
    UWORD32 u4_intra_mode, u4_best_intra_16x16_mode = DC_I16x16;

    /* neighbor pels for intra prediction. Layout as filled in below:       */
    /* [0..15] left column (bottom to top), [16] top left, [17..32] top row */
    UWORD8 *pu1_ngbr_pels_i16 = ps_proc->au1_ngbr_pels;

    /* neighbor availability */
    WORD32 i4_ngbr_avbl;

    /* pointer to src macro block */
    UWORD8 *pu1_curr_mb = ps_proc->pu1_src_buf_luma;

    /* pointer to recon macro block, the neighbor pels are read from here */
    UWORD8 *pu1_ref_mb = ps_proc->pu1_rec_buf_luma;

    /* pointer to prediction macro block */
    UWORD8 *pu1_pred_mb_intra_16x16 = ps_proc->pu1_pred_mb_intra_16x16;
    UWORD8 *pu1_pred_mb_intra_16x16_plane = ps_proc->pu1_pred_mb_intra_16x16_plane;

    /* strides */
    WORD32 i4_src_strd = ps_proc->i4_src_strd;
    WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
    WORD32 i4_rec_strd = ps_proc->i4_rec_strd;

    /* pointer to neighbors left, top, topleft */
    UWORD8 *pu1_mb_a = pu1_ref_mb - 1;
    UWORD8 *pu1_mb_b = pu1_ref_mb - i4_rec_strd;
    UWORD8 *pu1_mb_d = pu1_mb_b - 1;

    /* valid intra modes map (bit n set => mode n may be evaluated) */
    UWORD32 u4_valid_intra_modes;

    /* lut for valid intra modes, indexed by i4_ngbr_avbl                    */
    /* (bit 0 : left, bit 1 : top left, bit 2 : top).                        */
    /* HORZ needs the left mb, VERT needs the top mb and PLANE needs left,   */
    /* top and top left together, i.e. PLANE (bit 3) may be set for index 7  */
    /* only. The entries for indices 2, 3 and 6 used to carry the PLANE bit  */
    /* although a neighbor is missing in those cases; they now match the     */
    /* entries of indices 0, 1 and 4 respectively.                           */
    static const UWORD8 u1_valid_intra_modes[8] = {4, 6, 4, 6, 5, 7, 5, 15};

    /* temp var */
    UWORD32 i, u4_enable_fast_sad = 0, offset = 0;

    /* init temp var */
    if (ps_proc->i4_slice_type != ISLICE)
    {
        /* Offset for MBtype, added to the mode when indexing u1_uev_codelength */
        offset = (ps_proc->i4_slice_type == PSLICE) ? 5 : 23;
        u4_enable_fast_sad = ps_proc->s_me_ctxt.u4_enable_fast_sad;
    }

    /* locating neighbors that are available for prediction */
    /* TODO : update the neighbor availability information basing on constrained intra pred information */
    /* TODO : i4_ngbr_avbl is only being used in DC mode. Can the DC mode be split in to distinct routines
     * basing on neighbors available and hence evade the computation of neighbor availability totally. */
    /* i4_ngbr_avbl = blk_a * LEFT_MB_AVAILABLE_MASK + blk_b * TOP_MB_AVAILABLE_MASK + blk_d * TOP_LEFT_MB_AVAILABLE_MASK */
    i4_ngbr_avbl = (ps_proc->ps_ngbr_avbl->u1_mb_a)
                    + (ps_proc->ps_ngbr_avbl->u1_mb_b << 2)
                    + (ps_proc->ps_ngbr_avbl->u1_mb_d << 1);
    ps_proc->i4_ngbr_avbl_16x16_mb = i4_ngbr_avbl;

    /* gather prediction pels from the neighbors, if particular set is not available
     * it is set to zero */
    /* left pels (stored bottom to top) */
    if (ps_proc->ps_ngbr_avbl->u1_mb_a)
    {
        for (i = 0; i < 16; i++)
        {
            pu1_ngbr_pels_i16[16 - 1 - i] = pu1_mb_a[i * i4_rec_strd];
        }
    }
    else
    {
        ps_codec->pf_mem_set_mul8(pu1_ngbr_pels_i16, 0, MB_SIZE);
    }

    /* top pels */
    if (ps_proc->ps_ngbr_avbl->u1_mb_b)
    {
        ps_codec->pf_mem_cpy_mul8(pu1_ngbr_pels_i16 + 16 + 1, pu1_mb_b, 16);
    }
    else
    {
        ps_codec->pf_mem_set_mul8(pu1_ngbr_pels_i16 + 16 + 1, 0, MB_SIZE);
    }

    /* topleft pel */
    if (ps_proc->ps_ngbr_avbl->u1_mb_d)
    {
        pu1_ngbr_pels_i16[16] = *pu1_mb_d;
    }
    else
    {
        pu1_ngbr_pels_i16[16] = 0;
    }

    /* set valid intra modes for evaluation */
    u4_valid_intra_modes = u1_valid_intra_modes[i4_ngbr_avbl];

    /* the fast preset never evaluates the PLANE mode */
    if (ps_codec->s_cfg.u4_enc_speed_preset == IVE_FAST)
    {
        u4_valid_intra_modes &= ~(1 << PLANE_I16x16);
    }

    /* evaluate b/w HORZ_I16x16, VERT_I16x16 & DC_I16x16 */
    ps_codec->pf_ih264e_evaluate_intra16x16_modes(pu1_curr_mb, pu1_ngbr_pels_i16, pu1_pred_mb_intra_16x16,
                                                  i4_src_strd, i4_pred_strd,
                                                  i4_ngbr_avbl, &u4_intra_mode, &i4_mb_distortion_least,
                                                  u4_valid_intra_modes);

    /* cost = distortion + lambda*rate */
    i4_mb_cost_least = i4_mb_distortion_least;

    /* PLANE is evaluated separately, into its own prediction buffer. It is */
    /* skipped for the fastest preset unless the slice is an I slice.       */
    if (((u4_valid_intra_modes >> 3) & 1) != 0 && (ps_codec->s_cfg.u4_enc_speed_preset != IVE_FASTEST ||
                    ps_proc->i4_slice_type == ISLICE))
    {
        /* intra prediction for PLANE mode */
        (ps_codec->apf_intra_pred_16_l)[PLANE_I16x16](pu1_ngbr_pels_i16, pu1_pred_mb_intra_16x16_plane, 0, i4_pred_strd, i4_ngbr_avbl);

        /* evaluate distortion between the actual blk and the estimated blk for the given mode */
        /* NOTE(review): the least cost so far is passed as 5th argument, presumably an early exit threshold - confirm */
        ps_codec->apf_compute_sad_16x16[u4_enable_fast_sad](pu1_curr_mb, pu1_pred_mb_intra_16x16_plane, i4_src_strd, i4_pred_strd, i4_mb_cost_least, &i4_mb_distortion);

        /* cost = distortion + lambda*rate */
        i4_mb_cost = i4_mb_distortion;

        /* update the least cost information if necessary */
        if (i4_mb_cost < i4_mb_distortion_least)
        {
            u4_intra_mode = PLANE_I16x16;

            i4_mb_cost_least = i4_mb_cost;
            i4_mb_distortion_least = i4_mb_distortion;
        }
    }

    u4_best_intra_16x16_mode = u4_intra_mode;

    DEBUG("%d partition cost, %d intra mode\n", i4_mb_cost_least * 32, u4_best_intra_16x16_mode);

    ps_proc->u1_l_i16_mode = u4_best_intra_16x16_mode;

    /* cost = distortion + lambda*rate, rate being the code length of the mb type */
    i4_mb_cost_least = i4_mb_distortion_least + u4_lambda * u1_uev_codelength[offset + u4_best_intra_16x16_mode];

    /* update the type of the mb if necessary */
    if (i4_mb_cost_least < ps_proc->i4_mb_cost)
    {
        ps_proc->i4_mb_cost = i4_mb_cost_least;
        ps_proc->i4_mb_distortion = i4_mb_distortion_least;
        ps_proc->u4_mb_type = I16x16;
    }

    return;
}
/**
******************************************************************************
*
* @brief
* evaluate best intra 8x8 mode (rate distortion opt off)
*
* @par Description
* This function evaluates all the possible intra 8x8 modes and finds the mode
* that best represents the macro-block (least distortion) and occupies fewer
* bits in the bit-stream.
*
* @param[in] ps_proc
* pointer to proc ctxt
*
* @remarks Ideally the cost of encoding a macroblock is calculated as
* (distortion + lambda*rate). Where distortion is SAD/SATD,... between the
* input block and the reconstructed block and rate is the number of bits taken
* to place the macroblock in the bit-stream. In this routine the rate does not
* exactly point to the total number of bits it takes, rather it points to header
* bits necessary for encoding the macroblock. Assuming the deltaQP, cbp bits
* and residual bits fall in to texture bits the number of bits taken to encoding
* mbtype is considered as rate, we compute cost. Further we will approximate
* the distortion as the deviation b/w input and the predicted block as opposed
* to input and reconstructed block.
*
* NOTE: TODO: This function needs to be tested
*
* @return none
*
******************************************************************************
*/
void ih264e_evaluate_intra8x8_modes_for_least_cost_rdoptoff(process_ctxt_t *ps_proc)
{
/* For each of the four 8x8 luma partitions of the current mb, this routine */
/* picks the intra mode with the least (SAD + mode signalling cost), stores it */
/* in ps_proc->au1_intra_luma_mb_8x8_modes[] and, if the summed cost is lower */
/* than ps_proc->i4_mb_cost, marks the mb as I8x8. */
/* Codec Context */
codec_t *ps_codec = ps_proc->ps_codec;
/* SAD(distortion metric) of an 8x8 partition; the 'least' value is set whenever a mode wins */
WORD32 i4_partition_distortion, i4_partition_distortion_least = INT_MAX, i4_total_distortion = 0;
/* lambda */
UWORD32 u4_lambda = ps_proc->u4_lambda;
/* cost = distortion + lambda*rate; the total cost of the mb starts at one lambda */
WORD32 i4_partition_cost, i4_partition_cost_least, i4_total_cost = u4_lambda;
/* cost of signalling the mode: one lambda if it equals the estimated mode, four lambda otherwise */
UWORD32 u4_cost_one_bit = u4_lambda, u4_cost_four_bits = 4 * u4_lambda;
/* intra mode */
UWORD32 u4_intra_mode, u4_best_intra_8x8_mode = DC_I8x8, u4_estimated_intra_8x8_mode;
/* neighbor pels for intra prediction */
UWORD8 *pu1_ngbr_pels_i8 = ps_proc->au1_ngbr_pels;
/* pointer to curr partition */
UWORD8 *pu1_mb_curr;
/* pointer to prediction macro block */
UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb;
/* strides */
WORD32 i4_src_strd = ps_proc->i4_src_strd;
WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
/* neighbors left, top, top left */
UWORD8 *pu1_mb_a;
UWORD8 *pu1_mb_b;
UWORD8 *pu1_mb_d;
/* neighbor availability */
WORD32 i4_ngbr_avbl;
block_neighbors_t s_ngbr_avbl;
/* temp vars: partition index and its top left pel position inside the mb */
UWORD32 b8, u4_pix_x, u4_pix_y;
/* ngbr mb syntax information; the top row mode buffer is addressed with 16 entries per mb */
UWORD8 *pu1_top_mb_intra_modes = ps_proc->pu1_top_mb_intra_modes + (ps_proc->i4_mb_x << 4);
mb_info_t *ps_top_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
/* valid intra modes map (bit n set => mode n is evaluated) */
UWORD32 u4_valid_intra_modes;
for(b8 = 0; b8 < 4; b8++)
{
/* partitions are visited in raster order: b8 bit 0 selects the column, bit 1 the row */
u4_pix_x = (b8 & 0x01) << 3;
u4_pix_y = (b8 >> 1) << 3;
pu1_mb_curr = ps_proc->pu1_src_buf_luma + u4_pix_x + (u4_pix_y * i4_src_strd);
/* when rdopt is off, we use the input as reference for constructing prediction buffer */
/* as opposed to using the recon pels. (open loop intra prediction) */
pu1_mb_a = pu1_mb_curr - 1; /* pointer to left neighbor pels */
pu1_mb_b = pu1_mb_curr - i4_src_strd; /* pointer to top neighbor pels */
pu1_mb_d = pu1_mb_b - 1; /* pointer to top left neighbor pel */
/* locating neighbors that are available for prediction */
/* TODO : update the neighbor availability information basing on constrained intra pred information */
/* TODO : i4_ngbr_avbl is only being used in DC mode. Can the DC mode be split in to distinct routines */
/* basing on neighbors available and hence evade the computation of neighbor availability totally. */
/* NOTE: the unsigned positions are converted to WORD8 by the callee's prototype, */
/* so (0 - 1) is expected to arrive as -1 */
s_ngbr_avbl.u1_mb_a = ih264e_derive_ngbr_avbl_of_mb_partitions(ps_proc->ps_ngbr_avbl, u4_pix_x - 1, u4_pix_y); /* xD = -1, yD = 0 */
s_ngbr_avbl.u1_mb_b = ih264e_derive_ngbr_avbl_of_mb_partitions(ps_proc->ps_ngbr_avbl, u4_pix_x, u4_pix_y - 1); /* xD = 0, yD = -1 */
s_ngbr_avbl.u1_mb_c = ih264e_derive_ngbr_avbl_of_mb_partitions(ps_proc->ps_ngbr_avbl, u4_pix_x + 8, u4_pix_y - 1); /* xD = BLK_8x8_SIZE, yD = -1 */
s_ngbr_avbl.u1_mb_d = ih264e_derive_ngbr_avbl_of_mb_partitions(ps_proc->ps_ngbr_avbl, u4_pix_x - 1, u4_pix_y - 1); /* xD = -1, yD = -1 */
/* i4_ngbr_avbl = blk_a * LEFT_MB_AVAILABLE_MASK + blk_b * TOP_MB_AVAILABLE_MASK + blk_c * TOP_RIGHT_MB_AVAILABLE_MASK + blk_d * TOP_LEFT_MB_AVAILABLE_MASK */
/* NOTE(review): bit 4 repeats the left availability; presumably this is the flag the */
/* reference filtering routine expects for the lower part of the left column - confirm */
i4_ngbr_avbl = (s_ngbr_avbl.u1_mb_a) + (s_ngbr_avbl.u1_mb_d << 1) + (s_ngbr_avbl.u1_mb_b << 2) + (s_ngbr_avbl.u1_mb_c << 3) +
(s_ngbr_avbl.u1_mb_a << 4);
/* if top partition is available and top right is not available for intra prediction, then */
/* padd top right samples using top sample and make top right also available */
/* i4_ngbr_avbl = (s_ngbr_avbl.u1_mb_a) + (s_ngbr_avbl.u1_mb_d << 1) + (s_ngbr_avbl.u1_mb_b << 2) + ((s_ngbr_avbl.u1_mb_b | s_ngbr_avbl.u1_mb_c) << 3); */
ps_proc->ai4_neighbor_avail_8x8_subblks[b8] = i4_ngbr_avbl;
/* build the neighbor pel array used by the 8x8 predictors */
ih264_intra_pred_luma_8x8_mode_ref_filtering(pu1_mb_a, pu1_mb_b, pu1_mb_d, pu1_ngbr_pels_i8,
i4_src_strd, i4_ngbr_avbl);
i4_partition_cost_least = INT_MAX;
/* set valid intra modes for evaluation: start with all nine modes and drop */
/* those whose neighbors are missing */
/* NOTE(review): the I4x4 mode constants are used as bit positions here while the */
/* evaluation loop starts at VERT_I8x8; assumes both enums share values - confirm */
u4_valid_intra_modes = 0x1ff;
if (!s_ngbr_avbl.u1_mb_b)
{
u4_valid_intra_modes &= ~(1 << VERT_I4x4);
u4_valid_intra_modes &= ~(1 << DIAG_DL_I4x4);
u4_valid_intra_modes &= ~(1 << VERT_L_I4x4);
}
if (!s_ngbr_avbl.u1_mb_a)
{
u4_valid_intra_modes &= ~(1 << HORZ_I4x4);
u4_valid_intra_modes &= ~(1 << HORZ_U_I4x4);
}
if (!s_ngbr_avbl.u1_mb_a || !s_ngbr_avbl.u1_mb_b || !s_ngbr_avbl.u1_mb_d)
{
u4_valid_intra_modes &= ~(1 << DIAG_DR_I4x4);
u4_valid_intra_modes &= ~(1 << VERT_R_I4x4);
u4_valid_intra_modes &= ~(1 << HORZ_D_I4x4);
}
/* estimate the intra 8x8 mode for the current partition (for evaluating cost) */
/* the estimate is DC when the left or top neighbor is missing, otherwise the */
/* smaller of the left and top neighbor modes */
if (!s_ngbr_avbl.u1_mb_a || !s_ngbr_avbl.u1_mb_b)
{
u4_estimated_intra_8x8_mode = DC_I8x8;
}
else
{
/* neighbor modes default to DC when the neighbor mb is neither I8x8 nor I4x4 */
UWORD32 u4_left_intra_8x8_mode = DC_I8x8;
UWORD32 u4_top_intra_8x8_mode = DC_I8x8;
if (u4_pix_x == 0)
{
/* left neighbor lies in the left mb */
if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I8x8)
{
u4_left_intra_8x8_mode = ps_proc->au1_left_mb_intra_modes[b8+1];
}
else if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I4x4)
{
u4_left_intra_8x8_mode = ps_proc->au1_left_mb_intra_modes[(b8+1)*4+2];
}
}
else
{
/* left neighbor is the previous partition of the current mb */
u4_left_intra_8x8_mode = ps_proc->au1_intra_luma_mb_8x8_modes[b8-1];
}
if (u4_pix_y == 0)
{
/* top neighbor lies in the top mb */
if (ps_top_mb_syn_ele->u2_mb_type == I8x8)
{
u4_top_intra_8x8_mode = pu1_top_mb_intra_modes[b8+2];
}
else if (ps_top_mb_syn_ele->u2_mb_type == I4x4)
{
u4_top_intra_8x8_mode = pu1_top_mb_intra_modes[(b8+2)*4+2];
}
}
else
{
/* top neighbor is the partition two positions back in the current mb */
u4_top_intra_8x8_mode = ps_proc->au1_intra_luma_mb_8x8_modes[b8-2];
}
u4_estimated_intra_8x8_mode = MIN(u4_left_intra_8x8_mode, u4_top_intra_8x8_mode);
}
/* perform intra mode 8x8 evaluation */
/* the valid modes map is consumed one bit per iteration, bit 0 always */
/* belonging to the current u4_intra_mode */
for (u4_intra_mode = VERT_I8x8; u4_valid_intra_modes != 0; u4_intra_mode++, u4_valid_intra_modes >>= 1)
{
if ( (u4_valid_intra_modes & 1) == 0)
continue;
/* intra prediction */
(ps_codec->apf_intra_pred_8_l)[u4_intra_mode](pu1_ngbr_pels_i8, pu1_pred_mb, 0, i4_pred_strd, i4_ngbr_avbl);
/* evaluate distortion between the actual blk and the estimated blk for the given mode */
/* NOTE(review): the least cost so far is passed as 5th argument, presumably an early exit threshold - confirm */
ime_compute_sad_8x8(pu1_mb_curr, pu1_pred_mb, i4_src_strd, i4_pred_strd, i4_partition_cost_least, &i4_partition_distortion);
i4_partition_cost = i4_partition_distortion + ((u4_estimated_intra_8x8_mode == u4_intra_mode)?u4_cost_one_bit:u4_cost_four_bits);
/* update the least cost information if necessary */
if (i4_partition_cost < i4_partition_cost_least)
{
i4_partition_cost_least = i4_partition_cost;
i4_partition_distortion_least = i4_partition_distortion;
u4_best_intra_8x8_mode = u4_intra_mode;
}
}
/* macroblock cost and distortion are the sums over the four partitions */
i4_total_cost += i4_partition_cost_least;
i4_total_distortion += i4_partition_distortion_least;
/* mb partition mode */
ps_proc->au1_intra_luma_mb_8x8_modes[b8] = u4_best_intra_8x8_mode;
}
/* update the type of the mb if necessary */
if (i4_total_cost < ps_proc->i4_mb_cost)
{
ps_proc->i4_mb_cost = i4_total_cost;
ps_proc->i4_mb_distortion = i4_total_distortion;
ps_proc->u4_mb_type = I8x8;
}
return ;
}
/**
******************************************************************************
*
* @brief
* evaluate best intra 4x4 mode (rate distortion opt off)
*
* @par Description
* This function evaluates all the possible intra 4x4 modes and finds the mode
* that best represents the macro-block (least distortion) and occupies fewer
* bits in the bit-stream.
*
* @param[in] ps_proc
* pointer to proc ctxt
*
* @remarks
* Ideally the cost of encoding a macroblock is calculated as
* (distortion + lambda*rate). Where distortion is SAD/SATD,... between the
* input block and the reconstructed block and rate is the number of bits taken
* to place the macroblock in the bit-stream. In this routine the rate does not
* exactly point to the total number of bits it takes, rather it points to header
* bits necessary for encoding the macroblock. Assuming the deltaQP, cbp bits
* and residual bits fall in to texture bits the number of bits taken to encoding
* mbtype is considered as rate, we compute cost. Further we will approximate
* the distortion as the deviation b/w input and the predicted block as opposed
* to input and reconstructed block.
*
* NOTE: As per the Document JVT-O079, for the whole intra 4x4 macroblock,
* 24*lambda is added to the SAD before comparison with the best SAD for
* inter prediction. This is an empirical value to prevent using too many intra
* blocks.
*
* @return none
*
******************************************************************************
*/
void ih264e_evaluate_intra4x4_modes_for_least_cost_rdoptoff(process_ctxt_t *ps_proc)
{
/* For each of the sixteen 4x4 luma sub blocks of the current mb, this routine */
/* gathers the neighbor pels from the source picture, estimates the mode from the */
/* left and top neighbors, lets the codec's 4x4 evaluator pick the least cost mode */
/* and stores the result in ps_proc->au1_intra_luma_mb_4x4_modes[]. If the summed */
/* cost is lower than ps_proc->i4_mb_cost, the mb is marked as I4x4. */
/* Codec Context */
codec_t *ps_codec = ps_proc->ps_codec;
/* SAD(distortion metric) of an 4x4 block */
WORD32 i4_partition_distortion_least = INT_MAX, i4_total_distortion = 0;
/* lambda */
UWORD32 u4_lambda = ps_proc->u4_lambda;
/* cost = distortion + lambda*rate; the total cost of the mb starts at (24 + 1) lambda */
WORD32 i4_partition_cost_least, i4_total_cost = (24 + 1) * u4_lambda;
/* cost of signalling the mode: one lambda if it equals the estimated mode, four lambda otherwise */
UWORD32 u4_cost_one_bit = u4_lambda, u4_cost_four_bits = 4 * u4_lambda;
/* intra mode */
UWORD32 u4_best_intra_4x4_mode = DC_I4x4, u4_estimated_intra_4x4_mode;
/* neighbor pels for intra prediction. Layout as filled in below: */
/* [0..3] left column (bottom to top), [4] top left, [5..8] top row, [9..12] top right */
UWORD8 *pu1_ngbr_pels_i4 = ps_proc->au1_ngbr_pels;
/* pointer to curr partition */
UWORD8 *pu1_mb_curr;
/* pointer to prediction macro block */
UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb;
/* strides */
WORD32 i4_src_strd = ps_proc->i4_src_strd;
WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
/* neighbors left, top, top right, top left */
UWORD8 *pu1_mb_a;
UWORD8 *pu1_mb_b;
UWORD8 *pu1_mb_c;
UWORD8 *pu1_mb_d;
/* neighbor availability */
WORD32 i4_ngbr_avbl;
block_neighbors_t s_ngbr_avbl;
/* temp vars */
UWORD32 i, b8, b4, u4_blk_x, u4_blk_y, u4_pix_x, u4_pix_y;
/* maps the raster index of a 4x4 sub block inside the mb to the index */
/* (b8 << 2) + b4 under which its mode is stored */
const UWORD8 u1_scan_order[16] = {0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15};
/* ngbr sub mb modes; the top row mode buffer is addressed with 16 entries per mb */
UWORD8 *pu1_top_mb_intra_modes = ps_proc->pu1_top_mb_intra_modes + (ps_proc->i4_mb_x << 4);
mb_info_t *ps_top_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
/* valid intra modes map (bit n set => mode n may be evaluated) */
UWORD32 u4_valid_intra_modes;
/* lut for valid intra modes, indexed by left (bit 0), top left (bit 1), top (bit 2) availability */
UWORD16 u2_valid_modes[8] = {4, 262, 4, 262, 141, 399, 141, 511};
/* mb level availability selects a table row holding the availability of all 16 sub blocks */
i4_ngbr_avbl = (ps_proc->ps_ngbr_avbl->u1_mb_a) + (ps_proc->ps_ngbr_avbl->u1_mb_d << 1) + (ps_proc->ps_ngbr_avbl->u1_mb_b << 2) + (ps_proc->ps_ngbr_avbl->u1_mb_c << 3);
memcpy(ps_proc->au1_ngbr_avbl_4x4_subblks, gau1_ih264_4x4_ngbr_avbl[i4_ngbr_avbl], 16);
for (b8 = 0; b8 < 4; b8++)
{
/* top left pel of the 8x8 block inside the mb */
u4_blk_x = (b8 & 0x01) << 3;
u4_blk_y = (b8 >> 1) << 3;
for (b4 = 0; b4 < 4; b4++)
{
/* top left pel of the 4x4 sub block inside the mb */
u4_pix_x = u4_blk_x + ((b4 & 0x01) << 2);
u4_pix_y = u4_blk_y + ((b4 >> 1) << 2);
pu1_mb_curr = ps_proc->pu1_src_buf_luma + u4_pix_x + (u4_pix_y * i4_src_strd);
/* when rdopt is off, we use the input as reference for constructing prediction buffer */
/* as opposed to using the recon pels. (open loop intra prediction) */
pu1_mb_a = pu1_mb_curr - 1; /* pointer to left neighbor pels */
pu1_mb_b = pu1_mb_curr - i4_src_strd; /* pointer to top neighbor pels */
pu1_mb_c = pu1_mb_b + 4; /* pointer to top right neighbor pels */
pu1_mb_d = pu1_mb_b - 1; /* pointer to top left neighbor pel */
/* locating neighbors that are available for prediction */
/* TODO : update the neighbor availability information basing on constrained intra pred information */
/* TODO : i4_ngbr_avbl is only being used in DC mode. Can the DC mode be split in to distinct routines */
/* basing on neighbors available and hence evade the computation of neighbor availability totally. */
i4_ngbr_avbl = ps_proc->au1_ngbr_avbl_4x4_subblks[(b8 << 2) + b4];
s_ngbr_avbl.u1_mb_a = (i4_ngbr_avbl & 0x1);
s_ngbr_avbl.u1_mb_d = (i4_ngbr_avbl & 0x2) >> 1;
s_ngbr_avbl.u1_mb_b = (i4_ngbr_avbl & 0x4) >> 2;
s_ngbr_avbl.u1_mb_c = (i4_ngbr_avbl & 0x8) >> 3;
/* set valid intra modes for evaluation; the top right bit is masked out of the lut index */
u4_valid_intra_modes = u2_valid_modes[i4_ngbr_avbl & 0x7];
/* if top partition is available and top right is not available for intra prediction, then */
/* padd top right samples using top sample and make top right also available */
/* i4_ngbr_avbl = (s_ngbr_avbl.u1_mb_a) + (s_ngbr_avbl.u1_mb_d << 1) + (s_ngbr_avbl.u1_mb_b << 2) + ((s_ngbr_avbl.u1_mb_b | s_ngbr_avbl.u1_mb_c) << 3); */
/* gather prediction pels from the neighbors */
/* left pels (stored bottom to top), zero when not available */
if (s_ngbr_avbl.u1_mb_a)
{
for(i = 0; i < 4; i++)
pu1_ngbr_pels_i4[4 - 1 -i] = pu1_mb_a[i * i4_src_strd];
}
else
{
memset(pu1_ngbr_pels_i4, 0, 4);
}
/* top pels, zero when not available */
if (s_ngbr_avbl.u1_mb_b)
{
memcpy(pu1_ngbr_pels_i4 + 4 + 1, pu1_mb_b, 4);
}
else
{
memset(pu1_ngbr_pels_i4 + 5, 0, 4);
}
/* top left pel, zero when not available */
if (s_ngbr_avbl.u1_mb_d)
pu1_ngbr_pels_i4[4] = *pu1_mb_d;
else
pu1_ngbr_pels_i4[4] = 0;
/* top right pels: copied when available, else padded with the last top pel when */
/* the top is available; left untouched when neither is available */
if (s_ngbr_avbl.u1_mb_c)
{
memcpy(pu1_ngbr_pels_i4 + 8 + 1, pu1_mb_c, 4);
}
else if (s_ngbr_avbl.u1_mb_b)
{
memset(pu1_ngbr_pels_i4 + 8 + 1, pu1_ngbr_pels_i4[8], 4);
/* only the local flag is updated, i4_ngbr_avbl passed on below is not changed */
s_ngbr_avbl.u1_mb_c = s_ngbr_avbl.u1_mb_b;
}
i4_partition_cost_least = INT_MAX;
/* predict the intra 4x4 mode for the current partition (for evaluating cost) */
/* the estimate is DC when the left or top neighbor is missing, otherwise the */
/* smaller of the left and top neighbor modes */
if (!s_ngbr_avbl.u1_mb_a || !s_ngbr_avbl.u1_mb_b)
{
u4_estimated_intra_4x4_mode = DC_I4x4;
}
else
{
/* neighbor modes default to DC when the neighbor mb is neither I4x4 nor I8x8 */
UWORD32 u4_left_intra_4x4_mode = DC_I4x4;
UWORD32 u4_top_intra_4x4_mode = DC_I4x4;
if (u4_pix_x == 0)
{
/* left neighbor lies in the left mb: raster index 3 + u4_pix_y is the */
/* sub block of the last column on the same row */
if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I4x4)
{
u4_left_intra_4x4_mode = ps_proc->au1_left_mb_intra_modes[u1_scan_order[3 + u4_pix_y]];
}
else if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I8x8)
{
u4_left_intra_4x4_mode = ps_proc->au1_left_mb_intra_modes[b8 + 1];
}
}
else
{
/* left neighbor lies in the current mb: raster index minus one */
u4_left_intra_4x4_mode = ps_proc->au1_intra_luma_mb_4x4_modes[u1_scan_order[(u4_pix_x >> 2) + u4_pix_y - 1]];
}
if (u4_pix_y == 0)
{
/* top neighbor lies in the top mb: raster index 12 + column is the */
/* sub block of the last row in the same column */
if (ps_top_mb_syn_ele->u2_mb_type == I4x4)
{
u4_top_intra_4x4_mode = pu1_top_mb_intra_modes[u1_scan_order[12 + (u4_pix_x >> 2)]];
}
else if (ps_top_mb_syn_ele->u2_mb_type == I8x8)
{
u4_top_intra_4x4_mode = pu1_top_mb_intra_modes[b8 + 2];
}
}
else
{
/* top neighbor lies in the current mb: raster index minus four (one row up) */
u4_top_intra_4x4_mode = ps_proc->au1_intra_luma_mb_4x4_modes[u1_scan_order[(u4_pix_x >> 2) + u4_pix_y - 4]];
}
u4_estimated_intra_4x4_mode = MIN(u4_left_intra_4x4_mode, u4_top_intra_4x4_mode);
}
ps_proc->au1_predicted_intra_luma_mb_4x4_modes[(b8 << 2) + b4] = u4_estimated_intra_4x4_mode;
/* mode evaluation and prediction */
ps_codec->pf_ih264e_evaluate_intra_4x4_modes(pu1_mb_curr,
pu1_ngbr_pels_i4,
pu1_pred_mb, i4_src_strd,
i4_pred_strd, i4_ngbr_avbl,
&u4_best_intra_4x4_mode,
&i4_partition_cost_least,
u4_valid_intra_modes,
u4_lambda,
u4_estimated_intra_4x4_mode);
/* distortion = cost minus the mode signalling cost */
/* NOTE(review): assumes the evaluator added the same one bit / four bits cost - confirm */
i4_partition_distortion_least = i4_partition_cost_least - ((u4_estimated_intra_4x4_mode == u4_best_intra_4x4_mode) ? u4_cost_one_bit : u4_cost_four_bits);
DEBUG("%d partition cost, %d intra mode\n", i4_partition_cost_least, u4_best_intra_4x4_mode);
/* macroblock cost and distortion are the sums over the sixteen sub blocks */
i4_total_distortion += i4_partition_distortion_least;
i4_total_cost += i4_partition_cost_least;
/* mb partition mode */
ps_proc->au1_intra_luma_mb_4x4_modes[(b8 << 2) + b4] = u4_best_intra_4x4_mode;
}
}
/* update the type of the mb if necessary */
if (i4_total_cost < ps_proc->i4_mb_cost)
{
ps_proc->i4_mb_cost = i4_total_cost;
ps_proc->i4_mb_distortion = i4_total_distortion;
ps_proc->u4_mb_type = I4x4;
}
return ;
}
/**
******************************************************************************
*
* @brief evaluate best intra 4x4 mode (rate distortion opt on)
*
* @par Description
* This function evaluates all the possible intra 4x4 modes and finds the mode
* that best represents the macro-block (least distortion) and occupies fewer
* bits in the bit-stream.
*
* The sixteen 4x4 luma partitions are visited as four 8x8 quadrants of four
* partitions each. For every partition the routine gathers the neighbouring
* pels, derives the predicted intra mode, asks the codec's 4x4 mode evaluator
* for the least-cost mode, and then runs the forward transform/quantization
* and the inverse quantization/transform/reconstruction function pointers on
* that partition before moving on to the next one.
*
* @param[in,out] ps_proc
* pointer to proc ctxt. The routine writes au1_ngbr_avbl_4x4_subblks,
* au1_predicted_intra_luma_mb_4x4_modes and au1_intra_luma_mb_4x4_modes, and,
* only when the accumulated intra 4x4 cost is lower than i4_mb_cost, also
* i4_mb_cost, i4_mb_distortion and u4_mb_type (set to I4x4).
*
* @remarks
* Ideally the cost of encoding a macroblock is calculated as
* (distortion + lambda*rate), where distortion is SAD/SATD,... between the
* input block and the reconstructed block and rate is the number of bits taken
* to place the macroblock in the bit-stream. In this routine the rate does not
* exactly point to the total number of bits it takes, rather it points to the
* header bits necessary for encoding the macroblock. Assuming the deltaQP, cbp
* bits and residual bits fall into texture bits, the number of bits taken to
* encode the mb type is considered as rate and the cost is computed from it.
* Further, the distortion is approximated as the deviation between the input
* and the predicted block as opposed to the input and the reconstructed block.
*
* NOTE: As per the Document JVT-O079, for the whole intra 4x4 macroblock,
* 24*lambda is added to the SAD before comparison with the best SAD for
* inter prediction. This is an empirical value to prevent using too many intra
* blocks.
*
* @return none
*
******************************************************************************
*/
void ih264e_evaluate_intra4x4_modes_for_least_cost_rdopton(process_ctxt_t *ps_proc)
{
/* Codec Context */
codec_t *ps_codec = ps_proc->ps_codec;
/* SAD(distortion metric) of an 4x4 block */
WORD32 i4_partition_distortion_least = INT_MAX, i4_total_distortion = 0;
/* lambda */
UWORD32 u4_lambda = ps_proc->u4_lambda;
/* cost = distortion + lambda*rate */
/* the running total starts at (24 + 1) * lambda; the 24 * lambda part is the */
/* bias described in the NOTE above (NOTE(review): the extra 1 * lambda is */
/* presumably the mb type bit - confirm) */
WORD32 i4_partition_cost_least, i4_total_cost = (24 + 1) * u4_lambda;
/* cost due to mbtype */
UWORD32 u4_cost_one_bit = u4_lambda, u4_cost_four_bits = 4 * u4_lambda;
/* intra mode */
UWORD32 u4_best_intra_4x4_mode = DC_I4x4, u4_estimated_intra_4x4_mode;
/* neighbor pels for intra prediction */
UWORD8 *pu1_ngbr_pels_i4 = ps_proc->au1_ngbr_pels;
/* pointer to curr partition */
UWORD8 *pu1_mb_curr;
UWORD8 *pu1_mb_ref_left, *pu1_mb_ref_top;
UWORD8 *pu1_ref_mb_intra_4x4;
/* pointer to residual macro block */
WORD16 *pi2_res_mb = ps_proc->pi2_res_buf_intra_4x4;
/* pointer to prediction macro block */
UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb;
/* strides */
WORD32 i4_src_strd = ps_proc->i4_src_strd;
WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
WORD32 i4_ref_strd_left, i4_ref_strd_top;
/* neighbors left, top, top right, top left */
UWORD8 *pu1_mb_a;
UWORD8 *pu1_mb_b;
UWORD8 *pu1_mb_c;
UWORD8 *pu1_mb_d;
/* number of non zero coeffs*/
UWORD8 *pu1_nnz = (UWORD8 *)ps_proc->au4_nnz_intra_4x4;
/* quantization parameters */
quant_params_t *ps_qp_params = ps_proc->ps_qp_params[0];
/* neighbor availability */
WORD32 i4_ngbr_avbl;
block_neighbors_t s_ngbr_avbl;
/* temp vars */
UWORD32 i, b8, b4, u4_blk_x, u4_blk_y, u4_pix_x, u4_pix_y;
/* scan order inside 4x4 block */
/* indexed with a raster position (row * 4 + column) of a 4x4 partition, */
/* yields the (b8 << 2) + b4 index used for the per-partition mode arrays */
const UWORD8 u1_scan_order[16] = {0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15};
/* ngbr sub mb modes */
UWORD8 *pu1_top_mb_intra_modes = ps_proc->pu1_top_mb_intra_modes + (ps_proc->i4_mb_x << 4);
mb_info_t *ps_top_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
/* valid intra modes map */
/* bit mask of modes to evaluate, looked up with the low three availability */
/* bits (left = bit 0, top left = bit 1, top = bit 2) */
UWORD32 u4_valid_intra_modes;
UWORD16 u2_valid_modes[8] = {4, 262, 4, 262, 141, 399, 141, 511};
/* Dummy variable for 4x4 trans function */
WORD16 i2_dc_dummy;
/* compute ngbr availability for sub blks */
/* bit 0 = left (a), bit 1 = top left (d), bit 2 = top (b), bit 3 = top right (c) */
i4_ngbr_avbl = (ps_proc->ps_ngbr_avbl->u1_mb_a) + (ps_proc->ps_ngbr_avbl->u1_mb_d << 1) + (ps_proc->ps_ngbr_avbl->u1_mb_b << 2) + (ps_proc->ps_ngbr_avbl->u1_mb_c << 3);
memcpy(ps_proc->au1_ngbr_avbl_4x4_subblks, gau1_ih264_4x4_ngbr_avbl[i4_ngbr_avbl], 16);
/* b8 walks the four 8x8 quadrants, b4 the four 4x4 partitions of a quadrant; */
/* the nnz and residual pointers advance once per partition */
for(b8 = 0; b8 < 4; b8++)
{
u4_blk_x = (b8 & 0x01) << 3;
u4_blk_y = (b8 >> 1) << 3;
for(b4 = 0; b4 < 4; b4++, pu1_nnz++, pi2_res_mb += MB_SIZE)
{
/* pel offset of the partition's top left corner inside the macroblock */
u4_pix_x = u4_blk_x + ((b4 & 0x01) << 2);
u4_pix_y = u4_blk_y + ((b4 >> 1) << 2);
pu1_ref_mb_intra_4x4 = ps_proc->pu1_ref_mb_intra_4x4 + u4_pix_x + (u4_pix_y * i4_pred_strd);
pu1_mb_curr = ps_proc->pu1_src_buf_luma + u4_pix_x + (u4_pix_y * i4_src_strd);
/* partitions in the first column take their left neighbors from the */
/* recon frame buffer; all others take them from pu1_ref_mb_intra_4x4, */
/* the buffer handed to pf_iquant_itrans_recon_4x4 at the end of every */
/* iteration (presumably holding the recon of earlier partitions) */
if (u4_pix_x == 0)
{
i4_ref_strd_left = ps_proc->i4_rec_strd;
pu1_mb_ref_left = ps_proc->pu1_rec_buf_luma + u4_pix_x + (u4_pix_y * i4_ref_strd_left);
}
else
{
i4_ref_strd_left = i4_pred_strd;
pu1_mb_ref_left = pu1_ref_mb_intra_4x4;
}
/* same selection for the top neighbors of partitions in the first row */
if (u4_pix_y == 0)
{
i4_ref_strd_top = ps_proc->i4_rec_strd;
pu1_mb_ref_top = ps_proc->pu1_rec_buf_luma + u4_pix_x + (u4_pix_y * i4_ref_strd_top);
}
else
{
i4_ref_strd_top = i4_pred_strd;
pu1_mb_ref_top = pu1_ref_mb_intra_4x4;
}
pu1_mb_a = pu1_mb_ref_left - 1; /* pointer to left macro block */
pu1_mb_b = pu1_mb_ref_top - i4_ref_strd_top; /* pointer to top macro block */
pu1_mb_c = pu1_mb_b + 4; /* pointer to top right macro block */
/* the top left pel is taken from the same buffer as the top row when */
/* the partition is in the first row, else from the left neighbor's buffer */
if (u4_pix_y == 0)
pu1_mb_d = pu1_mb_b - 1;
else
pu1_mb_d = pu1_mb_a - i4_ref_strd_left; /* pointer to top left macro block */
/* locating neighbors that are available for prediction */
/* TODO : update the neighbor availability information basing on constrained intra pred information */
/* TODO : i4_ngbr_avbl is only being used in DC mode. Can the DC mode be split in to distinct routines */
/* basing on neighbors available and hence evade the computation of neighbor availability totally. */
i4_ngbr_avbl = ps_proc->au1_ngbr_avbl_4x4_subblks[(b8 << 2) + b4];
s_ngbr_avbl.u1_mb_a = (i4_ngbr_avbl & 0x1);
s_ngbr_avbl.u1_mb_d = (i4_ngbr_avbl & 0x2) >> 1;
s_ngbr_avbl.u1_mb_b = (i4_ngbr_avbl & 0x4) >> 2;
s_ngbr_avbl.u1_mb_c = (i4_ngbr_avbl & 0x8) >> 3;
/* set valid intra modes for evaluation */
u4_valid_intra_modes = u2_valid_modes[i4_ngbr_avbl & 0x7];
/* if top partition is available and top right is not available for intra prediction, then */
/* padd top right samples using top sample and make top right also available */
/* i4_ngbr_avbl = (s_ngbr_avbl.u1_mb_a) + (s_ngbr_avbl.u1_mb_d << 1) + (s_ngbr_avbl.u1_mb_b << 2) + ((s_ngbr_avbl.u1_mb_b | s_ngbr_avbl.u1_mb_c) << 3); */
/* gather prediction pels from the neighbors */
/* layout written here: [0..3] left column stored bottom to top, */
/* [4] top left, [5..8] top row, [9..12] top right row */
if (s_ngbr_avbl.u1_mb_a)
{
for(i = 0; i < 4; i++)
pu1_ngbr_pels_i4[4 - 1 -i] = pu1_mb_a[i * i4_ref_strd_left];
}
else
{
memset(pu1_ngbr_pels_i4,0,4);
}
if(s_ngbr_avbl.u1_mb_b)
{
memcpy(pu1_ngbr_pels_i4 + 4 + 1, pu1_mb_b, 4);
}
else
{
memset(pu1_ngbr_pels_i4 + 4 + 1, 0, 4);
}
if (s_ngbr_avbl.u1_mb_d)
pu1_ngbr_pels_i4[4] = *pu1_mb_d;
else
pu1_ngbr_pels_i4[4] = 0;
if (s_ngbr_avbl.u1_mb_c)
{
memcpy(pu1_ngbr_pels_i4 + 8 + 1, pu1_mb_c, 4);
}
else if (s_ngbr_avbl.u1_mb_b)
{
/* top right missing but top present: replicate the last top pel */
/* (when neither is available entries [9..12] are left untouched) */
memset(pu1_ngbr_pels_i4 + 8 + 1, pu1_ngbr_pels_i4[8], 4);
s_ngbr_avbl.u1_mb_c = s_ngbr_avbl.u1_mb_b;
}
i4_partition_cost_least = INT_MAX;
/* predict the intra 4x4 mode for the current partition (for evaluating cost) */
if (!s_ngbr_avbl.u1_mb_a || !s_ngbr_avbl.u1_mb_b)
{
u4_estimated_intra_4x4_mode = DC_I4x4;
}
else
{
/* both default to DC; a neighboring MB that is neither I4x4 nor I8x8 */
/* leaves the default in place */
UWORD32 u4_left_intra_4x4_mode = DC_I4x4;
UWORD32 u4_top_intra_4x4_mode = DC_I4x4;
if (u4_pix_x == 0)
{
/* left neighbor lies in the left macroblock */
if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I4x4)
{
u4_left_intra_4x4_mode = ps_proc->au1_left_mb_intra_modes[u1_scan_order[3 + u4_pix_y]];
}
else if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I8x8)
{
u4_left_intra_4x4_mode = ps_proc->au1_left_mb_intra_modes[b8 + 1];
}
}
else
{
/* left neighbor is a partition of this macroblock already decided */
u4_left_intra_4x4_mode = ps_proc->au1_intra_luma_mb_4x4_modes[u1_scan_order[(u4_pix_x >> 2) + u4_pix_y - 1]];
}
if (u4_pix_y == 0)
{
/* top neighbor lies in the macroblock above */
if (ps_top_mb_syn_ele->u2_mb_type == I4x4)
{
u4_top_intra_4x4_mode = pu1_top_mb_intra_modes[u1_scan_order[12 + (u4_pix_x >> 2)]];
}
else if (ps_top_mb_syn_ele->u2_mb_type == I8x8)
{
u4_top_intra_4x4_mode = pu1_top_mb_intra_modes[b8 + 2];
}
}
else
{
/* top neighbor is a partition of this macroblock already decided */
u4_top_intra_4x4_mode = ps_proc->au1_intra_luma_mb_4x4_modes[u1_scan_order[(u4_pix_x >> 2) + u4_pix_y - 4]];
}
u4_estimated_intra_4x4_mode = MIN(u4_left_intra_4x4_mode, u4_top_intra_4x4_mode);
}
ps_proc->au1_predicted_intra_luma_mb_4x4_modes[(b8 << 2) + b4] = u4_estimated_intra_4x4_mode;
/*mode evaluation and prediction*/
/* note: the availability passed is the table value, without the top right */
/* padding applied to s_ngbr_avbl above */
ps_codec->pf_ih264e_evaluate_intra_4x4_modes(pu1_mb_curr,
pu1_ngbr_pels_i4,
pu1_pred_mb, i4_src_strd,
i4_pred_strd, i4_ngbr_avbl,
&u4_best_intra_4x4_mode,
&i4_partition_cost_least,
u4_valid_intra_modes,
u4_lambda,
u4_estimated_intra_4x4_mode);
/* strip the mode signalling cost again to get the plain distortion: one */
/* lambda when the chosen mode equals the predicted one, four otherwise */
i4_partition_distortion_least = i4_partition_cost_least - ((u4_estimated_intra_4x4_mode == u4_best_intra_4x4_mode)?u4_cost_one_bit:u4_cost_four_bits);
DEBUG("%d partition cost, %d intra mode\n", i4_partition_cost_least, u4_best_intra_4x4_mode);
/* macroblock distortion */
i4_total_distortion += i4_partition_distortion_least;
i4_total_cost += i4_partition_cost_least;
/* mb partition mode */
ps_proc->au1_intra_luma_mb_4x4_modes[(b8 << 2) + b4] = u4_best_intra_4x4_mode;
/********************************************************/
/* error estimation, */
/* transform */
/* quantization */
/********************************************************/
ps_codec->pf_resi_trans_quant_4x4(pu1_mb_curr, pu1_pred_mb,
pi2_res_mb, i4_src_strd,
i4_pred_strd,
/* No op stride, this implies a buff of length 1x16 */
ps_qp_params->pu2_scale_mat,
ps_qp_params->pu2_thres_mat,
ps_qp_params->u1_qbits,
ps_qp_params->u4_dead_zone,
pu1_nnz, &i2_dc_dummy);
/********************************************************/
/* inverse quantization, */
/* inverse transform, */
/* reconstruction */
/********************************************************/
ps_codec->pf_iquant_itrans_recon_4x4(pi2_res_mb, pu1_pred_mb,
pu1_ref_mb_intra_4x4,
i4_pred_strd, i4_pred_strd,
ps_qp_params->pu2_iscale_mat,
ps_qp_params->pu2_weigh_mat,
ps_qp_params->u1_qp_div,
ps_proc->pv_scratch_buff, 0,
NULL);
}
}
/* update the type of the mb if necessary */
if (i4_total_cost < ps_proc->i4_mb_cost)
{
ps_proc->i4_mb_cost = i4_total_cost;
ps_proc->i4_mb_distortion = i4_total_distortion;
ps_proc->u4_mb_type = I4x4;
}
return ;
}
/**
******************************************************************************
*
* @brief
*  evaluate best chroma intra 8x8 mode (rate distortion opt off)
*
* @par Description
*  This function evaluates all the possible chroma intra 8x8 modes and finds
*  the mode that best represents the macroblock (least distortion) and occupies
*  fewer bits in the bitstream.
*
*  The neighbouring pels are gathered from the reconstructed chroma buffer
*  into ps_proc->au1_ngbr_pels, indexed in bytes as:
*    [0..15]  left column, 8 two-byte pairs stored bottom to top
*    [16..17] top left pair
*    [18..33] top row, 8 two-byte pairs
*  The codec's chroma mode evaluator is then run for the enabled modes and,
*  when PLANE is enabled, the plane prediction is generated and compared
*  separately.
*
* @param[in,out] ps_proc
*  pointer to macroblock context (handle). The routine writes
*  i4_chroma_neighbor_avail_8x8_mb and u1_c_i8_mode.
*
* @remarks
*  For chroma best intra pred mode is calculated based only on SAD
*
* @returns none
*
******************************************************************************
*/
void ih264e_evaluate_chroma_intra8x8_modes_for_least_cost_rdoptoff(process_ctxt_t *ps_proc)
{
    /* Codec Context */
    codec_t *ps_codec = ps_proc->ps_codec;

    /* SAD(distortion metric) of an 8x8 block */
    WORD32 i4_mb_distortion, i4_chroma_mb_distortion;

    /* intra mode */
    UWORD32 u4_best_chroma_intra_8x8_mode = DC_CH_I8x8;

    /* neighbor pels for intra prediction */
    UWORD8 *pu1_ngbr_pels_c_i8x8 = ps_proc->au1_ngbr_pels;

    /* pointer to curr macro block */
    UWORD8 *pu1_curr_mb = ps_proc->pu1_src_buf_chroma;
    UWORD8 *pu1_ref_mb = ps_proc->pu1_rec_buf_chroma;

    /* pointer to prediction macro block */
    UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb_intra_chroma;
    UWORD8 *pu1_pred_mb_plane = ps_proc->pu1_pred_mb_intra_chroma_plane;

    /* strides */
    WORD32 i4_src_strd_c = ps_proc->i4_src_chroma_strd;
    WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
    WORD32 i4_rec_strd_c = ps_proc->i4_rec_strd;

    /* neighbors left, top, top left (each pel is a two-byte pair) */
    UWORD8 *pu1_mb_a = pu1_ref_mb - 2;
    UWORD8 *pu1_mb_b = pu1_ref_mb - i4_rec_strd_c;
    UWORD8 *pu1_mb_d = pu1_mb_b - 2;

    /*
     * Modes that may be evaluated for each neighbor availability value.
     * Index: bit 0 = left (a), bit 1 = top left (d), bit 2 = top (b).
     * Value: bit 0 = DC, bit 1 = HORZ, bit 2 = VERT, bit 3 = PLANE.
     * HORZ needs the left column, VERT needs the top row and PLANE needs
     * left, top and top left together, so PLANE is enabled only at index 7
     * and the top left bit on its own never enables an additional mode.
     */
    const UWORD8 u1_valid_intra_modes[8] = {1, 3, 1, 3, 5, 7, 5, 15,};

    /* neighbor availability */
    WORD32 i4_ngbr_avbl;

    /* valid intra modes map */
    UWORD32 u4_valid_intra_modes;

    /* temp var */
    UWORD8 i;

    /* locating neighbors that are available for prediction */
    /* TODO : update the neighbor availability information basing on constrained intra pred information */
    /* TODO : i4_ngbr_avbl is only being used in DC mode. Can the DC mode be split in to distinct routines
     * basing on neighbors available and hence evade the computation of neighbor availability totally. */
    /* i4_ngbr_avbl = blk_a * LEFT_MB_AVAILABLE_MASK + blk_b * TOP_MB_AVAILABLE_MASK + blk_d * TOP_LEFT_MB_AVAILABLE_MASK */
    i4_ngbr_avbl = (ps_proc->ps_ngbr_avbl->u1_mb_a)
                    + (ps_proc->ps_ngbr_avbl->u1_mb_b << 2)
                    + (ps_proc->ps_ngbr_avbl->u1_mb_d << 1);
    ps_proc->i4_chroma_neighbor_avail_8x8_mb = i4_ngbr_avbl;

    /* gather prediction pels from the neighbors */
    /* left pels: row (i / 2) of the left column lands at pair index 7 - (i / 2) */
    if (ps_proc->ps_ngbr_avbl->u1_mb_a)
    {
        for (i = 0; i < 16; i += 2)
        {
            pu1_ngbr_pels_c_i8x8[16 - 2 - i] = pu1_mb_a[(i / 2) * i4_rec_strd_c];
            pu1_ngbr_pels_c_i8x8[16 - 1 - i] = pu1_mb_a[(i / 2) * i4_rec_strd_c + 1];
        }
    }
    else
    {
        ps_codec->pf_mem_set_mul8(pu1_ngbr_pels_c_i8x8, 0, MB_SIZE);
    }

    /* top pels */
    if (ps_proc->ps_ngbr_avbl->u1_mb_b)
    {
        ps_codec->pf_mem_cpy_mul8(&pu1_ngbr_pels_c_i8x8[18], pu1_mb_b, 16);
    }
    else
    {
        ps_codec->pf_mem_set_mul8((pu1_ngbr_pels_c_i8x8 + 18), 0, MB_SIZE);
    }

    /* top left pels (left untouched when not available) */
    if (ps_proc->ps_ngbr_avbl->u1_mb_d)
    {
        pu1_ngbr_pels_c_i8x8[16] = *pu1_mb_d;
        pu1_ngbr_pels_c_i8x8[17] = *(pu1_mb_d + 1);
    }

    u4_valid_intra_modes = u1_valid_intra_modes[i4_ngbr_avbl];

    /* the fast preset never evaluates the plane mode */
    if (ps_codec->s_cfg.u4_enc_speed_preset == IVE_FAST)
    {
        u4_valid_intra_modes &= ~(1 << PLANE_CH_I8x8);
    }

    i4_chroma_mb_distortion = INT_MAX;

    /* perform intra mode chroma 8x8 evaluation */
    /* intra prediction */
    ps_codec->pf_ih264e_evaluate_intra_chroma_modes(pu1_curr_mb,
                                                    pu1_ngbr_pels_c_i8x8,
                                                    pu1_pred_mb,
                                                    i4_src_strd_c,
                                                    i4_pred_strd,
                                                    i4_ngbr_avbl,
                                                    &u4_best_chroma_intra_8x8_mode,
                                                    &i4_chroma_mb_distortion,
                                                    u4_valid_intra_modes);

    if (u4_valid_intra_modes & 8)/* if Chroma PLANE is valid*/
    {
        /* plane prediction goes to its own buffer so that the prediction */
        /* written by the evaluator above is not overwritten */
        (ps_codec->apf_intra_pred_c)[PLANE_CH_I8x8](pu1_ngbr_pels_c_i8x8, pu1_pred_mb_plane, 0, i4_pred_strd, i4_ngbr_avbl);

        /* evaluate distortion(sad) */
        ps_codec->pf_compute_sad_16x8(pu1_curr_mb, pu1_pred_mb_plane, i4_src_strd_c, i4_pred_strd, i4_chroma_mb_distortion, &i4_mb_distortion);

        /* update the least distortion information if necessary */
        if (i4_mb_distortion < i4_chroma_mb_distortion)
        {
            i4_chroma_mb_distortion = i4_mb_distortion;
            u4_best_chroma_intra_8x8_mode = PLANE_CH_I8x8;
        }
    }

    DEBUG("%d partition cost, %d intra mode\n", i4_chroma_mb_distortion, u4_best_chroma_intra_8x8_mode);

    ps_proc->u1_c_i8_mode = u4_best_chroma_intra_8x8_mode;

    return ;
}
/**
******************************************************************************
*
* @brief
*  Pick the cheapest of the VERT, HORZ and DC intra 16x16 modes and write the
*  matching prediction.
*
* @par Description
*  The SAD between the 16x16 source block and each of the three predictions is
*  computed. When the smallest SAD among the enabled modes is lower than the
*  value already held in *pu4_sadmin, that value and *u4_intra_mode are
*  updated and the 16x16 prediction is written to pu1_dst; otherwise nothing
*  is written. On equal SADs VERT is preferred over HORZ, and HORZ over DC.
*
* @param[in] pu1_src
*  UWORD8 pointer to the source block
*
* @param[in] pu1_ngbr_pels_i16
*  UWORD8 pointer to neighbouring pels; [0..15] is the left column stored
*  bottom to top and [17..32] is the top row
*
* @param[out] pu1_dst
*  UWORD8 pointer to the prediction destination
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] u4_n_avblty
*  availability of neighbouring pixels (left and top masks are examined)
*
* @param[out] u4_intra_mode
*  Pointer to the variable in which best mode is returned
*
* @param[in,out] pu4_sadmin
*  Pointer to the SAD to beat on entry; receives the minimum sad when beaten
*
* @param[in] u4_valid_intra_modes
*  Bit mask of enabled modes: 1 = VERT, 2 = HORZ, 4 = DC
*
* @returns none
*
******************************************************************************
*/
void ih264e_evaluate_intra16x16_modes(UWORD8 *pu1_src,
                                      UWORD8 *pu1_ngbr_pels_i16,
                                      UWORD8 *pu1_dst,
                                      UWORD32 src_strd,
                                      UWORD32 dst_strd,
                                      WORD32 u4_n_avblty,
                                      UWORD32 *u4_intra_mode,
                                      WORD32 *pu4_sadmin,
                                      UWORD32 u4_valid_intra_modes)
{
    /* availability flags of the left column and the top row */
    UWORD8 u1_have_left = (u4_n_avblty & LEFT_MB_AVAILABLE_MASK);
    UWORD8 u1_have_top = (u4_n_avblty & TOP_MB_AVAILABLE_MASK) >> 2;

    /* first pel of the top row inside the neighbour array */
    UWORD8 *pu1_top_row = pu1_ngbr_pels_i16 + 17;

    /* current source row and current left pel */
    UWORD8 *pu1_row;
    UWORD8 u1_pel;

    /* running sum of neighbours, later the DC prediction value */
    WORD32 i4_dc = 0;

    /* SAD of each candidate and the winner */
    WORD32 i4_sad_v = INT_MAX, i4_sad_h = INT_MAX, i4_sad_d = 0;
    WORD32 i4_best;

    WORD32 r, c;

    /* HORZ: every row is predicted from its left pel */
    if (u1_have_left)
    {
        i4_sad_h = 0;
        for (r = 0, pu1_row = pu1_src; r < 16; r++, pu1_row += src_strd)
        {
            u1_pel = pu1_ngbr_pels_i16[15 - r];
            i4_dc += u1_pel;
            for (c = 0; c < 16; c++)
            {
                i4_sad_h += ABS(u1_pel - pu1_row[c]);
            }
        }
        /* rounding term for this half of the DC average */
        i4_dc += 8;
    }

    /* VERT: every row is predicted from the top row */
    if (u1_have_top)
    {
        i4_sad_v = 0;
        for (c = 0; c < 16; c++)
        {
            i4_dc += pu1_top_row[c];
        }
        i4_dc += 8;
        for (r = 0, pu1_row = pu1_src; r < 16; r++, pu1_row += src_strd)
        {
            for (c = 0; c < 16; c++)
            {
                i4_sad_v += ABS(pu1_top_row[c] - pu1_row[c]);
            }
        }
    }

    /* average over 16 or 32 neighbours; 128 when there are none */
    i4_dc >>= (3 + u1_have_left + u1_have_top);
    if (!u1_have_left && !u1_have_top)
    {
        i4_dc += 128;
    }

    /* DC: the whole block is predicted from one value */
    for (r = 0, pu1_row = pu1_src; r < 16; r++, pu1_row += src_strd)
    {
        for (c = 0; c < 16; c++)
        {
            i4_sad_d += ABS(i4_dc - pu1_row[c]);
        }
    }

    /* knock out the modes the caller did not enable */
    if (!(u4_valid_intra_modes & 0x4))
    {
        i4_sad_d = INT_MAX;
    }
    if (!(u4_valid_intra_modes & 0x1))
    {
        i4_sad_v = INT_MAX;
    }
    if (!(u4_valid_intra_modes & 0x2))
    {
        i4_sad_h = INT_MAX;
    }

    i4_best = i4_sad_v;
    if (i4_sad_h < i4_best)
    {
        i4_best = i4_sad_h;
    }
    if (i4_sad_d < i4_best)
    {
        i4_best = i4_sad_d;
    }

    /* nothing to do unless the caller's current best is beaten */
    if (!(i4_best < *pu4_sadmin))
    {
        return;
    }

    *pu4_sadmin = i4_best;

    if (i4_best == i4_sad_v)
    {
        *u4_intra_mode = VERT_I16x16;
        for (r = 0; r < 16; r++, pu1_dst += dst_strd)
        {
            memcpy(pu1_dst, pu1_top_row, MB_SIZE);
        }
    }
    else if (i4_best == i4_sad_h)
    {
        *u4_intra_mode = HORZ_I16x16;
        for (r = 0; r < 16; r++, pu1_dst += dst_strd)
        {
            u1_pel = pu1_ngbr_pels_i16[15 - r];
            memset(pu1_dst, u1_pel, MB_SIZE);
        }
    }
    else
    {
        *u4_intra_mode = DC_I16x16;
        for (r = 0; r < 16; r++, pu1_dst += dst_strd)
        {
            memset(pu1_dst, i4_dc, MB_SIZE);
        }
    }

    return;
}
/**
******************************************************************************
*
* @brief
* Evaluate best intra 4x4 mode and perform prediction.
*
* @par Description
* This function evaluates the enabled 4x4 modes, computes for each of them the
* SAD against the source block plus a mode signalling cost (u4_lambda when the
* mode equals u4_predictd_mode, 4 * u4_lambda otherwise), and writes the
* prediction of the cheapest mode to the destination. When several modes have
* the same cost, the one with the lowest index wins.
*
* @param[in] pu1_src
* UWORD8 pointer to the source
*
* @param[in,out] pu1_ngbr_pels
* UWORD8 pointer to neighbouring pels. As indexed by this routine: [0..3] is
* the left column with [3] beside row 0 and [0] beside row 3, and [5..8] is
* the row above the block. [4] and [9..12] are read by the diagonal filters
* (the 4x4 caller in this file stores the top left pel and the top right row
* there). Entries [13] and [14] are overwritten with [12] whenever a mode
* beyond VERT, HORZ and DC is enabled, so at least 15 bytes must be writable.
*
* @param[out] pu1_dst
* UWORD8 pointer to the destination
*
* @param[in] src_strd
* integer source stride
*
* @param[in] dst_strd
* integer destination stride
*
* @param[in] u4_n_avblty
* availability of neighbouring pixels (only used for the DC value)
*
* @param[out] u4_intra_mode
* Pointer to the variable in which best mode is returned
*
* @param[out] pu4_sadmin
* Pointer to the variable in which minimum cost is returned; it is always
* overwritten, its value on entry is not examined
*
* @param[in] u4_valid_intra_modes
* Says what all modes are valid: 1 = VERT, 2 = HORZ, 4 = DC, 8 = DIAG_DL,
* 16 = DIAG_DR, 32 = VERT_R, 64 = HORZ_D, 128 = VERT_L, 256 = HORZ_U
*
* @param[in] u4_lambda
* Lambda value for computing cost from SAD
*
* @param[in] u4_predictd_mode
* Predicted mode for cost computation
*
* @returns none
*
******************************************************************************
*/
void ih264e_evaluate_intra_4x4_modes(UWORD8 *pu1_src,
UWORD8 *pu1_ngbr_pels,
UWORD8 *pu1_dst,
UWORD32 src_strd,
UWORD32 dst_strd,
WORD32 u4_n_avblty,
UWORD32 *u4_intra_mode,
WORD32 *pu4_sadmin,
UWORD32 u4_valid_intra_modes,
UWORD32 u4_lambda,
UWORD32 u4_predictd_mode)
{
UWORD8 *pu1_src_temp = pu1_src;
UWORD8 *pu1_pred = pu1_ngbr_pels;
UWORD8 left = 0, top = 0;
UWORD8 u1_pred_val = 0;
UWORD8 u1_pred_vals[4] = {0};
UWORD8 *pu1_pred_val = NULL;
/* To store FILT121 operated values*/
UWORD8 u1_pred_vals_diag_121[15] = {0};
/* To store FILT11 operated values*/
UWORD8 u1_pred_vals_diag_11[15] = {0};
/* per-mode scratch rows assembled from the two filtered arrays above */
UWORD8 u1_pred_vals_vert_r[8] = {0};
UWORD8 u1_pred_vals_horz_d[10] = {0};
UWORD8 u1_pred_vals_horz_u[10] = {0};
WORD32 u4_dcval = 0;
/* a mode that is not evaluated keeps INT_MAX and so can never be the minimum */
/* unless every mode is disabled */
WORD32 i4_sad[MAX_I4x4] = {INT_MAX, INT_MAX, INT_MAX, INT_MAX, INT_MAX,
INT_MAX, INT_MAX, INT_MAX, INT_MAX};
WORD32 i4_cost[MAX_I4x4] = {INT_MAX, INT_MAX, INT_MAX, INT_MAX, INT_MAX,
INT_MAX, INT_MAX, INT_MAX, INT_MAX};
WORD32 i, i4_min_cost = INT_MAX;
left = (u4_n_avblty & LEFT_MB_AVAILABLE_MASK);
top = (u4_n_avblty & TOP_MB_AVAILABLE_MASK) >> 2;
/* Computing SAD */
/* VERT mode valid */
if (u4_valid_intra_modes & 1)
{
/* every row is compared against the top row at [5..8] */
pu1_pred = pu1_ngbr_pels + 5;
i4_sad[VERT_I4x4] = 0;
i4_cost[VERT_I4x4] = 0;
USADA8(pu1_src_temp, pu1_pred, i4_sad[VERT_I4x4]);
pu1_src_temp += src_strd;
USADA8(pu1_src_temp, pu1_pred, i4_sad[VERT_I4x4]);
pu1_src_temp += src_strd;
USADA8(pu1_src_temp, pu1_pred, i4_sad[VERT_I4x4]);
pu1_src_temp += src_strd;
USADA8(pu1_src_temp, pu1_pred, i4_sad[VERT_I4x4]);
i4_cost[VERT_I4x4] = i4_sad[VERT_I4x4] + ((u4_predictd_mode == VERT_I4x4) ?
u4_lambda : 4 * u4_lambda);
}
/* HORZ mode valid */
if (u4_valid_intra_modes & 2)
{
i4_sad[HORZ_I4x4] = 0;
i4_cost[HORZ_I4x4] =0;
pu1_src_temp = pu1_src;
/* row 0 uses left pel [3], row 1 uses [2], row 2 uses [1], row 3 uses [0] */
u1_pred_val = pu1_ngbr_pels[3];
i4_sad[HORZ_I4x4] += ABS(pu1_src_temp[0] - u1_pred_val)
+ ABS(pu1_src_temp[1] - u1_pred_val)
+ ABS(pu1_src_temp[2] - u1_pred_val)
+ ABS(pu1_src_temp[3] - u1_pred_val);
pu1_src_temp += src_strd;
u1_pred_val = pu1_ngbr_pels[2];
i4_sad[HORZ_I4x4] += ABS(pu1_src_temp[0] - u1_pred_val)
+ ABS(pu1_src_temp[1] - u1_pred_val)
+ ABS(pu1_src_temp[2] - u1_pred_val)
+ ABS(pu1_src_temp[3] - u1_pred_val);
pu1_src_temp += src_strd;
u1_pred_val = pu1_ngbr_pels[1];
i4_sad[HORZ_I4x4] += ABS(pu1_src_temp[0] - u1_pred_val)
+ ABS(pu1_src_temp[1] - u1_pred_val)
+ ABS(pu1_src_temp[2] - u1_pred_val)
+ ABS(pu1_src_temp[3] - u1_pred_val);
pu1_src_temp += src_strd;
u1_pred_val = pu1_ngbr_pels[0];
i4_sad[HORZ_I4x4] += ABS(pu1_src_temp[0] - u1_pred_val)
+ ABS(pu1_src_temp[1] - u1_pred_val)
+ ABS(pu1_src_temp[2] - u1_pred_val)
+ ABS(pu1_src_temp[3] - u1_pred_val);
i4_cost[HORZ_I4x4] = i4_sad[HORZ_I4x4] + ((u4_predictd_mode == HORZ_I4x4) ?
u4_lambda : 4 * u4_lambda);
}
/* DC mode valid */
if (u4_valid_intra_modes & 4)
{
i4_sad[DC_I4x4] = 0;
i4_cost[DC_I4x4] = 0;
pu1_src_temp = pu1_src;
/* sum the available neighbours, each group with a rounding term of 2 */
if (left)
u4_dcval = pu1_ngbr_pels[0] + pu1_ngbr_pels[1] + pu1_ngbr_pels[2]
+ pu1_ngbr_pels[3] + 2;
if (top)
u4_dcval += pu1_ngbr_pels[5] + pu1_ngbr_pels[6] + pu1_ngbr_pels[7]
+ pu1_ngbr_pels[8] + 2;
/* the sum is zero only when neither group was added (each adds at least 2); */
/* in that case the DC value is 128 */
u4_dcval = (u4_dcval) ? (u4_dcval >> (1 + left + top)) : 128;
/* build one row of the DC prediction to compare against */
memset(u1_pred_vals, u4_dcval, 4);
USADA8(pu1_src_temp, u1_pred_vals, i4_sad[DC_I4x4]);
pu1_src_temp += src_strd;
USADA8(pu1_src_temp, u1_pred_vals, i4_sad[DC_I4x4]);
pu1_src_temp += src_strd;
USADA8(pu1_src_temp, u1_pred_vals, i4_sad[DC_I4x4]);
pu1_src_temp += src_strd;
USADA8(pu1_src_temp, u1_pred_vals, i4_sad[DC_I4x4]);
pu1_src_temp += src_strd;
i4_cost[DC_I4x4] = i4_sad[DC_I4x4] + ((u4_predictd_mode == DC_I4x4) ?
u4_lambda : 4 * u4_lambda);
}
/* if modes other than VERT, HORZ and DC are valid */
if (u4_valid_intra_modes > 7)
{
pu1_pred = pu1_ngbr_pels;
/* replicate the last pel so the filters below can read up to index 14; */
/* note that this writes into the caller's neighbour array */
pu1_pred[13] = pu1_pred[14] = pu1_pred[12];
/* Performing FILT121 and FILT11 operation for all neighbour values*/
/* entry i of each array is the filter applied at neighbours i, i + 1 (, i + 2) */
for (i = 0; i < 13; i++)
{
u1_pred_vals_diag_121[i] = FILT121(pu1_pred[0], pu1_pred[1], pu1_pred[2]);
u1_pred_vals_diag_11[i] = FILT11(pu1_pred[0], pu1_pred[1]);
pu1_pred++;
}
if (u4_valid_intra_modes & 8)/* DIAG_DL */
{
i4_sad[DIAG_DL_I4x4] = 0;
i4_cost[DIAG_DL_I4x4] = 0;
pu1_src_temp = pu1_src;
/* each row starts one filtered entry further right than the previous one */
pu1_pred_val = u1_pred_vals_diag_121 + 5;
USADA8(pu1_src_temp, pu1_pred_val, i4_sad[DIAG_DL_I4x4]);
pu1_src_temp += src_strd;
USADA8(pu1_src_temp, (pu1_pred_val + 1), i4_sad[DIAG_DL_I4x4]);
pu1_src_temp += src_strd;
USADA8(pu1_src_temp, (pu1_pred_val + 2), i4_sad[DIAG_DL_I4x4]);
pu1_src_temp += src_strd;
USADA8(pu1_src_temp, (pu1_pred_val + 3), i4_sad[DIAG_DL_I4x4]);
pu1_src_temp += src_strd;
i4_cost[DIAG_DL_I4x4] = i4_sad[DIAG_DL_I4x4] + ((u4_predictd_mode == DIAG_DL_I4x4) ?
u4_lambda : 4 * u4_lambda);
}
if (u4_valid_intra_modes & 16)/* DIAG_DR */
{
i4_sad[DIAG_DR_I4x4] = 0;
i4_cost[DIAG_DR_I4x4] = 0;
pu1_src_temp = pu1_src;
/* each row starts one filtered entry further left than the previous one */
pu1_pred_val = u1_pred_vals_diag_121 + 3;
USADA8(pu1_src_temp, pu1_pred_val, i4_sad[DIAG_DR_I4x4]);
pu1_src_temp += src_strd;
USADA8(pu1_src_temp, (pu1_pred_val - 1), i4_sad[DIAG_DR_I4x4]);
pu1_src_temp += src_strd;
USADA8(pu1_src_temp, (pu1_pred_val - 2), i4_sad[DIAG_DR_I4x4]);
pu1_src_temp += src_strd;
USADA8(pu1_src_temp, (pu1_pred_val - 3), i4_sad[DIAG_DR_I4x4]);
pu1_src_temp += src_strd;
i4_cost[DIAG_DR_I4x4] = i4_sad[DIAG_DR_I4x4] + ((u4_predictd_mode == DIAG_DR_I4x4) ?
u4_lambda : 4 * u4_lambda);
}
if (u4_valid_intra_modes & 32)/* VERT_R mode valid */
{
i4_sad[VERT_R_I4x4] = 0;
pu1_src_temp = pu1_src;
/* rows 0 and 1 come straight from the filtered arrays; rows 2 and 3 are */
/* assembled in u1_pred_vals_vert_r[0..3] and [4..7] */
u1_pred_vals_vert_r[0] = u1_pred_vals_diag_121[2];
memcpy((u1_pred_vals_vert_r + 1), (u1_pred_vals_diag_11 + 4), 3);
u1_pred_vals_vert_r[4] = u1_pred_vals_diag_121[1];
memcpy((u1_pred_vals_vert_r + 5), (u1_pred_vals_diag_121 + 3), 3);
pu1_pred_val = u1_pred_vals_diag_11 + 4;
USADA8(pu1_src_temp, pu1_pred_val, i4_sad[VERT_R_I4x4]);
pu1_pred_val = u1_pred_vals_diag_121 + 3;
pu1_src_temp += src_strd;
USADA8(pu1_src_temp, pu1_pred_val, i4_sad[VERT_R_I4x4]);
pu1_src_temp += src_strd;
USADA8(pu1_src_temp, (u1_pred_vals_vert_r), i4_sad[VERT_R_I4x4]);
pu1_src_temp += src_strd;
USADA8(pu1_src_temp, (u1_pred_vals_vert_r + 4),
i4_sad[VERT_R_I4x4]);
i4_cost[VERT_R_I4x4] = i4_sad[VERT_R_I4x4] + ((u4_predictd_mode == VERT_R_I4x4) ?
u4_lambda : 4 * u4_lambda);
}
if (u4_valid_intra_modes & 64)/* HORZ_D mode valid */
{
i4_sad[HORZ_D_I4x4] = 0;
pu1_src_temp = pu1_src;
/* one 10-entry strip; row r of the prediction is the 4 entries starting */
/* at offset 6 - 2 * r */
u1_pred_vals_horz_d[6] = u1_pred_vals_diag_11[3];
memcpy((u1_pred_vals_horz_d + 7), (u1_pred_vals_diag_121 + 3), 3);
u1_pred_vals_horz_d[0] = u1_pred_vals_diag_11[0];
u1_pred_vals_horz_d[1] = u1_pred_vals_diag_121[0];
u1_pred_vals_horz_d[2] = u1_pred_vals_diag_11[1];
u1_pred_vals_horz_d[3] = u1_pred_vals_diag_121[1];
u1_pred_vals_horz_d[4] = u1_pred_vals_diag_11[2];
u1_pred_vals_horz_d[5] = u1_pred_vals_diag_121[2];
pu1_pred_val = u1_pred_vals_horz_d;
USADA8(pu1_src_temp, (pu1_pred_val + 6), i4_sad[HORZ_D_I4x4]);
pu1_src_temp += src_strd;
USADA8(pu1_src_temp, (pu1_pred_val + 4), i4_sad[HORZ_D_I4x4]);
pu1_src_temp += src_strd;
USADA8(pu1_src_temp, (pu1_pred_val + 2), i4_sad[HORZ_D_I4x4]);
pu1_src_temp += src_strd;
USADA8(pu1_src_temp, (pu1_pred_val), i4_sad[HORZ_D_I4x4]);
i4_cost[HORZ_D_I4x4] = i4_sad[HORZ_D_I4x4] + ((u4_predictd_mode == HORZ_D_I4x4) ?
u4_lambda : 4 * u4_lambda);
}
if (u4_valid_intra_modes & 128)/* VERT_L mode valid */
{
i4_sad[VERT_L_I4x4] = 0;
pu1_src_temp = pu1_src;
/* rows alternate between the FILT11 and FILT121 arrays, moving one entry */
/* to the right every two rows */
pu1_pred_val = u1_pred_vals_diag_11 + 5;
USADA8(pu1_src_temp, (pu1_pred_val), i4_sad[VERT_L_I4x4]);
pu1_src_temp += src_strd;
pu1_pred_val = u1_pred_vals_diag_121 + 5;
USADA8(pu1_src_temp, (pu1_pred_val), i4_sad[VERT_L_I4x4]);
pu1_src_temp += src_strd;
pu1_pred_val = u1_pred_vals_diag_11 + 6;
USADA8(pu1_src_temp, (pu1_pred_val), i4_sad[VERT_L_I4x4]);
pu1_src_temp += src_strd;
pu1_pred_val = u1_pred_vals_diag_121 + 6;
USADA8(pu1_src_temp, (pu1_pred_val), i4_sad[VERT_L_I4x4]);
i4_cost[VERT_L_I4x4] = i4_sad[VERT_L_I4x4] + ((u4_predictd_mode == VERT_L_I4x4) ?
u4_lambda : 4 * u4_lambda);
}
if (u4_valid_intra_modes & 256)/* HORZ_U mode valid */
{
i4_sad[HORZ_U_I4x4] = 0;
pu1_src_temp = pu1_src;
/* one 10-entry strip; row r of the prediction is the 4 entries starting */
/* at offset 2 * r, and the tail is filled with the last left pel */
u1_pred_vals_horz_u[0] = u1_pred_vals_diag_11[2];
u1_pred_vals_horz_u[1] = u1_pred_vals_diag_121[1];
u1_pred_vals_horz_u[2] = u1_pred_vals_diag_11[1];
u1_pred_vals_horz_u[3] = u1_pred_vals_diag_121[0];
u1_pred_vals_horz_u[4] = u1_pred_vals_diag_11[0];
u1_pred_vals_horz_u[5] = FILT121(pu1_ngbr_pels[0], pu1_ngbr_pels[0], pu1_ngbr_pels[1]);
memset((u1_pred_vals_horz_u + 6), pu1_ngbr_pels[0], 4);
pu1_pred_val = u1_pred_vals_horz_u;
USADA8(pu1_src_temp, (pu1_pred_val), i4_sad[HORZ_U_I4x4]);
pu1_src_temp += src_strd;
USADA8(pu1_src_temp, (pu1_pred_val + 2), i4_sad[HORZ_U_I4x4]);
pu1_src_temp += src_strd;
USADA8(pu1_src_temp, (pu1_pred_val + 4), i4_sad[HORZ_U_I4x4]);
pu1_src_temp += src_strd;
USADA8(pu1_src_temp, (pu1_pred_val + 6), i4_sad[HORZ_U_I4x4]);
i4_cost[HORZ_U_I4x4] = i4_sad[HORZ_U_I4x4] + ((u4_predictd_mode == HORZ_U_I4x4) ?
u4_lambda : 4 * u4_lambda);
}
i4_min_cost = MIN3(MIN3(i4_cost[0], i4_cost[1], i4_cost[2]),
MIN3(i4_cost[3], i4_cost[4], i4_cost[5]),
MIN3(i4_cost[6], i4_cost[7], i4_cost[8]));
}
else
{
/* Only first three modes valid */
i4_min_cost = MIN3(i4_cost[0], i4_cost[1], i4_cost[2]);
}
/* the minimum is reported unconditionally */
*pu4_sadmin = i4_min_cost;
/* write the prediction of the first mode, in index order, whose cost equals */
/* the minimum; the scratch arrays filled during evaluation are reused here */
if (i4_min_cost == i4_cost[0])
{
*u4_intra_mode = VERT_I4x4;
pu1_pred_val = pu1_ngbr_pels + 5;
memcpy(pu1_dst, (pu1_pred_val), 4);
pu1_dst += dst_strd;
memcpy(pu1_dst, (pu1_pred_val), 4);
pu1_dst += dst_strd;
memcpy(pu1_dst, (pu1_pred_val), 4);
pu1_dst += dst_strd;
memcpy(pu1_dst, (pu1_pred_val), 4);
}
else if (i4_min_cost == i4_cost[1])
{
*u4_intra_mode = HORZ_I4x4;
memset(pu1_dst, pu1_ngbr_pels[3], 4);
pu1_dst += dst_strd;
memset(pu1_dst, pu1_ngbr_pels[2], 4);
pu1_dst += dst_strd;
memset(pu1_dst, pu1_ngbr_pels[1], 4);
pu1_dst += dst_strd;
memset(pu1_dst, pu1_ngbr_pels[0], 4);
}
else if (i4_min_cost == i4_cost[2])
{
*u4_intra_mode = DC_I4x4;
memset(pu1_dst, u4_dcval, 4);
pu1_dst += dst_strd;
memset(pu1_dst, u4_dcval, 4);
pu1_dst += dst_strd;
memset(pu1_dst, u4_dcval, 4);
pu1_dst += dst_strd;
memset(pu1_dst, u4_dcval, 4);
}
else if (i4_min_cost == i4_cost[3])
{
*u4_intra_mode = DIAG_DL_I4x4;
pu1_pred_val = u1_pred_vals_diag_121 + 5;
memcpy(pu1_dst, (pu1_pred_val), 4);
pu1_dst += dst_strd;
memcpy(pu1_dst, (pu1_pred_val + 1), 4);
pu1_dst += dst_strd;
memcpy(pu1_dst, (pu1_pred_val + 2), 4);
pu1_dst += dst_strd;
memcpy(pu1_dst, (pu1_pred_val + 3), 4);
}
else if (i4_min_cost == i4_cost[4])
{
*u4_intra_mode = DIAG_DR_I4x4;
pu1_pred_val = u1_pred_vals_diag_121 + 3;
memcpy(pu1_dst, (pu1_pred_val), 4);
pu1_dst += dst_strd;
memcpy(pu1_dst, (pu1_pred_val - 1), 4);
pu1_dst += dst_strd;
memcpy(pu1_dst, (pu1_pred_val - 2), 4);
pu1_dst += dst_strd;
memcpy(pu1_dst, (pu1_pred_val - 3), 4);
}
else if (i4_min_cost == i4_cost[5])
{
*u4_intra_mode = VERT_R_I4x4;
pu1_pred_val = u1_pred_vals_diag_11 + 4;
memcpy(pu1_dst, (pu1_pred_val), 4);
pu1_dst += dst_strd;
pu1_pred_val = u1_pred_vals_diag_121 + 3;
memcpy(pu1_dst, (pu1_pred_val), 4);
pu1_dst += dst_strd;
memcpy(pu1_dst, (u1_pred_vals_vert_r), 4);
pu1_dst += dst_strd;
memcpy(pu1_dst, (u1_pred_vals_vert_r + 4), 4);
}
else if (i4_min_cost == i4_cost[6])
{
*u4_intra_mode = HORZ_D_I4x4;
pu1_pred_val = u1_pred_vals_horz_d;
memcpy(pu1_dst, (pu1_pred_val + 6), 4);
pu1_dst += dst_strd;
memcpy(pu1_dst, (pu1_pred_val + 4), 4);
pu1_dst += dst_strd;
memcpy(pu1_dst, (pu1_pred_val + 2), 4);
pu1_dst += dst_strd;
memcpy(pu1_dst, (pu1_pred_val), 4);
pu1_dst += dst_strd;
}
else if (i4_min_cost == i4_cost[7])
{
*u4_intra_mode = VERT_L_I4x4;
pu1_pred_val = u1_pred_vals_diag_11 + 5;
memcpy(pu1_dst, (pu1_pred_val), 4);
pu1_dst += dst_strd;
pu1_pred_val = u1_pred_vals_diag_121 + 5;
memcpy(pu1_dst, (pu1_pred_val), 4);
pu1_dst += dst_strd;
pu1_pred_val = u1_pred_vals_diag_11 + 6;
memcpy(pu1_dst, (pu1_pred_val), 4);
pu1_dst += dst_strd;
pu1_pred_val = u1_pred_vals_diag_121 + 6;
memcpy(pu1_dst, (pu1_pred_val), 4);
}
else if (i4_min_cost == i4_cost[8])
{
*u4_intra_mode = HORZ_U_I4x4;
pu1_pred_val = u1_pred_vals_horz_u;
memcpy(pu1_dst, (pu1_pred_val), 4);
pu1_dst += dst_strd;
memcpy(pu1_dst, (pu1_pred_val + 2), 4);
pu1_dst += dst_strd;
memcpy(pu1_dst, (pu1_pred_val + 4), 4);
pu1_dst += dst_strd;
memcpy(pu1_dst, (pu1_pred_val + 6), 4);
pu1_dst += dst_strd;
}
return;
}
/**
******************************************************************************
*
* @brief:
* Evaluate best intra chroma mode (among VERT, HORZ and DC) and do the prediction.
*
* @par Description
* This function evaluates first three intra chroma modes and compute corresponding sad
* and return the buffer predicted with best mode.
*
* @param[in] pu1_src
* UWORD8 pointer to the source
*
* @param[in] pu1_ngbr_pels
* UWORD8 pointer to neighbouring pels
*
* @param[out] pu1_dst
* UWORD8 pointer to the destination
*
* @param[in] src_strd
* integer source stride
*
* @param[in] dst_strd
* integer destination stride
*
* @param[in] u4_n_avblty
* availability of neighbouring pixels
*
* @param[out] u4_intra_mode
* Pointer to the variable in which best mode is returned
*
* @param[in] pu4_sadmin
* Pointer to the variable in which minimum sad is returned
*
* @param[in] u4_valid_intra_modes
* Says what all modes are valid
*
* @return none
*
******************************************************************************
*/
void ih264e_evaluate_intra_chroma_modes(UWORD8 *pu1_src,
                                        UWORD8 *pu1_ngbr_pels,
                                        UWORD8 *pu1_dst,
                                        UWORD32 src_strd,
                                        UWORD32 dst_strd,
                                        WORD32 u4_n_avblty,
                                        UWORD32 *u4_intra_mode,
                                        WORD32 *pu4_sadmin,
                                        UWORD32 u4_valid_intra_modes)
{
    /*
     * Neighbour buffer as indexed by this routine (U and V interleaved):
     *   left pel of source row r : U at [14 - 2r], V at [15 - 2r]
     *   top pels                 : 16 bytes starting at [18]
     */
    UWORD8 *pu1_top_pels = pu1_ngbr_pels + 18;

    /* walking pointers over the source rows and the destination rows */
    UWORD8 *pu1_row;
    UWORD8 *pu1_out = pu1_dst;

    /* availability flags reduced to 0 / 1 */
    UWORD8 u1_left_avail = (u4_n_avblty & LEFT_MB_AVAILABLE_MASK);
    UWORD8 u1_top_avail = (u4_n_avblty & TOP_MB_AVAILABLE_MASK) >> 2;

    /* neighbour sums, indexed [plane][half]; plane 0 is U, plane 1 is V.
     * For the left neighbours 'half' selects upper / lower four rows, for
     * the top neighbours it selects left / right four columns */
    WORD32 ai4_left_sum[2][2] = { { 0, 0 }, { 0, 0 } };
    WORD32 ai4_top_sum[2][2] = { { 0, 0 }, { 0, 0 } };

    /* dc value of each 4x4 quadrant, indexed [row half][col half][plane] */
    WORD32 ai4_dc[2][2][2];

    WORD32 i4_cost_horz = INT_MAX;
    WORD32 i4_cost_vert = INT_MAX;
    WORD32 i4_cost_dc = 0;
    WORD32 i4_best;
    WORD32 i4_shift_left, i4_shift_top, i4_shift_both;
    WORD32 x, y, plane, half;

    /* ---------------- HORZ : sad and left neighbour sums ---------------- */
    if (u1_left_avail)
    {
        i4_cost_horz = 0;
        pu1_row = pu1_src;

        for (y = 0; y < 8; y++)
        {
            UWORD8 u1_pel_u = pu1_ngbr_pels[14 - 2 * y];
            UWORD8 u1_pel_v = pu1_ngbr_pels[15 - 2 * y];

            ai4_left_sum[0][y >> 2] += u1_pel_u;
            ai4_left_sum[1][y >> 2] += u1_pel_v;

            /* every pel of the row is predicted from its left neighbour */
            for (x = 0; x < 16; x += 2)
            {
                i4_cost_horz += ABS(u1_pel_u - pu1_row[x]);
                i4_cost_horz += ABS(u1_pel_v - pu1_row[x + 1]);
            }
            pu1_row += src_strd;
        }

        /* rounding term for the dc shift applied further down */
        for (plane = 0; plane < 2; plane++)
        {
            for (half = 0; half < 2; half++)
            {
                ai4_left_sum[plane][half] += 2;
            }
        }
    }

    /* ---------------- VERT : sad and top neighbour sums ----------------- */
    if (u1_top_avail)
    {
        for (x = 0; x < 8; x++)
        {
            ai4_top_sum[0][x >> 2] += pu1_top_pels[2 * x];
            ai4_top_sum[1][x >> 2] += pu1_top_pels[2 * x + 1];
        }

        /* rounding term for the dc shift applied further down */
        for (plane = 0; plane < 2; plane++)
        {
            for (half = 0; half < 2; half++)
            {
                ai4_top_sum[plane][half] += 2;
            }
        }

        /* every source row is compared against the same 16 top bytes */
        i4_cost_vert = 0;
        pu1_row = pu1_src;
        for (y = 0; y < 8; y++)
        {
            for (x = 0; x < 16; x++)
            {
                i4_cost_vert += ABS(pu1_top_pels[x] - pu1_row[x]);
            }
            pu1_row += src_strd;
        }
    }

    /* ---------------- DC values of the four quadrants ------------------- */
    i4_shift_left = 1 + u1_left_avail;
    i4_shift_top = 1 + u1_top_avail;
    i4_shift_both = 1 + u1_left_avail + u1_top_avail;

    for (plane = 0; plane < 2; plane++)
    {
        /* Equation 8-128 in spec : diagonal quadrants use both neighbours */
        ai4_dc[0][0][plane] = (ai4_left_sum[plane][0] + ai4_top_sum[plane][0])
                        >> i4_shift_both;
        ai4_dc[1][1][plane] = (ai4_left_sum[plane][1] + ai4_top_sum[plane][1])
                        >> i4_shift_both;

        /* Equation 8-132 in spec : top right quadrant prefers the top
         * neighbours and falls back to the left ones */
        if (u1_top_avail)
        {
            ai4_dc[0][1][plane] = ai4_top_sum[plane][1] >> i4_shift_top;
        }
        else
        {
            ai4_dc[0][1][plane] = ai4_left_sum[plane][0] >> i4_shift_left;
        }

        /* bottom left quadrant prefers the left neighbours and falls back
         * to the top ones */
        if (u1_left_avail)
        {
            ai4_dc[1][0][plane] = ai4_left_sum[plane][1] >> i4_shift_left;
        }
        else
        {
            ai4_dc[1][0][plane] = ai4_top_sum[plane][0] >> i4_shift_top;
        }
    }

    if (!u1_left_avail && !u1_top_avail)
    {
        /* no neighbour at all : every quadrant predicts 128 */
        for (y = 0; y < 2; y++)
        {
            for (x = 0; x < 2; x++)
            {
                ai4_dc[y][x][0] = 128;
                ai4_dc[y][x][1] = 128;
            }
        }
    }

    /* ---------------- DC : sad ----------------------------------------- */
    pu1_row = pu1_src;
    for (y = 0; y < 8; y++)
    {
        for (x = 0; x < 8; x++)
        {
            UWORD8 u1_pel_u = ai4_dc[y >> 2][x >> 2][0];
            UWORD8 u1_pel_v = ai4_dc[y >> 2][x >> 2][1];

            i4_cost_dc += ABS(u1_pel_u - pu1_row[2 * x]);
            i4_cost_dc += ABS(u1_pel_v - pu1_row[2 * x + 1]);
        }
        pu1_row += src_strd;
    }

    /* modes masked out by the caller must never win */
    if (!(u4_valid_intra_modes & 0x1))
    {
        i4_cost_dc = INT_MAX;
    }
    if (!(u4_valid_intra_modes & 0x2))
    {
        i4_cost_horz = INT_MAX;
    }
    if (!(u4_valid_intra_modes & 0x4))
    {
        i4_cost_vert = INT_MAX;
    }

    i4_best = MIN3(i4_cost_horz, i4_cost_dc, i4_cost_vert);

    /* nothing is written unless the incoming minimum sad is beaten */
    if (i4_best >= *pu4_sadmin)
    {
        return;
    }
    *pu4_sadmin = i4_best;

    /* on equal sad the preference is DC, then HORZ, then VERT */
    if (i4_best == i4_cost_dc)
    {
        *u4_intra_mode = DC_CH_I8x8;
        for (y = 0; y < 8; y++)
        {
            for (x = 0; x < 8; x++)
            {
                pu1_out[2 * x] = ai4_dc[y >> 2][x >> 2][0];
                pu1_out[2 * x + 1] = ai4_dc[y >> 2][x >> 2][1];
            }
            pu1_out += dst_strd;
        }
    }
    else if (i4_best == i4_cost_horz)
    {
        *u4_intra_mode = HORZ_CH_I8x8;
        for (y = 0; y < 8; y++)
        {
            UWORD8 u1_pel_u = pu1_ngbr_pels[14 - 2 * y];
            UWORD8 u1_pel_v = pu1_ngbr_pels[15 - 2 * y];

            for (x = 0; x < 16; x += 2)
            {
                pu1_out[x] = u1_pel_u;
                pu1_out[x + 1] = u1_pel_v;
            }
            pu1_out += dst_strd;
        }
    }
    else
    {
        *u4_intra_mode = VERT_CH_I8x8;
        for (y = 0; y < 8; y++)
        {
            /* replicate the interleaved top row into each output row */
            memcpy(pu1_out, pu1_top_pels, MB_SIZE);
            pu1_out += dst_strd;
        }
    }
    return;
}