avx2 intrinsics support for libavc (sw decoder)

This patch adds handwritten AVX2 intrinsics to optimize the libavc software decoder by reducing
CPU-cycle overhead in the libcodec2_soft_avcdec module.

Playing back 1024-resolution video in the Gallery app with the HW decoder disabled: cpu-cycles
overhead (%) reduced by ~15%.
Loading video thumbnails in the Gallery/Photos app is faster (more than 30 videos were pushed to
the device for this use case): cpu-cycles overhead (%) reduced by ~10%. This patch only affects
s/w video decoding.

Signed-off-by: Priyanka Bose <priyanka.bose@intel.corp-partner.google.com>
Priyanka Bose 2025-11-10 13:36:27 +05:30 committed by Harish Mahendrakar
parent 9331596eef
commit ca7461442c
19 changed files with 2817 additions and 8 deletions
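For context, the decoder picks its SIMD kernels at run time rather than at build time; this patch adds decoder/x86/ih264d_function_selector_avx2.c for that purpose, though its contents are not reproduced below. The following is only an illustrative sketch of that dispatch pattern, not the patch's actual selector code: the pointer name, select_deblk_kernels, and the use of __builtin_cpu_supports are assumptions, while the kernel names come from the header hunk below and the parameter list mirrors ih264_deblk_chroma_vert_bslt4_avx2 as defined later in this commit.

#include <stdint.h>

/* Parameter list mirrors the AVX2 chroma deblock kernel below
 * (UWORD8 = uint8_t, WORD32 = int32_t, UWORD32 = uint32_t). */
typedef void deblk_chroma_bslt4_ft(uint8_t *pu1_src, int32_t src_strd,
                                   int32_t alpha_cb, int32_t beta_cb,
                                   int32_t alpha_cr, int32_t beta_cr,
                                   uint32_t u4_bs,
                                   const uint8_t *pu1_cliptab_cb,
                                   const uint8_t *pu1_cliptab_cr);

deblk_chroma_bslt4_ft ih264_deblk_chroma_vert_bslt4_ssse3; /* existing kernel */
deblk_chroma_bslt4_ft ih264_deblk_chroma_vert_bslt4_avx2;  /* added by this patch */

/* Hypothetical dispatch: default to the SSSE3 kernel, upgrade when the CPU
 * reports AVX2. The real decoder keeps such pointers in its codec context
 * and fills them in ih264d_function_selector_*.c. */
static deblk_chroma_bslt4_ft *pf_deblk_chroma_vert_bslt4;

static void select_deblk_kernels(void)
{
    pf_deblk_chroma_vert_bslt4 = ih264_deblk_chroma_vert_bslt4_ssse3;
#if defined(__GNUC__) || defined(__clang__)
    if (__builtin_cpu_supports("avx2"))
        pf_deblk_chroma_vert_bslt4 = ih264_deblk_chroma_vert_bslt4_avx2;
#endif
}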

@@ -236,6 +236,56 @@ cc_defaults {
},
}
cc_library_static {
name: "libavc_avx2",
defaults: ["libavc_dec_defaults"],
visibility: ["//visibility:private"],
export_include_dirs: [
"common",
"decoder",
],
arch: {
x86: {
cflags: [
"-mavx2",
"-mfma",
],
srcs: [
"common/x86/ih264_ihadamard_scaling_avx2.c",
"common/x86/ih264_deblk_chroma_avx2.c",
"common/x86/ih264_deblk_luma_avx2.c",
"common/x86/ih264_iquant_itrans_recon_avx2.c",
"common/x86/ih264_weighted_pred_avx2.c",
"common/x86/ih264_inter_pred_filters_avx2.c",
"decoder/x86/ih264d_function_selector_avx2.c",
],
},
x86_64: {
cflags: [
"-mavx2",
"-mfma",
],
srcs: [
"common/x86/ih264_ihadamard_scaling_avx2.c",
"common/x86/ih264_deblk_chroma_avx2.c",
"common/x86/ih264_deblk_luma_avx2.c",
"common/x86/ih264_iquant_itrans_recon_avx2.c",
"common/x86/ih264_weighted_pred_avx2.c",
"common/x86/ih264_inter_pred_filters_avx2.c",
"decoder/x86/ih264d_function_selector_avx2.c",
],
},
},
sanitize: {
blocklist: "libavc_blocklist.txt",
},
apex_available: [
"//apex_available:platform", // used by libstagefright_soft_avcdec
"com.android.media.swcodec",
],
min_sdk_version: "29",
}
cc_library_static {
name: "libavcdec",
defaults: ["libavc_dec_defaults"],
@@ -380,6 +430,10 @@ cc_library_static {
"decoder/x86/ih264d_function_selector_sse42.c",
"decoder/x86/ih264d_function_selector_ssse3.c",
],
whole_static_libs: [
"libavc_avx2",
],
},
x86_64: {
@@ -400,6 +454,9 @@ cc_library_static {
"decoder/x86/ih264d_function_selector_sse42.c",
"decoder/x86/ih264d_function_selector_ssse3.c",
],
whole_static_libs: [
"libavc_avx2",
],
},
},

@@ -8,7 +8,7 @@ function(libavc_add_compile_options)
elseif("${SYSTEM_PROCESSOR}" STREQUAL "aarch32")
add_compile_options(-march=armv7-a -mfpu=neon)
else()
add_compile_options(-msse4.2 -mno-avx)
add_compile_options(-msse4.2 -mavx2 -mfma)
endif()
add_compile_options(-Wdeclaration-after-statement)
@@ -45,8 +45,8 @@ function(libavc_add_definitions)
elseif("${SYSTEM_PROCESSOR}" STREQUAL "aarch32")
add_definitions(-DARMV7 -DDEFAULT_ARCH=D_ARCH_ARM_A9Q)
else()
add_definitions(-DX86 -DX86_LINUX=1 -DDISABLE_AVX2
-DDEFAULT_ARCH=D_ARCH_X86_SSE42)
add_definitions(-DX86 -DX86_LINUX=1
-DDEFAULT_ARCH=D_ARCH_X86_SSE42)
endif()
endfunction()

@@ -108,7 +108,13 @@ else()
"${AVC_ROOT}/common/x86/ih264_mem_fns_ssse3.c"
"${AVC_ROOT}/common/x86/ih264_padding_ssse3.c"
"${AVC_ROOT}/common/x86/ih264_resi_trans_quant_sse42.c"
"${AVC_ROOT}/common/x86/ih264_weighted_pred_sse42.c")
"${AVC_ROOT}/common/x86/ih264_weighted_pred_sse42.c"
"${AVC_ROOT}/common/x86/ih264_ihadamard_scaling_avx2.c"
"${AVC_ROOT}/common/x86/ih264_deblk_chroma_avx2.c"
"${AVC_ROOT}/common/x86/ih264_deblk_luma_avx2.c"
"${AVC_ROOT}/common/x86/ih264_iquant_itrans_recon_avx2.c"
"${AVC_ROOT}/common/x86/ih264_weighted_pred_avx2.c"
"${AVC_ROOT}/common/x86/ih264_inter_pred_filters_avx2.c")
include_directories(${AVC_ROOT}/common/x86)
endif()

@@ -159,4 +159,12 @@ ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_horz_bslt4_ssse3;
ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_vert_bslt4_mbaff_ssse3;
ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_horz_bslt4_mbaff_ssse3;
/* AVX2 */
ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_vert_bslt4_avx2;
ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_horz_bslt4_avx2;
ih264_deblk_edge_bslt4_ft ih264_deblk_luma_horz_bslt4_avx2;
ih264_deblk_edge_bslt4_ft ih264_deblk_luma_vert_bslt4_avx2;
#endif /* _IH264_DEBLK_EDGE_FILTERS_H_ */

@@ -138,4 +138,11 @@ ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_qpel_vert_hpel_ssse3;
ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_hpel_vert_qpel_ssse3;
ih264_inter_pred_chroma_ft ih264_inter_pred_chroma_ssse3;
/* AVX2 Intrinsic Declarations */
ih264_inter_pred_chroma_ft ih264_inter_pred_chroma_avx2;
ih264_inter_pred_luma_ft ih264_inter_pred_luma_copy_avx2;
ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_qpel_vert_qpel_avx2;
#endif /* _IH264_INTER_PRED_FILTERS_H_ */

@@ -231,4 +231,10 @@ ih264_ihadamard_scaling_ft ih264_ihadamard_scaling_4x4_sse42;
ih264_hadamard_quant_ft ih264_hadamard_quant_4x4_sse42;
ih264_hadamard_quant_ft ih264_hadamard_quant_2x2_uv_sse42;
/*AVX2 Declarations*/
ih264_ihadamard_scaling_ft ih264_ihadamard_scaling_4x4_avx2;
ih264_ihadamard_scaling_ft ih264_ihadamard_scaling_2x2_uv_avx2;
ih264_iquant_itrans_recon_ft ih264_iquant_itrans_recon_4x4_avx2;
#endif /* _IH264_TRANS_QUANT_ITRANS_IQUANT_H_ */

@@ -106,5 +106,10 @@ ih264_weighted_pred_ft ih264_weighted_pred_chroma_sse42;
ih264_weighted_bi_pred_ft ih264_weighted_bi_pred_luma_sse42;
ih264_weighted_bi_pred_ft ih264_weighted_bi_pred_chroma_sse42;
/* AVX2 */
ih264_weighted_bi_pred_ft ih264_weighted_bi_pred_luma_avx2;
ih264_weighted_bi_pred_ft ih264_weighted_bi_pred_chroma_avx2;
#endif /* _IH264_WEIGHTED_PRED_H_ */

@@ -0,0 +1,386 @@
/******************************************************************************
*
* Copyright (C) 2015 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*****************************************************************************
* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/*****************************************************************************/
/*****************************************************************************/
/* File Includes */
/*****************************************************************************/
/* System include files */
#include <stdio.h>
#ifdef __ANDROID__
#include "log/log.h"
#include <cutils/log.h>
#endif
/* User include files */
#include "ih264_typedefs.h"
#include "ih264_platform_macros.h"
#include "ih264_deblk_edge_filters.h"
#include "ih264_macros.h"
#include <stdint.h>
#include <string.h>
#include <immintrin.h>
/*****************************************************************************/
/* */
/* Function Name : ih264_deblk_chroma_vert_bslt4_avx2() */
/* */
/* Description : This function performs filtering of a chroma block */
/* vertical edge when the boundary strength is less than 4 */
/* in high profile. */
/* */
/* Inputs : pu1_src - pointer to the src sample q0 of U */
/* src_strd - source stride */
/* alpha_cb - alpha value for the boundary in U */
/* beta_cb - beta value for the boundary in U */
/* alpha_cr - alpha value for the boundary in V */
/* beta_cr - beta value for the boundary in V */
/* u4_bs - packed Boundary strength array */
/* pu1_cliptab_cb - tc0_table for U */
/* pu1_cliptab_cr - tc0_table for V */
/* */
/* Globals : None */
/* */
/* Processing : This operation is described in Sec. 8.7.2.3 under the */
/* title "Filtering process for edges for bS less than 4" */
/* in ITU T Rec H.264 with alpha and beta values different */
/* in U and V. */
/* */
/* Outputs : None */
/* */
/* Returns : None */
/* */
/* Issues : None */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
/* 12 02 2015 Naveen Kumar P Initial version */
/* 15 09 2020 Priyanka Bose AVX2 Intel Intrinsics Support */
/*****************************************************************************/
void ih264_deblk_chroma_vert_bslt4_avx2(UWORD8 *pu1_src,
WORD32 src_strd,
WORD32 alpha_cb,
WORD32 beta_cb,
WORD32 alpha_cr,
WORD32 beta_cr,
UWORD32 u4_bs,
const UWORD8 *pu1_cliptab_cb,
const UWORD8 *pu1_cliptab_cr)
{
UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
__m128i linea, lineb, linec, lined, linee, linef, lineg, lineh;
__m256i lineab, linecd, lineef, linegh, lineae, linebf, linecg, linedh;
__m256i temp1, temp2, temp3, temp4;
__m256i t1,t3, t2,t4,pq0_uv_32x8,pq1_uv_32x8,tmp1,tmp2,p0_uv_8x32,q0_uv_8x32;
__m256i pq0_uv_8x32, pq1_uv_8x32, p1_uv_8x32,pq0_uv_8x32_1,pq0_uv_8x32_2;
__m256i flag_bs, flag1, flag2;
__m256i diff, diff1, alpha_cbcr_32x8, beta_cbcr_32x8, in_macro;
__m256i zero = _mm256_setzero_si256();
__m256i C0_uv_8x32;
__m256i p0_uv_8x32_1, p0_uv_8x32_2, q0_uv_8x32_1, q0_uv_8x32_2,p0_uv_32x8_1,q0_uv_32x8_1;
u1_Bs0 = (u4_bs >> 24) & 0xff;
u1_Bs1 = (u4_bs >> 16) & 0xff;
u1_Bs2 = (u4_bs >> 8) & 0xff;
u1_Bs3 = (u4_bs >> 0) & 0xff;
flag_bs = _mm256_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3,u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3,
u1_Bs2, u1_Bs2, u1_Bs2, u1_Bs2, u1_Bs2, u1_Bs2,u1_Bs2, u1_Bs2,
u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0,u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
flag_bs = _mm256_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s
flag_bs = _mm256_xor_si256(flag_bs, _mm256_set1_epi8(0xFF)); //Invert for required mask
/* Load and transpose the pixel values */
lineab = _mm256_loadu2_m128i((__m128i *)(pu1_src_uv - 4 + src_strd), (__m128i *)(pu1_src_uv - 4));
linecd = _mm256_loadu2_m128i((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), (__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
lineef = _mm256_loadu2_m128i((__m128i *)(pu1_src_uv - 4 + 5 * src_strd), (__m128i *)(pu1_src_uv - 4 + 4 * src_strd));
linegh = _mm256_loadu2_m128i((__m128i *)(pu1_src_uv - 4 + 7 * src_strd), (__m128i *)(pu1_src_uv - 4 + 6 * src_strd));
temp1 = _mm256_unpacklo_epi64(lineab, zero); //a0 -- a7 000.. b0..b7 000
temp2 = _mm256_unpacklo_epi64(linecd, zero);
temp3 = _mm256_unpacklo_epi64(lineef, zero); //e0 -- e7 000.. f0..f7 000
temp4 = _mm256_unpacklo_epi64(linegh, zero);
temp1 = _mm256_unpacklo_epi16(temp1, temp2); //a0 a1 c0 c1 -- a6 a7 c6 c7 b0 b1 d0 d1.. b6 b7 d6 d7
temp2 = _mm256_unpacklo_epi16(temp3, temp4); //e0 e1 g0 g1 f0 f1 h0 h1
t2 = _mm256_permute2f128_si256(temp1, temp2, 0x20);
t3 = _mm256_permute2f128_si256(temp1, temp2, 0x31);
tmp1 = _mm256_unpacklo_epi16(t2, t3); //a0 a1 b0 b1 c0 c1 d0 d1 -a2 a3 b2 b3 .... e0 e1 f0 f1 g0 g1 h0 h1 -e2 e3..
tmp2 = _mm256_unpackhi_epi16(t2, t3); //a4 a5 b4 b5 -a6 a7 b6 b7
temp1 = _mm256_unpacklo_epi8(tmp1,zero); // a0 0 a1 0 b0 0 b1 0 c0 0 c1 0 d0 0 d1 0 - e0 0 e1 0 .. => p1
temp2 = _mm256_unpackhi_epi8(tmp1,zero); // a2 0 a3 0 => p0
temp3 = _mm256_unpacklo_epi8(tmp2,zero); //a4 0 a5 0 => q0
temp4 = _mm256_unpackhi_epi8(tmp2,zero); //a6 0 a7 0 => q1
pq1_uv_32x8 = _mm256_packus_epi16(temp1,temp4); // 0213
pq0_uv_32x8 = _mm256_packus_epi16(temp2,temp3); //0213
diff = _mm256_subs_epi16(temp2, temp3); //Condn 1 (p0 -q0) - set (3), set(3)
diff = _mm256_abs_epi16(diff);
alpha_cbcr_32x8 = _mm256_set1_epi32(alpha_cbcr);
flag1 = _mm256_cmpgt_epi16(alpha_cbcr_32x8, diff);
diff = _mm256_subs_epi16(temp4, temp3); //Condtn 2 (q1 -q0)
diff = _mm256_abs_epi16(diff);
beta_cbcr_32x8 = _mm256_set1_epi32(beta_cbcr);
flag1 = _mm256_and_si256(flag1, _mm256_cmpgt_epi16(beta_cbcr_32x8, diff));
diff = _mm256_subs_epi16(temp1, temp2); //Condtn 3 (p1 -p0)
diff = _mm256_abs_epi16(diff);
flag1 = _mm256_and_si256(flag1, _mm256_cmpgt_epi16(beta_cbcr_32x8, diff));
diff = _mm256_subs_epi16(temp3, temp2); //(q0 -p0)
diff = _mm256_slli_epi16(diff, 2);
diff1 = _mm256_subs_epi16(temp1, temp4); //(p1 -q1)
diff = _mm256_add_epi16(diff, diff1);
diff = _mm256_add_epi16(diff, _mm256_set1_epi16(4));
in_macro = _mm256_srai_epi16(diff, 3);
C0_uv_8x32 = _mm256_set_epi16(pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0],
pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0],
pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2]);
C0_uv_8x32 = _mm256_add_epi16(C0_uv_8x32, _mm256_set1_epi16(1));
in_macro = _mm256_min_epi16(C0_uv_8x32, in_macro); //CLIP3
C0_uv_8x32 = _mm256_subs_epi16(zero, C0_uv_8x32);
in_macro = _mm256_max_epi16(C0_uv_8x32, in_macro);
p0_uv_8x32_1 = _mm256_add_epi16(temp2, in_macro);
q0_uv_8x32_1 = _mm256_sub_epi16(temp3, in_macro);
flag1 = _mm256_and_si256(flag1, flag_bs);
flag1 = _mm256_packs_epi16(flag1, flag1); // 0213
pq0_uv_8x32 = _mm256_packus_epi16(p0_uv_8x32_1,q0_uv_8x32_1); //0213
pq0_uv_8x32_1 = _mm256_and_si256(pq0_uv_32x8,
_mm256_xor_si256(flag1, _mm256_set1_epi8(0xFF)));
pq0_uv_8x32_2 = _mm256_and_si256(pq0_uv_8x32, flag1);
pq0_uv_32x8 = _mm256_add_epi8(pq0_uv_8x32_1, pq0_uv_8x32_2);
t1 = _mm256_unpacklo_epi16(pq1_uv_32x8, pq0_uv_32x8); // temp1 temp3
t2 = _mm256_unpackhi_epi16(pq1_uv_32x8, pq0_uv_32x8); // temp2 temp4
t4 = _mm256_shufflelo_epi16(t2, _MM_SHUFFLE(2, 3, 0, 1)); // pshuflw
t4 = _mm256_shufflehi_epi16(t4, _MM_SHUFFLE(2, 3, 0, 1));
lineae = _mm256_unpacklo_epi32(t1, t4); // temp1 temp3
linecg = _mm256_unpackhi_epi32(t1, t4); // temp2 temp4
linea = _mm256_castsi256_si128(lineae);
lineb = _mm256_castsi256_si128(_mm256_srli_si256(lineae, 8));
lineae = _mm256_permute2f128_si256(lineae, lineae, 0x1);
linee = _mm256_castsi256_si128(lineae);
linef = _mm256_castsi256_si128(_mm256_srli_si256(lineae, 8));
linec = _mm256_castsi256_si128(linecg);
lined = _mm256_castsi256_si128(_mm256_srli_si256(linecg, 8));
linecg = _mm256_permute2f128_si256(linecg, linecg, 0x1);
lineg = _mm256_castsi256_si128(linecg);
lineh = _mm256_castsi256_si128(_mm256_srli_si256(linecg, 8));
_mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
_mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
_mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
_mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);
_mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd), linee);
_mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd), linef);
_mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd), lineg);
_mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd), lineh);
}
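For readers following the shuffles above, this is the per-sample arithmetic being vectorized: the H.264 bS < 4 chroma filter (Sec. 8.7.2.3), applied only where the packed boundary strength is non-zero (the flag_bs mask) and the alpha/beta conditions hold. A minimal scalar sketch, not part of the patch, with the clip value tc formed as tc0 + 1 exactly as C0_uv_8x32 is built above:

#include <stdint.h>
#include <stdlib.h>

/* One chroma sample pair across the edge: p1 p0 | q0 q1.
 * tc0 = pu1_cliptab_cx[bS]; alpha/beta are the plane's thresholds. */
static inline void chroma_bslt4_sample(uint8_t *p0, uint8_t *q0, int p1, int q1,
                                       int alpha, int beta, int tc0)
{
    if (abs(*p0 - *q0) >= alpha || abs(q1 - *q0) >= beta || abs(p1 - *p0) >= beta)
        return;                                      /* edge left untouched */
    int tc    = tc0 + 1;                             /* chroma uses tc0 + 1 */
    int delta = (((*q0 - *p0) << 2) + (p1 - q1) + 4) >> 3;
    if (delta >  tc) delta =  tc;                    /* CLIP3(-tc, tc, delta) */
    if (delta < -tc) delta = -tc;
    int np0 = *p0 + delta, nq0 = *q0 - delta;
    *p0 = (uint8_t)(np0 < 0 ? 0 : np0 > 255 ? 255 : np0);  /* packus above */
    *q0 = (uint8_t)(nq0 < 0 ? 0 : nq0 > 255 ? 255 : nq0);
}

The AVX2 routine evaluates this for eight U/V sample pairs per edge at once, with the U and V thresholds packed side by side in alpha_cbcr/beta_cbcr.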
/*****************************************************************************/
/* */
/* Function Name : ih264_deblk_chroma_horz_bslt4_avx2() */
/* */
/* Description : This function performs filtering of a chroma block */
/* horizontal edge when the boundary strength is less than */
/* 4 in high profile. */
/* */
/* Inputs : pu1_src - pointer to the src sample q0 of U */
/* src_strd - source stride */
/* alpha_cb - alpha value for the boundary in U */
/* beta_cb - beta value for the boundary in U */
/* alpha_cr - alpha value for the boundary in V */
/* beta_cr - beta value for the boundary in V */
/* u4_bs - packed Boundary strength array */
/* pu1_cliptab_cb - tc0_table for U */
/* pu1_cliptab_cr - tc0_table for V */
/* */
/* Globals : None */
/* */
/* Processing : This operation is described in Sec. 8.7.2.3 under the */
/* title "Filtering process for edges for bS less than 4" */
/* in ITU T Rec H.264 with alpha and beta values different */
/* in U and V. */
/* */
/* Outputs : None */
/* */
/* Returns : None */
/* */
/* Issues : None */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
/* 12 02 2015 Naveen Kumar P Initial version */
/* 12 10 2020 Priyanka Bose AVX2 Intel Intrinsics Support */
/*****************************************************************************/
void ih264_deblk_chroma_horz_bslt4_avx2 (UWORD8 *pu1_src,
WORD32 src_strd,
WORD32 alpha_cb,
WORD32 beta_cb,
WORD32 alpha_cr,
WORD32 beta_cr,
UWORD32 u4_bs,
const UWORD8 *pu1_cliptab_cb,
const UWORD8 *pu1_cliptab_cr)
{
UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
WORD16 i16_posP1, i16_posP0, i16_posQ1;
UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
UWORD8 *pu1_HorzPixelUV; /*! < Pointer to the first pixel of the boundary */
WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
__m256i p0q0_uv_32x8,p1q1_uv_32x8;
__m256i temp1,temp2,temp3,temp4;
__m256i flag_bs, flag1, flag2;
__m256i diff, diff1, alpha_cbcr_32x8, beta_cbcr_32x8, in_macro;
__m256i zero = _mm256_setzero_si256();
__m256i C0_uv_8x32;
__m256i p0q0_uv_8x32_1, p0q0_uv_8x32_2,res1,res2,p0_uv_8x32_1,q0_uv_8x32_1;
pu1_HorzPixelUV = pu1_src_uv - (src_strd << 1);
i16_posQ1 = src_strd;
i16_posP0 = src_strd;
i16_posP1 = 0;
u1_Bs0 = (u4_bs >> 24) & 0xff;
u1_Bs1 = (u4_bs >> 16) & 0xff;
u1_Bs2 = (u4_bs >> 8) & 0xff;
u1_Bs3 = (u4_bs >> 0) & 0xff;
flag_bs = _mm256_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3,
u1_Bs2, u1_Bs2, u1_Bs2, u1_Bs2,
u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3,
u1_Bs2, u1_Bs2,u1_Bs2, u1_Bs2,
u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0,
u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
flag_bs = _mm256_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s
flag_bs = _mm256_xor_si256(flag_bs, _mm256_set1_epi8(0xFF)); //Invert for required mask
p0q0_uv_32x8 = _mm256_loadu2_m128i((__m128i *)(pu1_src_uv), (__m128i *)(pu1_HorzPixelUV + i16_posP0));
p1q1_uv_32x8 = _mm256_loadu2_m128i((__m128i *)(pu1_src_uv + i16_posQ1), (__m128i *)(pu1_HorzPixelUV + i16_posP1));
res1 = _mm256_permute4x64_epi64(p0q0_uv_32x8,0xD8);
res2 = _mm256_permute4x64_epi64(p1q1_uv_32x8,0xD8);
temp3 = _mm256_unpacklo_epi8(res1, zero); //p0 l 0 h 0
temp4 = _mm256_unpackhi_epi8(res1, zero); //q0
temp1 = _mm256_unpacklo_epi8(res2, zero); //p1
temp2 = _mm256_unpackhi_epi8(res2, zero); //q1
diff = _mm256_subs_epi16(temp3, temp4); //Condn 1 //p0 l h - q0 l h
diff = _mm256_abs_epi16(diff);
alpha_cbcr_32x8 = _mm256_set1_epi32(alpha_cbcr);
flag1 = _mm256_cmpgt_epi16(alpha_cbcr_32x8, diff);
diff = _mm256_subs_epi16(temp2, temp4); //Condtn 2
diff = _mm256_abs_epi16(diff);
beta_cbcr_32x8 = _mm256_set1_epi32(beta_cbcr);
flag1 = _mm256_and_si256(flag1, _mm256_cmpgt_epi16(beta_cbcr_32x8, diff));
diff = _mm256_subs_epi16(temp1, temp3); //Condtn 3
diff = _mm256_abs_epi16(diff);
flag1 = _mm256_and_si256(flag1, _mm256_cmpgt_epi16(beta_cbcr_32x8, diff));
diff = _mm256_subs_epi16(temp4, temp3);
diff = _mm256_slli_epi16(diff, 2);
diff1 = _mm256_subs_epi16(temp1, temp2);
diff = _mm256_add_epi16(diff, diff1);
diff = _mm256_add_epi16(diff, _mm256_set1_epi16(4));
in_macro = _mm256_srai_epi16(diff, 3);
C0_uv_8x32 = _mm256_set_epi16(
pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0],
pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]);
C0_uv_8x32 = _mm256_add_epi16(C0_uv_8x32, _mm256_set1_epi16(1));
in_macro = _mm256_min_epi16(C0_uv_8x32, in_macro); //CLIP3
C0_uv_8x32 = _mm256_subs_epi16(zero, C0_uv_8x32);
in_macro = _mm256_max_epi16(C0_uv_8x32, in_macro);
p0_uv_8x32_1 = _mm256_add_epi16(temp3, in_macro);
q0_uv_8x32_1 = _mm256_sub_epi16(temp4, in_macro);
p0q0_uv_8x32_2 = _mm256_packus_epi16(p0_uv_8x32_1,q0_uv_8x32_1);
flag1 = _mm256_packs_epi16(flag1, flag1);
flag1 = _mm256_and_si256(flag1, flag_bs); //Final flag (BS condition + other 3 conditions)
p0q0_uv_8x32_1 = _mm256_and_si256(res1,
_mm256_xor_si256(flag1, _mm256_set1_epi8(0xFF)));
p0q0_uv_8x32_2 = _mm256_and_si256(p0q0_uv_8x32_2, flag1);
p0q0_uv_8x32_1 = _mm256_add_epi8(p0q0_uv_8x32_1, p0q0_uv_8x32_2);
p0q0_uv_8x32_1 = _mm256_permute4x64_epi64(p0q0_uv_8x32_1,0xD8);
_mm256_storeu2_m128i((__m128i *)(pu1_src_uv),(__m128i *)(pu1_HorzPixelUV + i16_posP0), p0q0_uv_8x32_1);
}

@@ -0,0 +1,275 @@
/******************************************************************************
*
* Copyright (C) 2015 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*****************************************************************************
* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/*****************************************************************************/
/* */
/* File Name : ih264_deblk_luma_avx2.c */
/* */
/* Description : Contains function definitions for deblocking */
/* */
/* List of Functions : ih264_deblk_luma_horz_bslt4_avx2() */
/* ih264_deblk_luma_vert_bslt4_avx2() */
/* */
/* Issues / Problems : None */
/* */
/* Revision History : */
/* */
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
/* 12 02 2015 Naveen Kumar P Added luma deblocking ssse3 */
/* intrinsics */
/* 15 09 2020 Priyanka Bose AVX2 Intel Intrinsics Support */
/*****************************************************************************/
/*****************************************************************************/
/* File Includes */
/*****************************************************************************/
/* System include files */
#include <stdio.h>
#ifdef __ANDROID__
#include "log/log.h"
#include <cutils/log.h>
#endif
/* User include files */
#include "ih264_typedefs.h"
#include "ih264_platform_macros.h"
#include "ih264_deblk_edge_filters.h"
#include "ih264_macros.h"
/*****************************************************************************/
/* */
/* Function Name : ih264_deblk_luma_horz_bslt4_avx2() */
/* */
/* Description : This function performs filtering of a luma block */
/* horizontal edge when boundary strength is less than 4. */
/* */
/* Inputs : pu1_src - pointer to the src sample q0 */
/* src_strd - source stride */
/* alpha - alpha value for the boundary */
/* beta - beta value for the boundary */
/* u4_bs - packed Boundary strength array */
/* pu1_cliptab - tc0_table */
/* */
/* Globals : None */
/* */
/* Processing : This operation is described in Sec. 8.7.2.3 under the */
/* title "Filtering process for edges for bS less than 4" */
/* in ITU T Rec H.264. */
/* */
/* Outputs : None */
/* */
/* Returns : None */
/* */
/* Issues : None */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
/* 12 02 2015 Naveen Kumar P Initial version */
/* 15 09 2020 Priyanka Bose AVX2 Intel Intrinsics Support */
/*****************************************************************************/
void ih264_deblk_luma_horz_bslt4_avx2(UWORD8 *pu1_src,
WORD32 src_strd,
WORD32 alpha,
WORD32 beta,
UWORD32 u4_bs,
const UWORD8 *pu1_cliptab)
{
WORD16 i16_posP2, i16_posP1, i16_posP0, i16_posQ1, i16_posQ2;
UWORD8 *pu1_HorzPixel;
__m256i zero = _mm256_setzero_si256();
__m128i zero_128 = _mm_setzero_si128();
__m128i Alpha_8x16,bs_flag_16x8b, C0_16x8, C0_8x16, C0_hi_8x16;
__m256i Beta_8x32,in_macro_32x8,in_macro_1,in_macro_2,flag1_32x8,flag2_32x8;
__m256i C_8x32,C0_8x32_res,temp1,temp2,temp3,temp4,res1,res2,q0p1_32x8,p0q1_32x8;
__m128i p0_16x8,q0_16x8,temp1_128,temp2_128,flag1_16x8_128;
__m256i const_val4_8x32,p0q0_32x8,p1q1_32x8,p2q2_32x8,q0p0_32x8;
UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
UWORD8 clip0, clip1, clip2, clip3;
pu1_HorzPixel = pu1_src - (src_strd << 2);
i16_posQ1 = src_strd;
i16_posQ2 = X2(src_strd);
i16_posP0 = X3(src_strd);
i16_posP1 = X2(src_strd);
i16_posP2 = src_strd;
p0q0_32x8 = _mm256_loadu2_m128i((__m128i *)(pu1_src), (__m128i *)(pu1_HorzPixel + i16_posP0)); //lower -p0 higher-q0
p1q1_32x8 = _mm256_loadu2_m128i((__m128i *)(pu1_src + i16_posQ1), (__m128i *)(pu1_HorzPixel + i16_posP1)); //l= p1, h=q1
p2q2_32x8 = _mm256_loadu2_m128i((__m128i *)(pu1_src + i16_posQ2), (__m128i *)(pu1_HorzPixel + i16_posP2));
u1_Bs0 = (u4_bs >> 24) & 0xff;
u1_Bs1 = (u4_bs >> 16) & 0xff;
u1_Bs2 = (u4_bs >> 8) & 0xff;
u1_Bs3 = (u4_bs >> 0) & 0xff;
clip0 = pu1_cliptab[u1_Bs0];
clip1 = pu1_cliptab[u1_Bs1];
clip2 = pu1_cliptab[u1_Bs2];
clip3 = pu1_cliptab[u1_Bs3];
Alpha_8x16 = _mm_set1_epi16(alpha);
Beta_8x32 = _mm256_set1_epi16(beta);
bs_flag_16x8b = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3,
u1_Bs2, u1_Bs2, u1_Bs2, u1_Bs2,
u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
C0_16x8 = _mm_set_epi8(clip3, clip3, clip3, clip3, clip2, clip2, clip2,
clip2, clip1, clip1, clip1, clip1, clip0, clip0,
clip0, clip0);
bs_flag_16x8b = _mm_cmpeq_epi8(bs_flag_16x8b, zero_128);
bs_flag_16x8b = _mm_xor_si128(bs_flag_16x8b, _mm_set1_epi8(0xFF)); //Invert for required mask
C0_8x16 = _mm_unpacklo_epi8(C0_16x8, zero_128);
C0_hi_8x16 = _mm_unpackhi_epi8(C0_16x8, zero_128);
C0_8x32_res = _mm256_set_m128i(C0_hi_8x16,C0_8x16);
//Cond1 (ABS(p0 - q0) < alpha)
p0_16x8 = _mm256_castsi256_si128(p0q0_32x8);
q0p0_32x8 = _mm256_permute2x128_si256(p0q0_32x8, p0q0_32x8, 0x1);
q0_16x8 = _mm256_castsi256_si128(p0q0_32x8);
temp1_128 = _mm_subs_epu8(q0_16x8, p0_16x8);
temp2_128 = _mm_subs_epu8(p0_16x8, q0_16x8);
temp1_128 = _mm_add_epi8(temp1_128, temp2_128);
temp2_128 = _mm_unpacklo_epi8(temp1_128, zero_128);
temp1_128 = _mm_unpackhi_epi8(temp1_128, zero_128);
temp2_128 = _mm_cmpgt_epi16(Alpha_8x16, temp2_128);
temp1_128 = _mm_cmpgt_epi16(Alpha_8x16, temp1_128);
flag1_16x8_128 = _mm_packs_epi16(temp2_128, temp1_128);
flag1_16x8_128 = _mm_and_si128(flag1_16x8_128, bs_flag_16x8b);
flag1_32x8 = _mm256_set_m128i(flag1_16x8_128,flag1_16x8_128);
//Cond2 (ABS(q1 - q0) < beta) & Cond3 (ABS(p1 - p0) < beta)
temp1 = _mm256_subs_epu8(p0q0_32x8, p1q1_32x8);
temp2 = _mm256_subs_epu8(p1q1_32x8, p0q0_32x8);
temp1 = _mm256_add_epi8(temp1, temp2);
temp2 = _mm256_unpacklo_epi8(temp1, zero);
temp1 = _mm256_unpackhi_epi8(temp1, zero);
temp2 = _mm256_cmpgt_epi16(Beta_8x32, temp2);
temp1 = _mm256_cmpgt_epi16(Beta_8x32, temp1);
flag2_32x8 = _mm256_packs_epi16(temp2, temp1);
//!((ABS(p0 - q0) < alpha) || (ABS(q1 - q0) < beta) || (ABS(p1 - p0) < beta))
flag1_32x8 = _mm256_and_si256(flag1_32x8, flag2_32x8);
//(ABS(p2 - p0) < beta) & (ABS(q2 - q0) < beta)
temp1 = _mm256_subs_epu8(p0q0_32x8, p2q2_32x8);
temp2 = _mm256_subs_epu8(p2q2_32x8, p0q0_32x8);
temp1 = _mm256_add_epi8(temp1, temp2);
temp2 = _mm256_unpacklo_epi8(temp1, zero);
temp1 = _mm256_unpackhi_epi8(temp1, zero);
temp2 = _mm256_cmpgt_epi16(Beta_8x32, temp2);
temp1 = _mm256_cmpgt_epi16(Beta_8x32, temp1);
flag2_32x8 = _mm256_packs_epi16(temp2, temp1);
flag2_32x8 = _mm256_and_si256(flag1_32x8, flag2_32x8);
temp2 = _mm256_subs_epi16(zero, temp2);
temp1 = _mm256_subs_epi16(zero, temp1);
temp3 = _mm256_permute2x128_si256(temp2,temp1,0x20); // low adding
temp4 = _mm256_permute2x128_si256(temp2,temp1,0x31); //high adding
temp2 = _mm256_add_epi16(temp3,temp4);
C_8x32 = _mm256_add_epi16(C0_8x32_res, temp2); //
const_val4_8x32 = _mm256_set1_epi16(4);
res1 = _mm256_permute4x64_epi64(q0p0_32x8, 0xD8);
res2 = _mm256_permute4x64_epi64(p1q1_32x8, 0xD8);
temp3 = _mm256_subs_epi16(_mm256_unpacklo_epi8(res1, zero),
_mm256_unpackhi_epi8(res1, zero));
temp4 = _mm256_subs_epi16(_mm256_unpacklo_epi8(res2, zero),
_mm256_unpackhi_epi8(res2, zero));
temp1 = _mm256_slli_epi16(temp3, 2);
temp1 = _mm256_add_epi16(temp1, temp4);
temp1 = _mm256_add_epi16(temp1, const_val4_8x32);
in_macro_32x8 = _mm256_srai_epi16(temp1, 3);
in_macro_32x8 = _mm256_min_epi16(C_8x32, in_macro_32x8); //CLIP3
C_8x32 = _mm256_subs_epi16(zero, C_8x32);
in_macro_32x8 = _mm256_max_epi16(C_8x32, in_macro_32x8); //CLIP3
temp3 = _mm256_unpacklo_epi8(res1, zero); //q0
temp4 = _mm256_unpackhi_epi8(res1, zero); //p0
temp1 = _mm256_add_epi16(temp4, in_macro_32x8);
temp2 = _mm256_sub_epi16(temp3, in_macro_32x8);
temp1 = _mm256_packus_epi16(temp2, temp1); // Suffle needed
temp1 = _mm256_and_si256(temp1, flag1_32x8); //q0 p0
temp2 = _mm256_and_si256(res1,
_mm256_xor_si256(flag1_32x8, _mm256_set1_epi16(0xFFFF)));
temp1 = _mm256_add_epi8(temp1, temp2);
temp1 = _mm256_permute4x64_epi64(temp1, 0xD8);
_mm256_storeu2_m128i((__m128i *)(pu1_HorzPixel + i16_posP0),(__m128i *)(pu1_src),temp1);
//if(Ap < Beta) if(Aq < Beta)
temp1 = _mm256_avg_epu16(_mm256_unpacklo_epi8(res1, zero),
_mm256_unpackhi_epi8(res1, zero));
temp2 = _mm256_slli_epi16(_mm256_unpacklo_epi8(p1q1_32x8, zero), 1);
temp3 = _mm256_subs_epi16(_mm256_unpacklo_epi8(p2q2_32x8, zero), temp2);
temp2 = _mm256_slli_epi16(_mm256_unpackhi_epi8(p1q1_32x8, zero), 1);
temp2 = _mm256_subs_epi16(_mm256_unpackhi_epi8(p2q2_32x8, zero), temp2);
temp4 = _mm256_permute2x128_si256(temp3, temp2, 0x20); //p0 q0
temp3 = _mm256_permute2x128_si256(temp3, temp2, 0x31);
temp4 = _mm256_add_epi16(temp1, temp4); //p
in_macro_1 = _mm256_srai_epi16(temp4, 1);
temp3 = _mm256_add_epi16(temp1, temp3); //q
in_macro_2 = _mm256_srai_epi16(temp3, 1);
in_macro_1 = _mm256_min_epi16(C0_8x32_res, in_macro_1); //CLIP3
C0_8x32_res = _mm256_subs_epi16(zero, C0_8x32_res);
in_macro_1 = _mm256_max_epi16(C0_8x32_res, in_macro_1); //CLIP3
in_macro_2 = _mm256_max_epi16(C0_8x32_res, in_macro_2); //CLIP3
C0_8x32_res = _mm256_subs_epi16(zero, C0_8x32_res);
in_macro_2 = _mm256_min_epi16(C0_8x32_res, in_macro_2); //CLIP3
temp1 = _mm256_unpacklo_epi8(res2, zero);
temp2 = _mm256_unpackhi_epi8(res2, zero);
temp1 = _mm256_add_epi16(temp1, in_macro_1);
temp2 = _mm256_add_epi16(temp2, in_macro_2);
temp1 = _mm256_packus_epi16(temp1, temp2); // pl ph ql qh
temp1 = _mm256_and_si256(temp1, flag2_32x8);
temp2 = _mm256_and_si256(res2,_mm256_xor_si256(flag2_32x8, _mm256_set1_epi16(0xFFFF)));
temp1 = _mm256_add_epi8(temp1, temp2);
temp1 = _mm256_permute4x64_epi64(temp1, 0xD8);
_mm256_storeu2_m128i((__m128i *)(pu1_src + i16_posQ1),(__m128i *)(pu1_HorzPixel + i16_posP1),temp1);
}
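As with the chroma routine earlier, a scalar sketch (not part of the patch) of the bS < 4 luma filter this function vectorizes. The differences from chroma are visible in the intrinsics: tc grows by one for each of Ap < beta and Aq < beta (the flags folded into C_8x32), and p1/q1 get their own clipped correction when their side passes the test:

#include <stdint.h>
#include <stdlib.h>

#define CLIP_255(x) ((x) < 0 ? 0 : (x) > 255 ? 255 : (x))

/* px[0..2] = p0 p1 p2 moving away from the edge, qx[0..2] = q0 q1 q2. */
static inline void luma_bslt4_sample(uint8_t px[3], uint8_t qx[3],
                                     int alpha, int beta, int tc0)
{
    int p2 = px[2], p1 = px[1], p0 = px[0];
    int q0 = qx[0], q1 = qx[1], q2 = qx[2];
    if (abs(p0 - q0) >= alpha || abs(q1 - q0) >= beta || abs(p1 - p0) >= beta)
        return;
    int ap = abs(p2 - p0), aq = abs(q2 - q0);
    int tc = tc0 + (ap < beta) + (aq < beta);        /* C_8x32 above */
    int d  = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3;
    d = d >  tc ?  tc : d < -tc ? -tc : d;
    px[0] = (uint8_t)CLIP_255(p0 + d);
    qx[0] = (uint8_t)CLIP_255(q0 - d);
    if (ap < beta) {                                 /* refine p1 */
        int d1 = (p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1;
        d1 = d1 > tc0 ? tc0 : d1 < -tc0 ? -tc0 : d1;
        px[1] = (uint8_t)(p1 + d1);
    }
    if (aq < beta) {                                 /* refine q1 */
        int d1 = (q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1;
        d1 = d1 > tc0 ? tc0 : d1 < -tc0 ? -tc0 : d1;
        qx[1] = (uint8_t)(q1 + d1);
    }
}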

@@ -0,0 +1,184 @@
/******************************************************************************
*
* Copyright (C) 2015 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*****************************************************************************
* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/**
*******************************************************************************
* @file
* ih264_ihadamard_scaling_avx2.c
*
* @brief
* Contains definition of functions for h264 inverse hadamard 4x4 transform and scaling
*
* @author
* Priyanka
*
* @par List of Functions:
* - ih264_ihadamard_scaling_4x4_avx2()
*
* @remarks
*
*******************************************************************************
*/
/*****************************************************************************/
/* File Includes */
/*****************************************************************************/
/* User include files */
#include "ih264_typedefs.h"
#include "ih264_defs.h"
#include "ih264_trans_macros.h"
#include "ih264_macros.h"
#include "ih264_trans_data.h"
#include "ih264_size_defs.h"
#include "ih264_structs.h"
#include "ih264_trans_quant_itrans_iquant.h"
#include <immintrin.h>
/*
********************************************************************************
*
* @brief This function performs a 4x4 inverse hadamard transform on the 4x4 DC coefficients
* of a 16x16 intra prediction macroblock, and then performs scaling.
* prediction buffer
*
* @par Description:
* The DC coefficients pass through a 2-stage inverse hadamard transform.
* This inverse transformed content is scaled to based on Qp value.
*
* @param[in] pi2_src
* input 4x4 block of DC coefficients
*
* @param[out] pi2_out
* output 4x4 block
*
* @param[in] pu2_iscal_mat
* pointer to scaling list
*
* @param[in] pu2_weigh_mat
* pointer to weight matrix
*
* @param[in] u4_qp_div_6
* Floor (qp/6)
*
* @param[in] pi4_tmp
* temporary buffer of size 1*16
*
* @returns none
*
* @remarks none
*
*******************************************************************************
*/
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#ifdef __ANDROID__
#include "log/log.h"
#include <cutils/log.h>
#endif
void ih264_ihadamard_scaling_4x4_avx2(WORD16* pi2_src,
WORD16* pi2_out,
const UWORD16 *pu2_iscal_mat,
const UWORD16 *pu2_weigh_mat,
UWORD32 u4_qp_div_6,
WORD32* pi4_tmp)
{
__m256i src,r0_r1,r2_r3,r3_r2,r1_r3,r0_r2;
__m256i src_r0_r1, src_r2_r3;
__m256i temp0, temp1,tmp0, tmp1, tmp2, tmp3;
__m256i add_rshift = _mm256_set1_epi32((u4_qp_div_6 < 6) ? (1 << (5 - u4_qp_div_6)) : 0);
__m256i mult_val = _mm256_set1_epi32(pu2_iscal_mat[0] * pu2_weigh_mat[0]);
__m256i zero = _mm256_setzero_si256();
__m128i t0 ,t1;
UNUSED (pi4_tmp);
src_r0_r1 = _mm256_loadu_si256((__m256i *) (pi2_src)); //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row
temp0 = _mm256_unpacklo_epi64(src_r0_r1, zero);
temp1 = _mm256_unpackhi_epi64(src_r0_r1, zero); // b0 b1 b2.. d0 d1...
temp0 = _mm256_unpacklo_epi16(temp0, temp1);
tmp0 = _mm256_permute2x128_si256(temp0,zero,0x20); //tmp0 tmp3
tmp1 = _mm256_permute2x128_si256(temp0,zero,0x31); //tmp1 tmp2
temp0 = _mm256_unpacklo_epi32(tmp0, tmp1); //a0 c0 a1 c1 a2 c2 a3 c3 a0 c0 a1 c1 b0 d0 b1 c1
temp1 = _mm256_unpackhi_epi32(tmp0, tmp1);
temp1 = _mm256_shuffle_epi32(temp1,0b01001110);
tmp0 = _mm256_add_epi16(temp0, temp1);
tmp1 = _mm256_sub_epi16(temp0, temp1);
temp0 = _mm256_unpacklo_epi64(tmp0, tmp1);
temp1 = _mm256_unpackhi_epi64(tmp0, tmp1);
tmp0 = _mm256_add_epi16(temp0, temp1);
tmp1 = _mm256_sub_epi16(temp0, temp1);
temp0 = _mm256_unpacklo_epi32(tmp0, tmp1); //a0 c0 a1 c1 a2 c2 a3 c3 a0 c0 a1 c1 b0 d0 b1 c1
temp1 = _mm256_unpackhi_epi32(tmp0, tmp1);
temp0 = _mm256_unpacklo_epi64(tmp0, tmp1); //a0 c0 a1 c1 a2 c2 a3 c3 a0 c0 a1 c1 b0 d0 b1 c1
temp1 = _mm256_unpackhi_epi64(tmp0, tmp1);
tmp0 = _mm256_unpacklo_epi16(temp0, temp1); //a0 c0 a1 c1 a2 c2 a3 c3 a0 c0 a1 c1 b0 d0 b1 c1
tmp1 = _mm256_unpackhi_epi16(temp0, temp1);
temp0 = _mm256_unpacklo_epi32(tmp0, tmp1); //a0 c0 a1 c1 a2 c2 a3 c3 a0 c0 a1 c1 b0 d0 b1 c1
temp1 = _mm256_unpackhi_epi32(tmp0, tmp1);
temp1 = _mm256_shuffle_epi32(temp1, _MM_SHUFFLE(1, 0, 3, 2));
tmp0 = _mm256_add_epi16(temp0, temp1);
tmp1 = _mm256_sub_epi16(temp0, temp1);
temp0 = _mm256_unpacklo_epi64(tmp0, tmp1);
temp1 = _mm256_unpackhi_epi64(tmp0, tmp1);
tmp0 = _mm256_add_epi16(temp0, temp1);
tmp1 = _mm256_sub_epi16(temp0, temp1);
temp0 = _mm256_unpacklo_epi64(tmp0, tmp1);
temp1 = _mm256_unpackhi_epi64(tmp0, tmp1);
r0_r1 =_mm256_cvtepi16_epi32(_mm256_castsi256_si128(temp0));
r2_r3 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(temp1));
src_r0_r1 = _mm256_mullo_epi32(r0_r1, mult_val);
src_r2_r3 = _mm256_mullo_epi32(r2_r3, mult_val);
//Scaling
if(u4_qp_div_6 >= 6)
{
src_r0_r1 = _mm256_slli_epi32(src_r0_r1, u4_qp_div_6 - 6);
src_r2_r3 = _mm256_slli_epi32(src_r2_r3, u4_qp_div_6 - 6);
}
else
{
temp0 = _mm256_add_epi32(src_r0_r1, add_rshift);
temp1 = _mm256_add_epi32(src_r2_r3, add_rshift);
src_r0_r1 = _mm256_srai_epi32(temp0, 6 - u4_qp_div_6);
src_r2_r3 = _mm256_srai_epi32(temp1, 6 - u4_qp_div_6);
}
src = _mm256_packs_epi32(src_r0_r1, src_r2_r3);
_mm256_storeu_si256((__m256i *) (&pi2_out[0]), src);
}
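The shuffle sequence above is hard to read on its own; what it computes is the standard two-pass inverse Hadamard (rows, then columns, all +/-1 butterflies) followed by the single DC scale factor pu2_iscal_mat[0] * pu2_weigh_mat[0] and the qp-dependent shift visible at the end of the function. A plain scalar sketch, not part of the patch (the vector version additionally saturates to 16 bits via _mm256_packs_epi32):

#include <stdint.h>

static void ihadamard_scaling_4x4_ref(const int16_t *src, int16_t *out,
                                      int32_t scale, uint32_t qp_div_6)
{
    int32_t t[16], u[16];
    for (int i = 0; i < 4; i++)          /* horizontal butterflies */
    {
        int32_t x0 = src[4 * i + 0] + src[4 * i + 2];
        int32_t x1 = src[4 * i + 0] - src[4 * i + 2];
        int32_t x2 = src[4 * i + 1] - src[4 * i + 3];
        int32_t x3 = src[4 * i + 1] + src[4 * i + 3];
        t[4 * i + 0] = x0 + x3;  t[4 * i + 1] = x1 + x2;
        t[4 * i + 2] = x1 - x2;  t[4 * i + 3] = x0 - x3;
    }
    for (int i = 0; i < 4; i++)          /* vertical butterflies */
    {
        int32_t x0 = t[i + 0] + t[i + 8];
        int32_t x1 = t[i + 0] - t[i + 8];
        int32_t x2 = t[i + 4] - t[i + 12];
        int32_t x3 = t[i + 4] + t[i + 12];
        u[i + 0]  = x0 + x3;  u[i + 4]  = x1 + x2;
        u[i + 8]  = x1 - x2;  u[i + 12] = x0 - x3;
    }
    for (int i = 0; i < 16; i++)         /* scaling, as in the tail above */
    {
        int32_t v = u[i] * scale;        /* scale = iscal_mat[0] * weigh_mat[0] */
        out[i] = (int16_t)((qp_div_6 >= 6)
                     ? (v << (qp_div_6 - 6))
                     : ((v + (1 << (5 - qp_div_6))) >> (6 - qp_div_6)));
    }
}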

@@ -0,0 +1,959 @@
/******************************************************************************
*
* Copyright (C) 2015 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*****************************************************************************
* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/*****************************************************************************/
/*****************************************************************************/
/*****************************************************************************/
/* File Includes */
/*****************************************************************************/
#ifdef __ANDROID__
#include "log/log.h"
#include <cutils/log.h>
#endif
#include <immintrin.h>
#include "ih264_typedefs.h"
#include "ih264_macros.h"
#include "ih264_platform_macros.h"
#include "ih264_inter_pred_filters.h"
/*****************************************************************************/
/* Constant Data variables */
/*****************************************************************************/
/* coefficients for 6 tap filtering*/
//const WORD32 ih264_g_six_tap[3] ={1,-5,20};
/*****************************************************************************/
/* Function definitions . */
/*****************************************************************************/
/*****************************************************************************/
/* */
/* Function Name : ih264_inter_pred_luma_copy_avx2 */
/* */
/* Description : This function copies the contents of ht x wd block from */
/* source to destination. (ht,wd) can be (4,4), (8,4), */
/* (4,8), (8,8), (16,8), (8,16) or (16,16). */
/* */
/* Inputs : puc_src - pointer to source */
/* puc_dst - pointer to destination */
/* src_strd - stride for source */
/* dst_strd - stride for destination */
/* ht - height of the block */
/* wd - width of the block */
/* */
/* Issues : None */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes */
/* 13 02 2015 Kaushik Initial Version */
/* Senthoor */
/* 15 09 2020 Priyanka Bose AVX2 Intel Intrinsics Support */
/*****************************************************************************/
void ih264_inter_pred_luma_copy_avx2(UWORD8 *pu1_src,
UWORD8 *pu1_dst,
WORD32 src_strd,
WORD32 dst_strd,
WORD32 ht,
WORD32 wd,
UWORD8* pu1_tmp,
WORD32 dydx)
{
WORD32 src_strd2, src_strd3, src_strd4, dst_strd2, dst_strd3, dst_strd4;
UNUSED(pu1_tmp);
UNUSED(dydx);
src_strd2 = src_strd << 1;
dst_strd2 = dst_strd << 1;
src_strd4 = src_strd << 2;
dst_strd4 = dst_strd << 2;
src_strd3 = src_strd2 + src_strd;
dst_strd3 = dst_strd2 + dst_strd;
if(wd == 4)
{
do
{
*((WORD32 *)(pu1_dst)) = *((WORD32 *)(pu1_src));
*((WORD32 *)(pu1_dst + dst_strd)) = *((WORD32 *)(pu1_src + src_strd));
*((WORD32 *)(pu1_dst + dst_strd2)) = *((WORD32 *)(pu1_src + src_strd2));
*((WORD32 *)(pu1_dst + dst_strd3)) = *((WORD32 *)(pu1_src + src_strd3));
ht -= 4;
pu1_src += src_strd4;
pu1_dst += dst_strd4;
}
while(ht > 0);
}
else if(wd == 8)
{
__m128i y_0_16x8b, y_1_16x8b, y_2_16x8b, y_3_16x8b;
do
{
y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
y_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd2));
y_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd3));
_mm_storel_epi64((__m128i *)pu1_dst, y_0_16x8b);
_mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);
_mm_storel_epi64((__m128i *)(pu1_dst + dst_strd2), y_2_16x8b);
_mm_storel_epi64((__m128i *)(pu1_dst + dst_strd3), y_3_16x8b);
ht -= 4;
pu1_src += src_strd4;
pu1_dst += dst_strd4;
}
while(ht > 0);
}
else // wd == 16
{
__m256i y_0_16x8b, y_1_16x8b, y_2_16x8b, y_3_16x8b;
WORD32 src_strd5, src_strd6, src_strd7, src_strd8;
WORD32 dst_strd5, dst_strd6, dst_strd7, dst_strd8;
__m256i y_4_16x8b, y_5_16x8b, y_6_16x8b, y_7_16x8b,y_0_1,y_2_3,y_4_5,y_6_7;
src_strd5 = src_strd2 + src_strd3;
dst_strd5 = dst_strd2 + dst_strd3;
src_strd6 = src_strd3 << 1;
dst_strd6 = dst_strd3 << 1;
src_strd7 = src_strd3 + src_strd4;
dst_strd7 = dst_strd3 + dst_strd4;
src_strd8 = src_strd << 3;
dst_strd8 = dst_strd << 3;
do
{
y_0_1 = _mm256_loadu2_m128i((__m128i *)(pu1_src + src_strd),(__m128i *)pu1_src);
y_2_3 = _mm256_loadu2_m128i((__m128i *)(pu1_src + src_strd3),(__m128i *)(pu1_src + src_strd2));
y_4_5 = _mm256_loadu2_m128i((__m128i *)(pu1_src + src_strd5),(__m128i *)(pu1_src + src_strd4));
y_6_7 = _mm256_loadu2_m128i((__m128i *)(pu1_src + src_strd7),(__m128i *)(pu1_src + src_strd6));
_mm256_storeu2_m128i((__m128i *)(pu1_dst + dst_strd),(__m128i *)pu1_dst,y_0_1);
_mm256_storeu2_m128i((__m128i *)(pu1_dst + dst_strd3),(__m128i *)(pu1_dst + dst_strd2),y_2_3);
_mm256_storeu2_m128i((__m128i *)(pu1_dst + dst_strd5),(__m128i *)(pu1_dst + dst_strd4),y_4_5);
_mm256_storeu2_m128i((__m128i *)(pu1_dst + dst_strd7),(__m128i *)(pu1_dst + dst_strd6),y_6_7);
ht -= 8;
pu1_src += src_strd8;
pu1_dst += dst_strd8;
}
while(ht > 0);
}
}
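The width-16 branch above shows the pattern this whole file leans on: two independent 16-byte rows are packed into one 256-bit register with _mm256_loadu2_m128i and written back with _mm256_storeu2_m128i, so each load/store pair moves a pair of rows. A stand-alone sketch of just that idiom, not part of the patch (assumes ht is a multiple of 2; the real routine unrolls by 8 rows):

#include <immintrin.h>
#include <stdint.h>

static void copy_16xN_two_rows_at_a_time(const uint8_t *src, uint8_t *dst,
                                         int32_t src_strd, int32_t dst_strd,
                                         int32_t ht)
{
    for (int32_t y = 0; y < ht; y += 2)
    {
        /* low 128 bits = row y, high 128 bits = row y + 1 */
        __m256i rows = _mm256_loadu2_m128i((const __m128i *)(src + src_strd),
                                           (const __m128i *)src);
        _mm256_storeu2_m128i((__m128i *)(dst + dst_strd), (__m128i *)dst, rows);
        src += 2 * src_strd;
        dst += 2 * dst_strd;
    }
}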
/*****************************************************************************/
/* */
/* Function Name : ih264_inter_pred_chroma_avx2 */
/* */
/* Description : This function implements a four-tap 2D filter as */
/* mentioned in sec. 8.4.2.2.2 titled "Chroma sample */
/* "interpolation process". (ht,wd) can be (2,2), (4,2), */
/* (2,4), (4,4), (8,4), (4,8) or (8,8). */
/* */
/* Inputs : puc_src - pointer to source */
/* puc_dst - pointer to destination */
/* src_strd - stride for source */
/* dst_strd - stride for destination */
/* dx - x position of destination value */
/* dy - y position of destination value */
/* ht - height of the block */
/* wd - width of the block */
/* */
/* Issues : None */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes */
/* 13 02 2015 Kaushik Initial Version */
/* Senthoor */
/* 15 09 2020 Priyanka Bose AVX2 Intel Intrinsics Support */
/*****************************************************************************/
void ih264_inter_pred_chroma_avx2(UWORD8 *pu1_src,
UWORD8 *pu1_dst,
WORD32 src_strd,
WORD32 dst_strd,
WORD32 dx,
WORD32 dy,
WORD32 ht,
WORD32 wd)
{
WORD32 i, j, A, B, C, D;
i = 8 - dx;
j = 8 - dy;
A = i * j;
B = dx * j;
C = i * dy;
D = dx * dy;
if(wd == 2)
{
WORD32 tmp1, tmp2, tmp3, tmp4;
do
{
//U
tmp1 = A * pu1_src[0] + B * pu1_src[2] + C * pu1_src[src_strd] + D * pu1_src[src_strd + 2];
tmp2 = A * pu1_src[2] + B * pu1_src[4] + C * pu1_src[src_strd + 2] + D * pu1_src[src_strd + 4];
//V
tmp3 = A * pu1_src[1] + B * pu1_src[3] + C * pu1_src[src_strd + 1] + D * pu1_src[src_strd + 3];
tmp4 = A * pu1_src[3] + B * pu1_src[5] + C * pu1_src[src_strd + 3] + D * pu1_src[src_strd + 5];
tmp1 = (tmp1 + 32) >> 6;
tmp2 = (tmp2 + 32) >> 6;
tmp3 = (tmp3 + 32) >> 6;
tmp4 = (tmp4 + 32) >> 6;
pu1_dst[0] = CLIP_U8(tmp1);
pu1_dst[2] = CLIP_U8(tmp2);
pu1_dst[1] = CLIP_U8(tmp3);
pu1_dst[3] = CLIP_U8(tmp4);
pu1_src += src_strd;
pu1_dst += dst_strd;
tmp1 = A * pu1_src[0] + B * pu1_src[2] + C * pu1_src[src_strd] + D * pu1_src[src_strd + 2];
tmp2 = A * pu1_src[2] + B * pu1_src[4] + C * pu1_src[src_strd + 2] + D * pu1_src[src_strd + 4];
tmp3 = A * pu1_src[1] + B * pu1_src[3] + C * pu1_src[src_strd + 1] + D * pu1_src[src_strd + 3];
tmp4 = A * pu1_src[3] + B * pu1_src[5] + C * pu1_src[src_strd + 3] + D * pu1_src[src_strd + 5];
tmp1 = (tmp1 + 32) >> 6;
tmp2 = (tmp2 + 32) >> 6;
tmp3 = (tmp3 + 32) >> 6;
tmp4 = (tmp4 + 32) >> 6;
pu1_dst[0] = CLIP_U8(tmp1);
pu1_dst[2] = CLIP_U8(tmp2);
pu1_dst[1] = CLIP_U8(tmp3);
pu1_dst[3] = CLIP_U8(tmp4);
ht -= 2;
pu1_src += src_strd;
pu1_dst += dst_strd;
}
while(ht > 0);
}
else if(wd == 4)
{
WORD32 AB, CD;
__m256i coeffAB_32x8b, coeffCD_32x8b, round_add32_8x32b;
__m256i const_shuff_32x8b;
__m256i src_r23_32x8b,src_r12_32x8b,res12_AB_8x32b,res12_CD_8x32b,res1_8x32b;
__m128i res1,src_r1_16x8b_128;
__m128i const_shuff_16x8b_128;
AB = (B << 8) + A;
CD = (D << 8) + C;
coeffAB_32x8b = _mm256_set1_epi16(AB);
coeffCD_32x8b = _mm256_set1_epi16(CD);
round_add32_8x32b = _mm256_set1_epi16(32);
const_shuff_16x8b_128 = _mm_setr_epi32(0x03010200, 0x05030402, 0x07050604, 0x09070806);
const_shuff_32x8b = _mm256_setr_epi32(0x03010200, 0x05030402, 0x07050604, 0x09070806,0x03010200, 0x05030402, 0x07050604, 0x09070806);
src_r1_16x8b_128 = _mm_loadu_si128((__m128i *)pu1_src);
src_r1_16x8b_128 = _mm_shuffle_epi8(src_r1_16x8b_128, const_shuff_16x8b_128);
pu1_src += src_strd;
do
{
src_r23_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src + src_strd),(__m128i *)pu1_src);
src_r23_32x8b = _mm256_shuffle_epi8(src_r23_32x8b, const_shuff_32x8b);
src_r12_32x8b = _mm256_set_m128i(_mm256_castsi256_si128(src_r23_32x8b),src_r1_16x8b_128);
res12_AB_8x32b = _mm256_maddubs_epi16(src_r12_32x8b, coeffAB_32x8b);
res12_CD_8x32b = _mm256_maddubs_epi16(src_r23_32x8b, coeffCD_32x8b);
res1_8x32b = _mm256_add_epi16(res12_AB_8x32b, res12_CD_8x32b);
res1_8x32b = _mm256_add_epi16(res1_8x32b, round_add32_8x32b);
res1_8x32b = _mm256_srai_epi16(res1_8x32b, 6);
res1_8x32b = _mm256_packus_epi16(res1_8x32b,res1_8x32b);
res1_8x32b = _mm256_permute4x64_epi64(res1_8x32b, 0xD8);
res1 = _mm256_castsi256_si128(res1_8x32b);
_mm_storel_epi64((__m128i *)pu1_dst, res1);
res1 = _mm_srli_si128(res1, 8);
_mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res1);
src_r1_16x8b_128 = _mm256_castsi256_si128(_mm256_permute2x128_si256(src_r23_32x8b,src_r23_32x8b,0x1));;
ht -= 2;
pu1_src += src_strd << 1;
pu1_dst += dst_strd << 1;
}
while(ht > 0);
}
else // wd == 8
{
WORD32 AB, CD;
__m256i src_r1lh_32x8b, src_r2lh_32x8b;
__m256i res_lh_AB_8x32b, res_lh_CD_8x32b,res_1lh_8x32b,res_2lh_8x32b;
__m256i res_lh_8x32b, res_l_8x32b,res_h_8x32b, res_32x8b;
__m256i coeffAB_32x8b, coeffCD_32x8b, round_add32_8x32b;
__m256i const_shuff_32x8b;
__m256i zero = _mm256_setzero_si256();
__m128i res_1,res_2;
AB = (B << 8) + A;
CD = (D << 8) + C;
coeffAB_32x8b = _mm256_set1_epi16(AB);
coeffCD_32x8b = _mm256_set1_epi16(CD);
round_add32_8x32b = _mm256_set1_epi16(32);
const_shuff_32x8b = _mm256_setr_epi32(0x03010200, 0x05030402, 0x07050604, 0x09070806,0x03010200, 0x05030402, 0x07050604, 0x09070806);
src_r1lh_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src + 8),(__m128i *)pu1_src);
src_r1lh_32x8b = _mm256_shuffle_epi8(src_r1lh_32x8b, const_shuff_32x8b);
pu1_src += src_strd;
do
{
//row 1
src_r2lh_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src + 8),(__m128i *)pu1_src);
src_r2lh_32x8b = _mm256_shuffle_epi8(src_r2lh_32x8b, const_shuff_32x8b);
res_lh_AB_8x32b = _mm256_maddubs_epi16(src_r1lh_32x8b, coeffAB_32x8b);
res_lh_CD_8x32b = _mm256_maddubs_epi16(src_r2lh_32x8b, coeffCD_32x8b);
res_lh_8x32b = _mm256_add_epi16(res_lh_AB_8x32b, round_add32_8x32b);
res_lh_8x32b = _mm256_add_epi16(res_lh_8x32b, res_lh_CD_8x32b);
res_1lh_8x32b = _mm256_srai_epi16(res_lh_8x32b, 6);
pu1_src += src_strd;
//row 2
src_r1lh_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src + 8),(__m128i *)pu1_src);
src_r1lh_32x8b = _mm256_shuffle_epi8(src_r1lh_32x8b, const_shuff_32x8b);
res_lh_AB_8x32b = _mm256_maddubs_epi16(src_r2lh_32x8b, coeffAB_32x8b);
res_lh_CD_8x32b = _mm256_maddubs_epi16(src_r1lh_32x8b, coeffCD_32x8b);
res_lh_8x32b = _mm256_add_epi16(res_lh_AB_8x32b, round_add32_8x32b);
res_lh_8x32b = _mm256_add_epi16(res_lh_8x32b, res_lh_CD_8x32b);
res_2lh_8x32b = _mm256_srai_epi16(res_lh_8x32b, 6);
res_1lh_8x32b = _mm256_packus_epi16(res_1lh_8x32b, res_2lh_8x32b);
res_1lh_8x32b = _mm256_permute4x64_epi64(res_1lh_8x32b, 0xD8);
_mm256_storeu2_m128i((__m128i *)(pu1_dst + dst_strd),(__m128i *)(pu1_dst),res_1lh_8x32b);
pu1_src += src_strd;
pu1_dst += dst_strd;
pu1_dst += dst_strd;
//row 3
src_r2lh_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src + 8),(__m128i *)pu1_src);
src_r2lh_32x8b = _mm256_shuffle_epi8(src_r2lh_32x8b, const_shuff_32x8b);
res_lh_AB_8x32b = _mm256_maddubs_epi16(src_r1lh_32x8b, coeffAB_32x8b);
res_lh_CD_8x32b = _mm256_maddubs_epi16(src_r2lh_32x8b, coeffCD_32x8b);
res_lh_8x32b = _mm256_add_epi16(res_lh_AB_8x32b, round_add32_8x32b);
res_lh_8x32b = _mm256_add_epi16(res_lh_8x32b, res_lh_CD_8x32b);
res_1lh_8x32b = _mm256_srai_epi16(res_lh_8x32b, 6);
pu1_src += src_strd;
//row 1
src_r1lh_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src + 8),(__m128i *)pu1_src);
src_r1lh_32x8b = _mm256_shuffle_epi8(src_r1lh_32x8b, const_shuff_32x8b);
res_lh_AB_8x32b = _mm256_maddubs_epi16(src_r2lh_32x8b, coeffAB_32x8b);
res_lh_CD_8x32b = _mm256_maddubs_epi16(src_r1lh_32x8b, coeffCD_32x8b);
res_lh_8x32b = _mm256_add_epi16(res_lh_AB_8x32b, round_add32_8x32b);
res_lh_8x32b = _mm256_add_epi16(res_lh_8x32b, res_lh_CD_8x32b);
res_2lh_8x32b = _mm256_srai_epi16(res_lh_8x32b, 6);
res_1lh_8x32b = _mm256_packus_epi16(res_1lh_8x32b, res_2lh_8x32b);
res_1lh_8x32b = _mm256_permute4x64_epi64(res_1lh_8x32b, 0xD8);
_mm256_storeu2_m128i((__m128i *)(pu1_dst + dst_strd),(__m128i *)(pu1_dst),res_1lh_8x32b);
ht -= 4;
pu1_src += src_strd;
pu1_dst += dst_strd;
pu1_dst += dst_strd;
}
while(ht > 0);
}
}
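To make the weighting above concrete: for dx = 3, dy = 2 the constants are A = (8-3)*(8-2) = 30, B = 3*6 = 18, C = 5*2 = 10 and D = 3*2 = 6. They always sum to 64, so (A*a + B*b + C*c + D*d + 32) >> 6 is a unity-gain bilinear blend of the four neighbouring chroma samples, with the +32 giving round-to-nearest. Packing AB = (B << 8) + A and CD = (D << 8) + C lets _mm256_maddubs_epi16 evaluate two taps per multiply once the shuffle masks have interleaved the source bytes.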
/*****************************************************************************/
/* */
/* Function Name : ih264_inter_pred_luma_horz_qpel_vert_qpel_avx2 */
/* */
/* Description : This function implements a six-tap filter vertically and */
/* horizontally on ht x wd block separately and averages */
/* the two sets of values to calculate values at (1/4,1/4), */
/* (1/4, 3/4), (3/4, 1/4) or (3/4, 3/4) as mentioned in */
/* sec. 8.4.2.2.1 titled "Luma sample interpolation */
/* process". (ht,wd) can be (4,4), (8,4), (4,8), (8,8), */
/* (16,8), (8,16) or (16,16). */
/* */
/* Inputs : puc_src - pointer to source */
/* puc_dst - pointer to destination */
/* src_strd - stride for source */
/* dst_strd - stride for destination */
/* ht - height of the block */
/* wd - width of the block */
/* pu1_tmp - pointer to temporary buffer */
/* dydx - x and y reference offset for q-pel */
/* calculations */
/* */
/* Issues : None */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes */
/* 13 02 2015 Kaushik Initial Version */
/* Senthoor */
/* 15 09 2020 Priyanka Bose AVX2 Intel Intrinsics Support */
/*****************************************************************************/
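Per output sample, the routine below computes one six-tap half-pel value vertically (stored to pu1_tmp), one six-tap half-pel value horizontally, and then the rounded average of the two for the quarter-pel position; the averaging is the _mm_avg_epu8 near the end of each width branch. A scalar sketch of those two building blocks, not part of the patch:

#include <stdint.h>

/* Six-tap H.264 half-pel filter, coefficients (1, -5, 20, 20, -5, 1);
 * x points at the full-pel sample, with 2 samples of context before it
 * and 3 after it available. */
static inline uint8_t sixtap_hpel(const uint8_t *x)
{
    int v = x[-2] - 5 * x[-1] + 20 * x[0] + 20 * x[1] - 5 * x[2] + x[3];
    v = (v + 16) >> 5;                               /* round, then normalize */
    return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v); /* packus above */
}

/* Quarter-pel sample: rounded average of the horizontal and vertical
 * half-pel results (what _mm_avg_epu8 computes lane-wise). */
static inline uint8_t qpel_sample(uint8_t horz_hpel, uint8_t vert_hpel)
{
    return (uint8_t)((horz_hpel + vert_hpel + 1) >> 1);
}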
void ih264_inter_pred_luma_horz_qpel_vert_qpel_avx2(UWORD8 *pu1_src,
UWORD8 *pu1_dst,
WORD32 src_strd,
WORD32 dst_strd,
WORD32 ht,
WORD32 wd,
UWORD8* pu1_tmp,
WORD32 dydx)
{
WORD32 ht_temp;
UWORD8 *pu1_pred_vert,*pu1_pred_horiz;
UWORD8 *pu1_tmp1, *pu1_tmp2;
WORD32 x_offset, y_offset;
__m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
__m128i const_val16_8x16b;
__m256i coeff0_1_32x8b, coeff2_3_32x8b, coeff4_5_32x8b,coeff_32x8b;
__m256i const_val16_8x32b;
__m256i zero = _mm256_setzero_si256();
pu1_tmp1 = pu1_tmp;
dydx &= 0xf;
ht_temp = ht;
x_offset = dydx & 0x3;
y_offset = dydx >> 2;
pu1_tmp2 = pu1_tmp1;
pu1_pred_vert = pu1_src + (x_offset >> 1) - 2*src_strd;
pu1_pred_horiz = pu1_src + (y_offset >> 1) * src_strd - 2;
//the filter input starts from x[-2] (till x[3])
coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5
//c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
const_val16_8x16b = _mm_set1_epi16(16);
coeff_32x8b = _mm256_set_m128i(coeff2_3_16x8b,coeff0_1_16x8b);
coeff0_1_32x8b = _mm256_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
coeff2_3_32x8b = _mm256_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
coeff4_5_32x8b = _mm256_set1_epi32(0x01FB01FB); //c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5
//c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
const_val16_8x32b = _mm256_set1_epi16(16);
if(wd == 4)
{
//vertical q-pel filter
{
__m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b, src_r4_16x8b;
__m128i src_r5_16x8b, src_r6_16x8b;
__m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b;
__m128i res_r0r1_16x8b, res_t1_8x16b, res_t2_8x16b, res_t3_8x16b;
//epilogue: Load all the pred rows except sixth and seventh row for the
//first and second row processing.
src_r0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
pu1_pred_vert = pu1_pred_vert + src_strd;
src_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
pu1_pred_vert = pu1_pred_vert + src_strd;
src_r0_16x8b = _mm_unpacklo_epi32(src_r0_16x8b, src_r1_16x8b);
src_r2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
pu1_pred_vert = pu1_pred_vert + src_strd;
src_r1_16x8b = _mm_unpacklo_epi32(src_r1_16x8b, src_r2_16x8b);
src_r3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
pu1_pred_vert = pu1_pred_vert + src_strd;
src_r2_16x8b = _mm_unpacklo_epi32(src_r2_16x8b, src_r3_16x8b);
src_r4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
pu1_pred_vert = pu1_pred_vert + src_strd;
src_r3_16x8b = _mm_unpacklo_epi32(src_r3_16x8b, src_r4_16x8b);
//Core Loop: Process all the rows.
do
{
src_r5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
src_r4_16x8b = _mm_unpacklo_epi32(src_r4_16x8b, src_r5_16x8b);
src_r6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert + src_strd));
src_r5_16x8b = _mm_unpacklo_epi32(src_r5_16x8b, src_r6_16x8b);
src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);
res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);
res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
res_r0r1_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);
_mm_storel_epi64((__m128i *)pu1_tmp1, res_r0r1_16x8b);
src_r0_16x8b = src_r2_16x8b;
src_r1_16x8b = src_r3_16x8b;
src_r2_16x8b = src_r4_16x8b;
src_r3_16x8b = src_r5_16x8b;
src_r4_16x8b = src_r6_16x8b;
ht_temp -= 2;
pu1_pred_vert += src_strd << 1;
pu1_tmp1 += 8;
}
while(ht_temp > 0);
}
//horizontal q-pel filter
{
__m128i res_r0r1_16x8b_128, src_r0r1_vpel_16x8b,res_16x8b;
__m256i src_r0r1_sht_32x8b, src_r0r1_32x8b,src_r0r1_t1_32x8b;
__m256i res_r0r1_t1_8x32b, res_r0r1_t2_8x32b, res_r0r1_t3_8x32b;
__m256i res_r0r1_32x8b;
//Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
//Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
do
{
src_r0r1_vpel_16x8b = _mm_loadl_epi64((__m128i *)pu1_tmp2);
src_r0r1_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_pred_horiz + src_strd), (__m128i *)pu1_pred_horiz);
src_r0r1_sht_32x8b = _mm256_srli_si256(src_r0r1_32x8b, 1);
src_r0r1_32x8b = _mm256_unpacklo_epi8(src_r0r1_32x8b, src_r0r1_sht_32x8b);
src_r0r1_t1_32x8b = _mm256_unpacklo_epi64(src_r0r1_32x8b, zero); //a0 a1 a1 a2 a2 a3 a3 a4 b0 b1 b1 b2 b2 b3 b3 b4
res_r0r1_t1_8x32b = _mm256_maddubs_epi16(src_r0r1_t1_32x8b, coeff0_1_32x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
//b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
src_r0r1_32x8b = _mm256_srli_si256(src_r0r1_32x8b, 4); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0
//b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 0 0 0 0
src_r0r1_t1_32x8b = _mm256_unpacklo_epi64(src_r0r1_32x8b, zero); //a2 a3 a3 a4 a4 a5 a5 a6 b2 b3 b3 b4 b4 b5 b5 b6
res_r0r1_t2_8x32b = _mm256_maddubs_epi16(src_r0r1_t1_32x8b, coeff2_3_32x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
//b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3
src_r0r1_32x8b = _mm256_srli_si256(src_r0r1_32x8b, 4); //a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0 0 0 0 0
//b4 b5 b5 b6 b6 b7 b7 b8 0 0 0 0 0 0 0 0
src_r0r1_t1_32x8b = _mm256_unpacklo_epi64(src_r0r1_32x8b, zero); //a4 a5 a5 a6 a6 a7 a7 a8 b4 b5 b5 b6 b6 b7 b7 b8
res_r0r1_t3_8x32b = _mm256_maddubs_epi16(src_r0r1_t1_32x8b, coeff4_5_32x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
//b4*c4+b5*c5 b5*c4+b6*c5 b4*c6+b7*c5 b7*c4+b8*c5
res_r0r1_t1_8x32b = _mm256_add_epi16(res_r0r1_t1_8x32b, res_r0r1_t2_8x32b);
res_r0r1_t3_8x32b = _mm256_add_epi16(const_val16_8x32b, res_r0r1_t3_8x32b);
res_r0r1_t1_8x32b = _mm256_add_epi16(res_r0r1_t1_8x32b, res_r0r1_t3_8x32b); //a0*c0+a1*c1+a2*c2+a3*c3+a4*c4+a5*c5 + 16;
//a1*c0+a2*c1+a3*c2+a4*c3+a5*c4+a6*c5 + 16;
//a2*c0+a3*c1+a4*c2+a5*c3+a6*c4+a7*c5 + 16;
//a3*c0+a4*c1+a5*c2+a6*c3+a7*c4+a8*c5 + 16;
//b0*c0+b1*c1+b2*c2+b3*c3+b4*c4+b5*c5 + 16;
//b1*c0+b2*c1+b3*c2+b4*c3+b5*c4+b6*c5 + 16;
//b2*c0+b3*c1+b4*c2+b5*c3+b6*c4+b7*c5 + 16;
//b3*c0+b4*c1+b5*c2+b6*c3+b7*c4+b8*c5 + 16;
res_r0r1_t1_8x32b = _mm256_srai_epi16(res_r0r1_t1_8x32b, 5); //shifting right by 5 bits.
res_r0r1_16x8b_128 = _mm256_castsi256_si128(_mm256_packus_epi16(res_r0r1_t1_8x32b,
res_r0r1_t1_8x32b));
res_16x8b = _mm_avg_epu8(res_r0r1_16x8b_128,src_r0r1_vpel_16x8b);
*((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(res_16x8b);
res_16x8b = _mm_srli_si128(res_16x8b, 4);
*((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(res_16x8b);
ht -= 2;
pu1_pred_horiz += src_strd << 1;
pu1_tmp2 += 8;
pu1_dst += dst_strd << 1;
}
while(ht > 0);
}
}
else if(wd == 8)
{
//vertical q-pel filter
{
__m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b;
__m128i src_r4_16x8b, src_r5_16x8b, src_r6_16x8b;
__m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b;
__m128i res_16x8b, res_t1_8x16b, res_t2_8x16b, res_t3_8x16b;
//epilogue: Load all the pred rows except sixth and seventh row for the
//first and second row processing.
src_r0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
pu1_pred_vert = pu1_pred_vert + src_strd;
src_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
pu1_pred_vert = pu1_pred_vert + src_strd;
src_r0_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);
src_r2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
pu1_pred_vert = pu1_pred_vert + src_strd;
src_r1_16x8b = _mm_unpacklo_epi64(src_r1_16x8b, src_r2_16x8b);
src_r3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
pu1_pred_vert = pu1_pred_vert + src_strd;
src_r2_16x8b = _mm_unpacklo_epi64(src_r2_16x8b, src_r3_16x8b);
src_r4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
pu1_pred_vert = pu1_pred_vert + src_strd;
src_r3_16x8b = _mm_unpacklo_epi64(src_r3_16x8b, src_r4_16x8b);
//Core Loop: Process all the rows.
do
{
src_r5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
src_r4_16x8b = _mm_unpacklo_epi64(src_r4_16x8b, src_r5_16x8b);
src_r6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert + src_strd));
src_r5_16x8b = _mm_unpacklo_epi64(src_r5_16x8b, src_r6_16x8b);
src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);
res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);
res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);
_mm_storel_epi64((__m128i *)(pu1_tmp1), res_16x8b);
src_r0r1_16x8b = _mm_unpackhi_epi8(src_r0_16x8b, src_r1_16x8b);
src_r2r3_16x8b = _mm_unpackhi_epi8(src_r2_16x8b, src_r3_16x8b);
src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b);
res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);
res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);
_mm_storel_epi64((__m128i *)(pu1_tmp1 + 8), res_16x8b);
src_r0_16x8b = src_r2_16x8b;
src_r1_16x8b = src_r3_16x8b;
src_r2_16x8b = src_r4_16x8b;
src_r3_16x8b = src_r5_16x8b;
src_r4_16x8b = src_r6_16x8b;
ht_temp -= 2;
pu1_pred_vert += src_strd << 1;
pu1_tmp1 += 16;
}
while(ht_temp > 0);
}
//horizontal q-pel filter
{
__m256i src_r0r1_32x8b, src_r0r1_sht_32x8b,src_r0r1_t1_32x8b,res_32x8b;
__m128i src_r0r1_vpel_128,res_16x8b_128;
__m256i src_r0r1_vpel_32x8b,res_r0r1_t1_8x32b, res_r0r1_t2_8x32b, res_r0r1_t3_8x32b;
//Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
//Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
do
{
src_r0r1_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_pred_horiz + src_strd), //a2 a3 a4 a5 a6 a7 a8....a15 0 or
(__m128i *)pu1_pred_horiz); //a3 a4 a5 a6 a7 a8 a9....a15 0
//b2 b3 b4 b5 b6 b7 b8....b15 0 or
//b3 b4 b5 b6 b7 b8 b9....b15 0
src_r0r1_vpel_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_tmp2 + 8),
(__m128i *)(pu1_tmp2));
src_r0r1_vpel_32x8b = _mm256_unpacklo_epi64(src_r0r1_vpel_32x8b,zero);
src_r0r1_sht_32x8b = _mm256_srli_si256(src_r0r1_32x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
//b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0
src_r0r1_t1_32x8b = _mm256_unpacklo_epi8(src_r0r1_32x8b, src_r0r1_sht_32x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
res_r0r1_t1_8x32b = _mm256_maddubs_epi16(src_r0r1_t1_32x8b, coeff0_1_32x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
//a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
//b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1
src_r0r1_32x8b = _mm256_srli_si256(src_r0r1_32x8b, 2); //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
//b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0
src_r0r1_sht_32x8b = _mm256_srli_si256(src_r0r1_sht_32x8b, 2); //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0
src_r0r1_t1_32x8b = _mm256_unpacklo_epi8(src_r0r1_32x8b, src_r0r1_sht_32x8b); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
//b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10
res_r0r1_t2_8x32b = _mm256_maddubs_epi16(src_r0r1_t1_32x8b, coeff2_3_32x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
//a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
//b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3
//b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3
src_r0r1_32x8b = _mm256_srli_si256(src_r0r1_32x8b, 2); //a4 a5 a6 a7 a8 a9....a15 0 0 0 0
src_r0r1_sht_32x8b = _mm256_srli_si256(src_r0r1_sht_32x8b, 2); //a5 a6 a7 a8 a9....a15 0 0 0 0 0
//b5 b6 b7 b8 b9....b15 0 0 0 0 0
src_r0r1_t1_32x8b = _mm256_unpacklo_epi8(src_r0r1_32x8b, src_r0r1_sht_32x8b); //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
//b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12
res_r0r1_t3_8x32b = _mm256_maddubs_epi16(src_r0r1_t1_32x8b, coeff4_5_32x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
//a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
//b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
res_r0r1_t1_8x32b = _mm256_add_epi16(res_r0r1_t1_8x32b, res_r0r1_t2_8x32b);
res_r0r1_t3_8x32b = _mm256_add_epi16(const_val16_8x32b, res_r0r1_t3_8x32b);
res_r0r1_t1_8x32b = _mm256_add_epi16(res_r0r1_t1_8x32b, res_r0r1_t3_8x32b);
res_r0r1_t1_8x32b = _mm256_srai_epi16(res_r0r1_t1_8x32b, 5); //shifting right by 5 bits.
res_32x8b = _mm256_packus_epi16(res_r0r1_t1_8x32b, res_r0r1_t1_8x32b);
res_32x8b = _mm256_avg_epu8(res_32x8b,src_r0r1_vpel_32x8b);
_mm_storel_epi64((__m128i *)(pu1_dst), _mm256_castsi256_si128(res_32x8b));
_mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), _mm256_castsi256_si128(_mm256_permute2x128_si256(res_32x8b,res_32x8b,0x1)));
ht -= 2;
pu1_pred_horiz += src_strd << 1;
pu1_dst += dst_strd << 1;
pu1_tmp2 += 16;
}
while(ht > 0);
}
}
else // wd == 16
{
//vertical q-pel filter
{
__m256i src_r0r2_32x8b, src_r1r3_32x8b, src_r2r4_32x8b,src_32x8b;
__m128i src_r2_16x8b,src_r4_16x8b,src_r5_16x8b,src_r6_16x8b,src_r4r5_16x8b;
__m256i res_t1_8x32b;
__m128i res_t0_8x16b,res_t3_8x16b,res_t1_8x16b,res_16x8b;
//epilogue: Load all the pred rows except sixth and seventh row for the
//first and second row processing.
src_r0r2_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_pred_vert + 2 * src_strd ),
(__m128i *)(pu1_pred_vert));
src_r2_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert + 2 * src_strd));
src_r1r3_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_pred_vert + 3 * src_strd ),
(__m128i *)(pu1_pred_vert + src_strd));
src_r4_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert + 4 * src_strd));
pu1_pred_vert = pu1_pred_vert + (5 * src_strd ) ;
//Core Loop: Process all the rows.
do
{
src_r5_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert));
src_r6_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert + src_strd));
src_r2r4_32x8b = _mm256_set_m128i(src_r4_16x8b,src_r2_16x8b);
src_32x8b = _mm256_unpacklo_epi8(src_r0r2_32x8b, src_r1r3_32x8b);
src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);
res_t1_8x32b = _mm256_maddubs_epi16(src_32x8b, coeff_32x8b);
res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
res_t1_8x16b = _mm_add_epi16(_mm256_castsi256_si128(res_t1_8x32b), _mm256_extracti128_si256 (res_t1_8x32b,0x1));
res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
res_t0_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits
src_32x8b = _mm256_unpackhi_epi8(src_r0r2_32x8b, src_r1r3_32x8b);
src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b);
res_t1_8x32b = _mm256_maddubs_epi16(src_32x8b, coeff_32x8b);
res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
res_t1_8x16b = _mm_add_epi16(_mm256_castsi256_si128(res_t1_8x32b),_mm256_extracti128_si256 (res_t1_8x32b,0x1));
res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
res_16x8b = _mm_packus_epi16(res_t0_8x16b, res_t1_8x16b);
_mm_storeu_si128((__m128i *)(pu1_tmp1), res_16x8b);
src_32x8b = _mm256_unpacklo_epi8(src_r1r3_32x8b, src_r2r4_32x8b);
src_r4r5_16x8b = _mm_unpacklo_epi8(src_r5_16x8b, src_r6_16x8b);
res_t1_8x32b = _mm256_maddubs_epi16(src_32x8b, coeff_32x8b);
res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
res_t1_8x16b = _mm_add_epi16(_mm256_castsi256_si128(res_t1_8x32b), _mm256_extracti128_si256 (res_t1_8x32b,0x1));
res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
res_t0_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits
src_32x8b = _mm256_unpackhi_epi8(src_r1r3_32x8b, src_r2r4_32x8b);
src_r4r5_16x8b = _mm_unpackhi_epi8(src_r5_16x8b, src_r6_16x8b);
res_t1_8x32b = _mm256_maddubs_epi16(src_32x8b, coeff_32x8b);
res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
res_t1_8x16b = _mm_add_epi16(_mm256_castsi256_si128(res_t1_8x32b), _mm256_extracti128_si256 (res_t1_8x32b,0x1));
res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits
res_16x8b = _mm_packus_epi16(res_t0_8x16b, res_t1_8x16b);
_mm_storeu_si128((__m128i *)(pu1_tmp1+16), res_16x8b);
src_r0r2_32x8b = src_r2r4_32x8b;
src_r1r3_32x8b = _mm256_set_m128i(src_r5_16x8b,_mm256_extracti128_si256(src_r1r3_32x8b,0x1));
src_r2_16x8b = src_r4_16x8b;
src_r4_16x8b = src_r6_16x8b;
ht_temp -= 2;
pu1_pred_vert += src_strd << 1;
pu1_tmp1 += 32;
}
while(ht_temp > 0);
}
//horizontal q-pel filter
{
__m256i src_r0r1_32x8b,src_r0r1_sht_32x8b,src_r0r1_t1_32x8b,res_32x8b;
__m128i src_vpel_16x8b,res_16x8b_128;
__m256i res_r0r1_t1_8x32b, res_r0r1_t2_8x32b, res_r0r1_t3_8x32b;
//Row0 (first 8 pixels)  : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
//Row0 (second 8 pixels) : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
//b0 is the same pixel as a8; in general bn is the same pixel as a(n+8). The two
//8-pixel halves of the row are handled in the two 128-bit lanes.
do
{
src_r0r1_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_pred_horiz + 8),
(__m128i *)(pu1_pred_horiz)); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
//b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15
src_vpel_16x8b = _mm_loadu_si128((__m128i *)(pu1_tmp2));
src_r0r1_sht_32x8b = _mm256_srli_si256(src_r0r1_32x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
//b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0
src_r0r1_t1_32x8b = _mm256_unpacklo_epi8(src_r0r1_32x8b, src_r0r1_sht_32x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
//b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8
res_r0r1_t1_8x32b = _mm256_maddubs_epi16(src_r0r1_t1_32x8b, coeff0_1_32x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
//a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
//b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
//b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1
src_r0r1_32x8b = _mm256_srli_si256(src_r0r1_32x8b, 2); //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
//b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0
src_r0r1_sht_32x8b = _mm256_srli_si256(src_r0r1_sht_32x8b, 2); //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0
//b3 b4 b5 b6 b7 b8 b9....b15 0 0 0
src_r0r1_t1_32x8b = _mm256_unpacklo_epi8(src_r0r1_32x8b, src_r0r1_sht_32x8b); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
//b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10
res_r0r1_t2_8x32b = _mm256_maddubs_epi16(src_r0r1_t1_32x8b, coeff2_3_32x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
//a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
//b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3
//b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3
src_r0r1_32x8b = _mm256_srli_si256(src_r0r1_32x8b, 2); //a4 a5 a6 a7 a8 a9....a15 0 0 0 0
//b4 b5 b6 b7 b8 b9....b15 0 0 0 0
src_r0r1_sht_32x8b = _mm256_srli_si256(src_r0r1_sht_32x8b, 2); //a5 a6 a7 a8 a9....a15 0 0 0 0 0
//b5 b6 b7 b8 b9....b15 0 0 0 0 0
src_r0r1_t1_32x8b = _mm256_unpacklo_epi8(src_r0r1_32x8b, src_r0r1_sht_32x8b); //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
//b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12
res_r0r1_t3_8x32b = _mm256_maddubs_epi16(src_r0r1_t1_32x8b, coeff4_5_32x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
//a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
//b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
//b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5
res_r0r1_t1_8x32b = _mm256_add_epi16(res_r0r1_t1_8x32b, res_r0r1_t2_8x32b);
res_r0r1_t3_8x32b = _mm256_add_epi16(const_val16_8x32b, res_r0r1_t3_8x32b);
res_r0r1_t1_8x32b = _mm256_add_epi16(res_r0r1_t1_8x32b, res_r0r1_t3_8x32b);
res_r0r1_t1_8x32b = _mm256_srai_epi16(res_r0r1_t1_8x32b, 5); //shifting right by 5 bits.
res_32x8b = _mm256_packus_epi16(res_r0r1_t1_8x32b, res_r0r1_t1_8x32b);
res_16x8b_128 = _mm256_castsi256_si128(_mm256_permute4x64_epi64(res_32x8b, 0xD8));
res_16x8b_128 = _mm_avg_epu8(res_16x8b_128, src_vpel_16x8b);
_mm_storeu_si128((__m128i *)(pu1_dst), res_16x8b_128);
ht --;
pu1_pred_horiz += src_strd;
pu1_dst += dst_strd;
pu1_tmp2 += 16;
}
while(ht > 0);
}
}
}

View file

@@ -0,0 +1,303 @@
/******************************************************************************
*
* Copyright (C) 2015 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*****************************************************************************
* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/**
*******************************************************************************
* @file
* ih264_iquant_itrans_recon_avx2.c
*
* @brief
* Contains function definitions for inverse quantization, inverse
* transform and reconstruction
*
* @author
* Priyanka Bose
*
* @par List of Functions:
* - ih264_iquant_itrans_recon_4x4_avx2()
*
* @remarks
* None
*
*******************************************************************************
*/
#include <stdio.h>
/* User include files */
#include "ih264_typedefs.h"
#include "ih264_defs.h"
#include "ih264_trans_macros.h"
#include "ih264_macros.h"
#include "ih264_platform_macros.h"
#include "ih264_trans_data.h"
#include "ih264_size_defs.h"
#include "ih264_structs.h"
#include "ih264_trans_quant_itrans_iquant.h"
#include <immintrin.h>
/*
********************************************************************************
*
* @brief This function reconstructs a 4x4 sub block from quantized residue and
* prediction buffer
*
* @par Description:
* The quantized residue is first inverse quantized, then inverse transformed.
* This inverse transformed content is added to the prediction buffer to recon-
* struct the end output
*
* @param[in] pi2_src
* quantized 4x4 block
*
* @param[in] pu1_pred
* prediction 4x4 block
*
* @param[out] pu1_out
* reconstructed 4x4 block
*
* @param[in] src_strd
* quantization buffer stride
*
* @param[in] pred_strd,
* Prediction buffer stride
*
* @param[in] out_strd
* recon buffer Stride
*
* @param[in] pu2_scaling_list
* pointer to scaling list
*
* @param[in] pu2_norm_adjust
* pointer to inverse scale matrix
*
* @param[in] u4_qp_div_6
* Floor (qp/6)
*
* @param[in] pi4_tmp
* temporary buffer of size 1*16
*
* @returns none
*
* @remarks none
*
*******************************************************************************
*/
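/* For reference (a rough scalar sketch, not normative text), the per-coefficient
 * math vectorised below is:
 *
 *     w = pi2_src[i] * pu2_iscal_mat[i] * pu2_weigh_mat[i];
 *     w = (u4_qp_div_6 >= 4) ? (w << (u4_qp_div_6 - 4))
 *                            : ((w + (1 << (3 - u4_qp_div_6))) >> (4 - u4_qp_div_6));
 *
 * followed by the 4x4 H.264 inverse transform butterflies; each inverse-transformed
 * value x is then reconstructed as ((x + 32) >> 6) + pu1_pred[i], saturated to
 * the range [0, 255].
 */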
void ih264_iquant_itrans_recon_4x4_avx2(WORD16 *pi2_src,
UWORD8 *pu1_pred,
UWORD8 *pu1_out,
WORD32 pred_strd,
WORD32 out_strd,
const UWORD16 *pu2_iscal_mat,
const UWORD16 *pu2_weigh_mat,
UWORD32 u4_qp_div_6,
WORD16 *pi2_tmp,
WORD32 iq_start_idx,
WORD16 *pi2_dc_ld_addr)
{
UWORD32 *pu4_out = (UWORD32 *) pu1_out;
__m256i src_r0_r1, src_r2_r3;
__m256i src_r0, src_r1, src_r2, src_r3;
__m256i scalemat_r0_r1, scalemat_r2_r3;
__m128i pred_r0, pred_r1, pred_r2, pred_r3;
__m256i sign_reg, dequant_r0_r1, dequant_r2_r3;
__m256i zero_8x32b = _mm256_setzero_si256(); // all bits reset to zero
__m256i temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
__m256i resq_r0, resq_r1, resq_r2, resq_r3;
__m256i add_rshift = _mm256_set1_epi32((u4_qp_div_6 < 4) ? (1 << (3 - u4_qp_div_6)) : 0);
__m256i value_32 = _mm256_set1_epi32(32);
__m128i value_32_128 = _mm_set1_epi32(32);
__m128i r0,r1,r2,r3,t0,t1,t2,t3,t4,t5,t6,t7,sign_reg_128, de_r0_r1,de_r2_r3,r0_r1,r2_r3,scale_r0_r1,scale_r2_r3;
__m128i zero_8x16b_128 = _mm_setzero_si128();
UNUSED (pi2_tmp);
/*************************************************************/
/* Dequantization of coefficients                           */
/*************************************************************/
src_r0_r1 = _mm256_loadu_si256((__m256i *) (pi2_src)); //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row
scalemat_r0_r1 = _mm256_loadu_si256((__m256i *) (pu2_iscal_mat)); //b00 b01 b02 b03 b10 b11 b12 b13 -- the scaling matrix 0th,1st row
dequant_r0_r1 = _mm256_loadu_si256((__m256i *) (pu2_weigh_mat)); //q00 q01 q02 q03 q10 q11 q12 q13 -- all 16 bits
temp0 = _mm256_mullo_epi16(scalemat_r0_r1, dequant_r0_r1);
temp4 = _mm256_unpacklo_epi16(temp0, zero_8x32b); // b00*q00 0 b01*q01 0 b02*q02 0 b03*q03 0 -- 16 bit long
temp5 = _mm256_unpackhi_epi16(temp0, zero_8x32b); // b10*q10 0 b11*q11 0 b12*q12 0 b13*q13 0 -- 16 bit long
src_r0 = _mm256_unpacklo_epi16(src_r0_r1, zero_8x32b); // a00 0 a01 0 a02 0 a03 0 -- 16 bit long
src_r1 = _mm256_unpackhi_epi16(src_r0_r1, zero_8x32b); // a10 0 a11 0 a12 0 a13 0 -- 16 bit long
temp0 = _mm256_madd_epi16(src_r0, temp4); //a00*b00*q00 a01*b01*q01 a02*b02*q02 a03*b03*q03 -- 32 bits long
temp1 = _mm256_madd_epi16(src_r1, temp5);
if (u4_qp_div_6 >= 4) {
resq_r0 = _mm256_slli_epi32(temp0, u4_qp_div_6 - 4);
resq_r1 = _mm256_slli_epi32(temp1, u4_qp_div_6 - 4);
} else {
temp4 = _mm256_add_epi32(temp0, add_rshift);
temp5 = _mm256_add_epi32(temp1, add_rshift);
resq_r0 = _mm256_srai_epi32(temp4, 4 - u4_qp_div_6);
resq_r1 = _mm256_srai_epi32(temp5, 4 - u4_qp_div_6);
}
if (iq_start_idx == 1)
resq_r0 = _mm256_insert_epi32(resq_r0,(WORD32)pi2_dc_ld_addr[0],0);
/* Perform Inverse transform */
/*-------------------------------------------------------------*/
/* IDCT [ Horizontal transformation ] */
/*-------------------------------------------------------------*/
// Matrix transpose
/*
* a0 a1 a2 a3
* b0 b1 b2 b3
* c0 c1 c2 c3
* d0 d1 d2 d3
*/
temp0 = _mm256_unpacklo_epi32(resq_r0, resq_r1); //a0 b0 a1 b1 | c0 d0 c1 d1
temp1 = _mm256_unpackhi_epi32(resq_r0, resq_r1); //a2 b2 a3 b3 | c2 d2 c3 d3
resq_r0 = _mm256_permute2f128_si256(temp0, temp1, 0x20);
resq_r1 = _mm256_permute2f128_si256(temp0, temp1, 0x31);
temp0 = _mm256_unpacklo_epi64(resq_r0, resq_r1); // w0 w2
temp1 = _mm256_unpackhi_epi64(resq_r0, resq_r1); // w1 w3
resq_r0 = _mm256_permute2f128_si256(temp0, temp1, 0x20);
resq_r1 = _mm256_permute2f128_si256(temp0, temp1, 0x31);
r0 = _mm256_extracti128_si256(resq_r0, 0x0);
r1 = _mm256_extracti128_si256(resq_r0, 0x1);
r2 = _mm256_extracti128_si256(resq_r1, 0x0);
r3 = _mm256_extracti128_si256(resq_r1, 0x1);
//Transform starts -- horizontal transform
/*------------------------------------------------------------------*/
/* z0 = w0 + w2 */
t0 = _mm_add_epi32(r0, r2);
/* z1 = w0 - w2 */
t1 = _mm_sub_epi32(r0, r2);
/* z2 = (w1 >> 1) - w3 */
t2 = _mm_srai_epi32(r1, 1); //(w1>>1)
t2 = _mm_sub_epi32(t2, r3); //(w1>>1) - w3
/* z3 = w1 + (w3 >> 1) */
t3 = _mm_srai_epi32(r3, 1); //(w3>>1)
t3 = _mm_add_epi32(t3, r1); //(w3>>1) + w1
/*----------------------------------------------------------*/
/* x0 = z0 + z3 */
r0 = _mm_add_epi32(t0, t3);
/* x1 = z1 + z2 */
r1 = _mm_add_epi32(t1, t2);
/* x2 = z1 - z2 */
r2 = _mm_sub_epi32(t1, t2);
/* x3 = z0 - z3 */
r3 = _mm_sub_epi32(t0, t3);
t1 = _mm_unpacklo_epi32(r0, r1); //a0 a1 b0 b1
t3 = _mm_unpacklo_epi32(r2, r3); //a2 a3 b2 b3
t2 = _mm_unpackhi_epi32(r0, r1); //c0 c1 d0 d1
t4 = _mm_unpackhi_epi32(r2, r3); //c2 c3 d2 d3
r0 = _mm_unpacklo_epi64(t1, t3); //a0 a1 a2 a3
r1 = _mm_unpackhi_epi64(t1, t3); //b0 b1 b2 b3
r2 = _mm_unpacklo_epi64(t2, t4); //c0 c1 c2 c3
r3 = _mm_unpackhi_epi64(t2, t4); //d0 d1 d2 d3
//Transform ends -- horizontal transform
//Load pred buffer
pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0])); //p00 p01 p02 p03 0 0 0 0 0 0 0 0 -- all 8 bits
pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[pred_strd])); //p10 p11 p12 p13 0 0 0 0 0 0 0 0 -- all 8 bits
pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * pred_strd])); //p20 p21 p22 p23 0 0 0 0 0 0 0 0 -- all 8 bits
pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * pred_strd])); //p30 p31 p32 p33 0 0 0 0 0 0 0 0 -- all 8 bits
pred_r0 = _mm_cvtepu8_epi32(pred_r0); //p00 p01 p02 p03 -- all 32 bits
pred_r1 = _mm_cvtepu8_epi32(pred_r1); //p10 p11 p12 p13 -- all 32 bits
pred_r2 = _mm_cvtepu8_epi32(pred_r2); //p20 p21 p22 p23 -- all 32 bits
pred_r3 = _mm_cvtepu8_epi32(pred_r3); //p30 p31 p32 p33 -- all 32 bits
/*--------------------------------------------------------------*/
/* IDCT [ Vertical transformation] and Xij = (xij + 32)>>6 */
/* */
/* Add the prediction and store it back to same buffer */
/*--------------------------------------------------------------*/
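/* z0j = y0j + y2j */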
t0 = _mm_add_epi32(r0, r2);
/* z1j = y0j - y2j */
t1 = _mm_sub_epi32(r0, r2);
/* z2j = (y1j>>1) - y3j */
t2 = _mm_srai_epi32(r1, 1); //(y1j>>1)
t2 = _mm_sub_epi32(t2, r3);
/* z3j = y1j + (y3j>>1) */
t3 = _mm_srai_epi32(r3, 1); //(y3j>>1)
t3 = _mm_add_epi32(r1, t3);
t4 = _mm_add_epi32(t0, t3);
t4 = _mm_add_epi32(t4, value_32_128);
t4 = _mm_srai_epi32(t4, 6);
t4 = _mm_add_epi32(t4, pred_r0);
t5 = _mm_add_epi32(t1, t2);
t5 = _mm_add_epi32(t5, value_32_128);
t5 = _mm_srai_epi32(t5, 6);
t5 = _mm_add_epi32(t5, pred_r1);
t6 = _mm_sub_epi32(t1, t2);
t6 = _mm_add_epi32(t6, value_32_128);
t6 = _mm_srai_epi32(t6, 6);
t6 = _mm_add_epi32(t6, pred_r2);
t7 = _mm_sub_epi32(t0, t3);
t7 = _mm_add_epi32(t7, value_32_128);
t7 = _mm_srai_epi32(t7, 6);
t7 = _mm_add_epi32(t7, pred_r3);
// 32-bit to 16-bit conversion
t0 = _mm_packs_epi32(t4, t5);
t1 = _mm_packs_epi32(t6, t7);
//Clipping the results to 8 bits
sign_reg_128 = _mm_cmpgt_epi16(t0, zero_8x16b_128); // sign check
t0 = _mm_and_si128(t0, sign_reg_128);
sign_reg_128 = _mm_cmpgt_epi16(t1, zero_8x16b_128);
t1 = _mm_and_si128(t1, sign_reg_128);
r0 = _mm_packus_epi16(t0, t1);
r1 = _mm_srli_si128(r0, 4);
r2 = _mm_srli_si128(r1, 4);
r3 = _mm_srli_si128(r2, 4);
*pu4_out = _mm_cvtsi128_si32(r0);
pu1_out += out_strd;
pu4_out = (UWORD32 *) (pu1_out);
*(pu4_out) = _mm_cvtsi128_si32(r1);
pu1_out += out_strd;
pu4_out = (UWORD32 *) (pu1_out);
*(pu4_out) = _mm_cvtsi128_si32(r2);
pu1_out += out_strd;
pu4_out = (UWORD32 *) (pu1_out);
*(pu4_out) = _mm_cvtsi128_si32(r3);
}

View file

@@ -0,0 +1,501 @@
/******************************************************************************
*
* Copyright (C) 2015 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*****************************************************************************
* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/*****************************************************************************/
/*****************************************************************************/
/* File Includes */
/*****************************************************************************/
#include <immintrin.h>
#include "ih264_typedefs.h"
#include "ih264_macros.h"
#include "ih264_platform_macros.h"
#include "ih264_weighted_pred.h"
#include <stdint.h>
#include <string.h>
#include <stdio.h>
/*****************************************************************************/
/* */
/* Function Name : ih264_weighted_bi_pred_luma_avx2 */
/* */
/* Description : This function performs the weighted biprediction as */
/* described in sec 8.4.2.3.2 titled "Weighted sample */
/* prediction process" for luma. The function gets two */
/* ht x wd blocks, weights them, adds them, rounds off the */
/* sum, offsets it, saturates it to unsigned 8-bit and */
/* stores it in the destination block. (ht,wd) can be */
/* (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16). */
/* */
/* Inputs : pu1_src1 - Pointer to source 1 */
/* pu1_src2 - Pointer to source 2 */
/* pu1_dst - Pointer to destination */
/* src_strd1 - stride for source 1 */
/* src_strd2 - stride for source 2 */
/* dst_strd - stride for destination */
/* log_wd - number of bits to be rounded off */
/* wt1 - weight value for source 1 */
/* wt2 - weight value for source 2 */
/* ofst1 - offset value for source 1 */
/* ofst2 - offset value for source 2 */
/* ht - height of the block */
/* wd - width of the block */
/* */
/* Issues : None */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes */
/* 04 02 2015 Kaushik Initial Version */
/* Senthoor */
/* 15 09 2020 Priyanka Bose AVX2 Intel Intrinsics Support */
/*****************************************************************************/
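/* Per-pixel sketch of the computation below (see sec 8.4.2.3.2), for reference:
 *     round = 1 << log_wd;
 *     ofst  = (ofst1 + ofst2 + 1) >> 1;
 *     dst   = ((src1 * wt1 + src2 * wt2 + round) >> (log_wd + 1)) + ofst,
 *             saturated to [0, 255].
 */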
void ih264_weighted_bi_pred_luma_avx2(UWORD8 *pu1_src1,
UWORD8 *pu1_src2,
UWORD8 *pu1_dst,
WORD32 src_strd1,
WORD32 src_strd2,
WORD32 dst_strd,
WORD32 log_wd,
WORD32 wt1,
WORD32 wt2,
WORD32 ofst1,
WORD32 ofst2,
WORD32 ht,
WORD32 wd)
{
__m256i wt1_8x32b, wt2_8x32b;
__m256i ofst_8x32b, round_8x32b;
__m256i zero;
WORD32 ofst;
WORD32 round_val, shft;
zero = _mm256_set1_epi8(0);
wt1 = (WORD16)(wt1 & 0xffff);
wt2 = (WORD16)(wt2 & 0xffff);
round_val = 1 << log_wd;
shft = log_wd + 1;
ofst1 = (WORD8)(ofst1 & 0xff);
ofst2 = (WORD8)(ofst2 & 0xff);
ofst = (ofst1 + ofst2 + 1) >> 1;
wt1_8x32b = _mm256_set1_epi16(wt1);
wt2_8x32b = _mm256_set1_epi16(wt2);
round_8x32b = _mm256_set1_epi16(round_val);
ofst_8x32b = _mm256_set1_epi16(ofst);
if(wd == 4)
{
__m128i y1_2_16x8b, y1_3_16x8b;
__m128i y2_2_16x8b, y2_3_16x8b;
__m256i y1_02_32x8b,y1_13_32x8b,y2_02_32x8b,y2_13_32x8b,y1_0_32x8b,y2_0_32x8b,y1_0_8x32b,y2_1_8x32b,y2_0_8x32b;
__m128i y1_0_16x8b_128,y2_0_16x8b_128,y1_1_16x8b_128,y1_2_16x8b_128,y1_3_16x8b_128;
do
{
y1_02_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src1 + (src_strd1 << 1)), (__m128i *)(pu1_src1));
y1_13_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src1 + src_strd1 * 3), (__m128i *)(pu1_src1 + src_strd1));
y2_02_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src2 + (src_strd2 << 1)), (__m128i *)(pu1_src2));
y2_13_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src2 + src_strd2 * 3), (__m128i *)(pu1_src2 + src_strd2));
y1_02_32x8b = _mm256_unpacklo_epi64(y1_02_32x8b, zero);
y1_13_32x8b = _mm256_unpacklo_epi64(y1_13_32x8b, zero);
y2_02_32x8b = _mm256_unpacklo_epi64(y2_02_32x8b, zero);
y2_13_32x8b = _mm256_unpacklo_epi64(y2_13_32x8b, zero);
y1_0_32x8b = _mm256_unpacklo_epi32(y1_02_32x8b, y1_13_32x8b);
y2_0_32x8b = _mm256_unpacklo_epi32(y2_02_32x8b, y2_13_32x8b);
y1_0_16x8b_128 = _mm256_castsi256_si128(_mm256_permute4x64_epi64(y1_0_32x8b, 0xD8));
y2_0_16x8b_128 = _mm256_castsi256_si128(_mm256_permute4x64_epi64(y2_0_32x8b, 0xD8));
y1_0_8x32b = _mm256_cvtepu8_epi16(y1_0_16x8b_128); // 8 to 16
y2_0_8x32b = _mm256_cvtepu8_epi16(y2_0_16x8b_128);
y1_0_8x32b = _mm256_mullo_epi16(y1_0_8x32b, wt1_8x32b);
y2_0_8x32b = _mm256_mullo_epi16(y2_0_8x32b, wt2_8x32b);
y1_0_8x32b = _mm256_adds_epi16(y1_0_8x32b, y2_0_8x32b);
y1_0_8x32b = _mm256_adds_epi16(round_8x32b, y1_0_8x32b);
y1_0_8x32b = _mm256_srai_epi16(y1_0_8x32b, shft);
y1_0_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_0_8x32b);
y1_0_16x8b_128 = _mm256_castsi256_si128(_mm256_packus_epi16(y1_0_8x32b, y1_0_8x32b));
y1_2_16x8b_128 = _mm_srli_si128(y1_0_16x8b_128, 4);
y1_1_16x8b_128 = _mm_srli_si128(y1_0_16x8b_128, 8);
y1_3_16x8b_128 = _mm_srli_si128(y1_0_16x8b_128, 12);
*((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y1_0_16x8b_128);
*((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y1_1_16x8b_128);
*((WORD32 *)(pu1_dst + (dst_strd << 1))) = _mm_cvtsi128_si32(y1_2_16x8b_128);
*((WORD32 *)(pu1_dst + dst_strd * 3)) = _mm_cvtsi128_si32(y1_3_16x8b_128);
ht -= 4;
pu1_src1 += src_strd1 << 2;
pu1_src2 += src_strd2 << 2;
pu1_dst += dst_strd << 2;
}
while(ht > 0);
}
else if(wd == 8)
{
__m128i y1_0_16x8b_128,y2_0_16x8b_128,y1_2_16x8b_128,y1_1_16x8b_128,y1_3_16x8b_128;
__m256i y1_02_32x8b,y1_13_32x8b,y2_02_32x8b,y2_13_32x8b,y1_0_32x8b,y2_0_32x8b,y1_0_8x32b;
__m256i y1_1_8x32b,y2_0_8x32b,y2_1_8x32b;
do
{
y1_02_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src1 + (src_strd1 << 1)), (__m128i *)(pu1_src1));
y1_13_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src1 + src_strd1 * 3), (__m128i *)(pu1_src1 + src_strd1));
y2_02_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src2 + (src_strd2 << 1)), (__m128i *)(pu1_src2));
y2_13_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src2 + src_strd2 * 3), (__m128i *)(pu1_src2 + src_strd2));
y1_02_32x8b = _mm256_unpacklo_epi64(y1_02_32x8b, zero);
y1_13_32x8b = _mm256_unpacklo_epi64(y1_13_32x8b, zero);
y2_02_32x8b = _mm256_unpacklo_epi64(y2_02_32x8b, zero);
y2_13_32x8b = _mm256_unpacklo_epi64(y2_13_32x8b, zero);
y1_0_32x8b = _mm256_unpacklo_epi64(y1_02_32x8b, y1_13_32x8b);
y2_0_32x8b = _mm256_unpacklo_epi64(y2_02_32x8b, y2_13_32x8b);
y1_0_8x32b = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(y1_0_32x8b));
y1_0_16x8b_128 = _mm256_castsi256_si128(_mm256_permute2x128_si256(y1_0_32x8b,y1_0_32x8b,0x1));
y1_1_8x32b = _mm256_cvtepu8_epi16(y1_0_16x8b_128);
y2_0_8x32b = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(y2_0_32x8b));
y2_0_16x8b_128 = _mm256_castsi256_si128(_mm256_permute2x128_si256(y2_0_32x8b,y2_0_32x8b,0x1));
y2_1_8x32b = _mm256_cvtepu8_epi16(y2_0_16x8b_128);
y1_0_8x32b = _mm256_mullo_epi16(y1_0_8x32b, wt1_8x32b);
y2_0_8x32b = _mm256_mullo_epi16(y2_0_8x32b, wt2_8x32b);
y1_1_8x32b = _mm256_mullo_epi16(y1_1_8x32b, wt1_8x32b);
y2_1_8x32b = _mm256_mullo_epi16(y2_1_8x32b, wt2_8x32b);
y1_0_8x32b = _mm256_adds_epi16(y1_0_8x32b, y2_0_8x32b);
y1_1_8x32b = _mm256_adds_epi16(y1_1_8x32b, y2_1_8x32b);
y1_0_8x32b = _mm256_adds_epi16(round_8x32b, y1_0_8x32b);
y1_1_8x32b = _mm256_adds_epi16(round_8x32b, y1_1_8x32b);
y1_0_8x32b = _mm256_srai_epi16(y1_0_8x32b, shft);
y1_1_8x32b = _mm256_srai_epi16(y1_1_8x32b, shft);
y1_0_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_0_8x32b);
y1_1_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_1_8x32b);
y1_0_32x8b = _mm256_packus_epi16(y1_0_8x32b, y1_1_8x32b);
y1_0_16x8b_128 = _mm256_castsi256_si128(y1_0_32x8b);
y1_2_16x8b_128 = _mm256_castsi256_si128(_mm256_srli_si256(y1_0_32x8b, 8));
y1_0_32x8b = _mm256_permute2x128_si256(y1_0_32x8b,y1_0_32x8b,1);
y1_1_16x8b_128 = _mm256_castsi256_si128(y1_0_32x8b);
y1_3_16x8b_128 = _mm256_castsi256_si128(_mm256_srli_si256(y1_0_32x8b, 8));
_mm_storel_epi64((__m128i *)pu1_dst, y1_0_16x8b_128);
_mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b_128);
_mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd << 1)), y1_2_16x8b_128);
_mm_storel_epi64((__m128i *)(pu1_dst + dst_strd * 3), y1_3_16x8b_128);
ht -= 4;
pu1_src1 += src_strd1 << 2;
pu1_src2 += src_strd2 << 2;
pu1_dst += dst_strd << 2;
}
while(ht > 0);
}
else // wd == 16
{
__m256i y1_0L_8x32b, y1_0H_8x32b, y1_1L_8x32b, y1_1H_8x32b;
__m256i y2_0L_8x32b, y2_0H_8x32b, y2_1L_8x32b, y2_1H_8x32b;
__m256i zero_32x8b,y1_0_32x8b,y2_0_32x8b;
zero_32x8b = _mm256_set1_epi8(0);
do
{
y1_0_32x8b = _mm256_loadu_si256((__m256i *)pu1_src1);
y2_0_32x8b = _mm256_loadu_si256((__m256i *)pu1_src2);
y1_0L_8x32b = _mm256_unpacklo_epi8(y1_0_32x8b, zero_32x8b);
y1_0H_8x32b = _mm256_unpackhi_epi8(y1_0_32x8b, zero_32x8b);
y2_0L_8x32b = _mm256_unpacklo_epi8(y2_0_32x8b,zero_32x8b);
y2_0H_8x32b = _mm256_unpackhi_epi8(y2_0_32x8b, zero_32x8b);
y1_0L_8x32b = _mm256_mullo_epi16(y1_0L_8x32b, wt1_8x32b);
y1_0H_8x32b = _mm256_mullo_epi16(y1_0H_8x32b, wt1_8x32b);
y2_0L_8x32b = _mm256_mullo_epi16(y2_0L_8x32b, wt2_8x32b);
y2_0H_8x32b = _mm256_mullo_epi16(y2_0H_8x32b, wt2_8x32b);
y1_0L_8x32b = _mm256_adds_epi16(y1_0L_8x32b, y2_0L_8x32b);
y1_0H_8x32b = _mm256_adds_epi16(y1_0H_8x32b, y2_0H_8x32b);
y1_0L_8x32b = _mm256_adds_epi16(round_8x32b, y1_0L_8x32b);
y1_0H_8x32b = _mm256_adds_epi16(round_8x32b, y1_0H_8x32b);
y1_0L_8x32b = _mm256_srai_epi16(y1_0L_8x32b, shft);
y1_0H_8x32b = _mm256_srai_epi16(y1_0H_8x32b, shft);
y1_0L_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_0L_8x32b);
y1_0H_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_0H_8x32b);
y1_0_32x8b = _mm256_packus_epi16(y1_0L_8x32b, y1_0H_8x32b);
_mm256_storeu_si256((__m256i *)pu1_dst, y1_0_32x8b);
ht -= 2;
pu1_src1 += src_strd1 << 1;
pu1_src2 += src_strd2 << 1;
pu1_dst += dst_strd << 1;
}
while(ht > 0);
}
}
/*****************************************************************************/
/* */
/* Function Name : ih264_weighted_bi_pred_chroma_avx2 */
/* */
/* Description : This function performs the weighted biprediction as */
/* described in sec 8.4.2.3.2 titled "Weighted sample */
/* prediction process" for chroma. The function gets two */
/* ht x wd blocks, weights them, adds them, rounds off the */
/* sum, offsets it, saturates it to unsigned 8-bit and */
/* stores it in the destination block. (ht,wd) can be */
/* (2,2), (4,2), (2,4), (4,4), (8,4), (4,8) or (8,8). */
/* */
/* Inputs : pu1_src1 - Pointer to source 1 */
/* pu1_src2 - Pointer to source 2 */
/* pu1_dst - Pointer to destination */
/* src_strd1 - stride for source 1 */
/* src_strd2 - stride for source 2 */
/* dst_strd - stride for destination */
/* log_wd - number of bits to be rounded off */
/* wt1 - weight values for u and v in source 1 */
/* wt2 - weight values for u and v in source 2 */
/* ofst1 - offset value for u and v in source 1 */
/* ofst2 - offset value for u and v in source 2 */
/* ht - height of the block */
/* wd - width of the block */
/* */
/* Issues : None */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes */
/* 04 02 2015 Kaushik Initial Version */
/* Senthoor */
/* 15 09 2020 Priyanka Bose AVX2 Intel Intrinsics Support */
/*****************************************************************************/
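/* For reference: wt1/wt2 pack the U and V weights as the low/high 16-bit halves
 * of a 32-bit word, and ofst1/ofst2 pack the U and V offsets in their low/high
 * bytes; each chroma component then follows the same formula as the luma case:
 *     dst = ((src1 * w1 + src2 * w2 + (1 << log_wd)) >> (log_wd + 1))
 *           + ((o1 + o2 + 1) >> 1), saturated to [0, 255],
 * with w1/w2/o1/o2 being the per-component weights and offsets.
 */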
void ih264_weighted_bi_pred_chroma_avx2(UWORD8 *pu1_src1,
UWORD8 *pu1_src2,
UWORD8 *pu1_dst,
WORD32 src_strd1,
WORD32 src_strd2,
WORD32 dst_strd,
WORD32 log_wd,
WORD32 wt1,
WORD32 wt2,
WORD32 ofst1,
WORD32 ofst2,
WORD32 ht,
WORD32 wd)
{
__m128i y1_0_16x8b, y1_1_16x8b;
__m128i y2_0_16x8b, y2_1_16x8b;
__m128i wt1_8x16b, wt2_8x16b;
__m128i ofst_8x16b, round_8x16b;
WORD32 ofst1_u, ofst2_u, ofst_u;
WORD32 ofst1_v, ofst2_v, ofst_v;
WORD32 round_val, shft, ofst_val,ofst_val_256;
round_val = 1 << log_wd;
shft = log_wd + 1;
ofst1_u = (WORD8)(ofst1 & 0xff);
ofst1_v = (WORD8)(ofst1 >> 8);
ofst2_u = (WORD8)(ofst2 & 0xff);
ofst2_v = (WORD8)(ofst2 >> 8);
wt1_8x16b = _mm_set1_epi32(wt1);
wt2_8x16b = _mm_set1_epi32(wt2);
ofst_u = (ofst1_u + ofst2_u + 1) >> 1;
ofst_v = (ofst1_v + ofst2_v + 1) >> 1;
ofst_val = (ofst_u & 0xffff) | (ofst_v << 16);
ofst_val_256 = (ofst_u & 0xffff) | (ofst_v << 16);
round_8x16b = _mm_set1_epi16(round_val);
ofst_8x16b = _mm_set1_epi32(ofst_val);
if(wd == 2)
{
__m128i y1_0_8x16b, y2_0_8x16b;
do
{
y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1); //Loading 64 bits from diff location
y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
y1_0_16x8b = _mm_unpacklo_epi32(y1_0_16x8b, y1_1_16x8b);
y2_0_16x8b = _mm_unpacklo_epi32(y2_0_16x8b, y2_1_16x8b);
y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);
y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);
y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);
y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_0_8x16b);
y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 4);
*((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y1_0_16x8b);
*((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y1_1_16x8b);
ht -= 2;
pu1_src1 += src_strd1 << 1;
pu1_src2 += src_strd2 << 1;
pu1_dst += dst_strd << 1;
}
while(ht > 0);
}
else if(wd == 4)
{
__m128i y1_0_8x16b, y1_1_8x16b;
__m128i y2_0_8x16b, y2_1_8x16b;
do
{
y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1); //Loading 64 bits from diff location
y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
y1_1_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);
y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
y2_1_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);
y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);
y1_1_8x16b = _mm_mullo_epi16(y1_1_8x16b, wt1_8x16b);
y2_1_8x16b = _mm_mullo_epi16(y2_1_8x16b, wt2_8x16b);
y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
y1_1_8x16b = _mm_adds_epi16(y1_1_8x16b, y2_1_8x16b);
y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);
y1_1_8x16b = _mm_adds_epi16(round_8x16b, y1_1_8x16b);
y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
y1_1_8x16b = _mm_srai_epi16(y1_1_8x16b, shft);
y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);
y1_1_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1_8x16b);
y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_1_8x16b);
y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 8);
_mm_storel_epi64((__m128i *)pu1_dst, y1_0_16x8b);
_mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);
ht -= 2;
pu1_src1 += src_strd1 << 1;
pu1_src2 += src_strd2 << 1;
pu1_dst += dst_strd << 1;
}
while(ht > 0);
}
else // wd == 8
{
__m256i y1_0L_8x32b, y1_0H_8x32b, y1_1L_8x32b, y1_1H_8x32b;
__m256i y2_0L_8x32b, y2_0H_8x32b, y2_1L_8x32b, y2_1H_8x32b;
__m256i y1_0_32x8b,y2_0_32x8b,ofst_8x32b,round_8x32b;
__m256i wt1_8x32b, wt2_8x32b;
__m256i zero_32x8b;
wt1_8x32b = _mm256_set1_epi32(wt1);
wt2_8x32b = _mm256_set1_epi32(wt2);
round_8x32b = _mm256_set1_epi16(round_val);
ofst_8x32b = _mm256_set1_epi32(ofst_val_256);
zero_32x8b = _mm256_set1_epi8(0);
do
{
y1_0_32x8b = _mm256_loadu_si256((__m256i *)pu1_src1);
y2_0_32x8b = _mm256_loadu_si256((__m256i *)pu1_src2);
y1_0L_8x32b = _mm256_unpacklo_epi8(y1_0_32x8b, zero_32x8b);
y1_0H_8x32b = _mm256_unpackhi_epi8(y1_0_32x8b, zero_32x8b);
y2_0L_8x32b = _mm256_unpacklo_epi8(y2_0_32x8b, zero_32x8b);
y2_0H_8x32b = _mm256_unpackhi_epi8(y2_0_32x8b, zero_32x8b);
y1_0L_8x32b = _mm256_mullo_epi16(y1_0L_8x32b, wt1_8x32b);
y1_0H_8x32b = _mm256_mullo_epi16(y1_0H_8x32b, wt1_8x32b);
y2_0L_8x32b = _mm256_mullo_epi16(y2_0L_8x32b, wt2_8x32b);
y2_0H_8x32b = _mm256_mullo_epi16(y2_0H_8x32b, wt2_8x32b);
y1_0L_8x32b = _mm256_adds_epi16(y1_0L_8x32b, y2_0L_8x32b);
y1_0H_8x32b = _mm256_adds_epi16(y1_0H_8x32b, y2_0H_8x32b);
y1_0L_8x32b = _mm256_adds_epi16(round_8x32b, y1_0L_8x32b);
y1_0H_8x32b = _mm256_adds_epi16(round_8x32b, y1_0H_8x32b);
y1_0L_8x32b = _mm256_srai_epi16(y1_0L_8x32b, shft);
y1_0H_8x32b = _mm256_srai_epi16(y1_0H_8x32b, shft);
y1_0L_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_0L_8x32b);
y1_0H_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_0H_8x32b);
y1_0_32x8b = _mm256_packus_epi16(y1_0L_8x32b, y1_0H_8x32b);
_mm256_storeu_si256((__m256i *)pu1_dst, y1_0_32x8b);
ht -= 2;
pu1_src1 += src_strd1 << 1;
pu1_src2 += src_strd2 << 1;
pu1_dst += dst_strd << 1;
}
while(ht > 0);
}
}

View file

@@ -67,5 +67,6 @@ void ih264d_init_function_ptr_sse42(dec_struct_t *ps_codec);
void ih264d_init_function_ptr_a9q(dec_struct_t *ps_codec);
void ih264d_init_function_ptr_av8(dec_struct_t *ps_codec);
void ih264d_init_function_ptr_avx2(dec_struct_t *ps_codec);
#endif /* _IH264D_FUNCTION_SELECTOR_H_ */

View file

@@ -46,7 +46,8 @@ else()
list(
APPEND LIBAVCDEC_SRCS "${AVC_ROOT}/decoder/x86/ih264d_function_selector.c"
"${AVC_ROOT}/decoder/x86/ih264d_function_selector_sse42.c"
"${AVC_ROOT}/decoder/x86/ih264d_function_selector_ssse3.c")
"${AVC_ROOT}/decoder/x86/ih264d_function_selector_ssse3.c"
"${AVC_ROOT}/decoder/x86/ih264d_function_selector_avx2.c")
endif()
add_library(libavcdec STATIC ${LIBAVC_COMMON_SRCS} ${LIBAVC_COMMON_ASMS}

View file

@@ -54,7 +54,8 @@ else()
list(
APPEND LIBMVCDEC_ASMS "${AVC_ROOT}/decoder/x86/ih264d_function_selector.c"
"${AVC_ROOT}/decoder/x86/ih264d_function_selector_sse42.c"
"${AVC_ROOT}/decoder/x86/ih264d_function_selector_ssse3.c")
"${AVC_ROOT}/decoder/x86/ih264d_function_selector_ssse3.c"
"${AVC_ROOT}/decoder/x86/ih264d_function_selector_avx2.c")
endif()
add_library(libmvcdec STATIC ${LIBAVC_COMMON_SRCS} ${LIBAVC_COMMON_ASMS}

View file

@@ -95,7 +95,8 @@ else()
"${AVC_ROOT}/decoder/x86/svc/isvcd_iquant_itrans_residual_sse42.c"
"${AVC_ROOT}/decoder/x86/svc/isvcd_iquant_itrans_sse42.c"
"${AVC_ROOT}/decoder/x86/svc/isvcd_pred_residual_recon_sse42.c"
"${AVC_ROOT}/decoder/x86/svc/isvcd_residual_resamp_sse42.c")
"${AVC_ROOT}/decoder/x86/svc/isvcd_residual_resamp_sse42.c"
"${AVC_ROOT}/decoder/x86/ih264d_function_selector_avx2.c")
endif()
add_library(libsvcdec STATIC ${LIBAVC_COMMON_SRCS} ${LIBAVC_COMMON_ASMS}

View file

@@ -52,6 +52,7 @@
#include "ih264_error.h"
#include "ih264_trans_quant_itrans_iquant.h"
#include "ih264_inter_pred_filters.h"
#include "ih264_deblk_edge_filters.h"
#include "ih264d_structs.h"
#include "ih264d_function_selector.h"
@@ -68,6 +69,11 @@ void ih264d_init_function_ptr(dec_struct_t *ps_codec)
case ARCH_X86_SSSE3:
ih264d_init_function_ptr_ssse3(ps_codec);
break;
case ARCH_X86_AVX2:
ih264d_init_function_ptr_ssse3(ps_codec);
ih264d_init_function_ptr_sse42(ps_codec);
ih264d_init_function_ptr_avx2(ps_codec);
break;
case ARCH_X86_SSE42:
default:
ih264d_init_function_ptr_ssse3(ps_codec);
@@ -83,7 +89,7 @@ void ih264d_init_arch(dec_struct_t *ps_codec)
#elif DEFAULT_ARCH == D_ARCH_X86_SSSE3
ps_codec->e_processor_arch = ARCH_X86_SSSE3;
#elif DEFAULT_ARCH == D_ARCH_X86_AVX2
ps_codec->e_processor_arch = D_ARCH_X86_AVX2;
ps_codec->e_processor_arch = ARCH_X86_AVX2;
#else
ps_codec->e_processor_arch = ARCH_X86_GENERIC;
#endif

View file

@@ -0,0 +1,102 @@
/******************************************************************************
*
* Copyright (C) 2015 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*****************************************************************************
* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/**
*******************************************************************************
* @file
* ih264d_function_selector_avx2.c
*
* @brief
* Contains functions to initialize function pointers of codec context
*
* @author
* Ittiam
*
* @par List of Functions:
* - ih264d_init_function_ptr_avx2
*
* @remarks
* None
*
*******************************************************************************
*/
/*****************************************************************************/
/* File Includes */
/*****************************************************************************/
/* System Include files */
#include <stdio.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
/* User Include files */
#include "ih264_typedefs.h"
#include "iv.h"
#include "ivd.h"
#include "ih264_defs.h"
#include "ih264_size_defs.h"
#include "ih264_error.h"
#include "ih264_trans_quant_itrans_iquant.h"
#include "ih264_inter_pred_filters.h"
#include "ih264_trans_quant_itrans_iquant.h"
#include "ih264_weighted_pred.h"
#include "ih264d_structs.h"
#include "ih264_deblk_edge_filters.h"
/**
*******************************************************************************
*
* @brief Initialize the intra/inter/transform/deblk function pointers of
* codec context
*
* @par Description: the current routine initializes the function pointers of
* codec context based on the architecture in use
*
* @param[in] ps_codec
* Codec context pointer
*
* @returns none
*
* @remarks none
*
*******************************************************************************
*/
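/* Note: for ARCH_X86_AVX2, ih264d_init_function_ptr() installs the SSSE3 and
 * SSE4.2 tables first and then calls this routine, so only the function
 * pointers assigned below are overridden with their AVX2 variants.
 */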
void ih264d_init_function_ptr_avx2(dec_struct_t *ps_codec)
{
ps_codec->pf_deblk_chroma_vert_bslt4 = ih264_deblk_chroma_vert_bslt4_avx2;
ps_codec->pf_deblk_chroma_horz_bslt4 = ih264_deblk_chroma_horz_bslt4_avx2;
ps_codec->pf_deblk_luma_horz_bslt4 = ih264_deblk_luma_horz_bslt4_avx2;
//ps_codec->pf_deblk_luma_vert_bslt4 = ih264_deblk_luma_vert_bslt4_avx2;
ps_codec->apf_inter_pred_luma[5] = ih264_inter_pred_luma_horz_qpel_vert_qpel_avx2;
ps_codec->apf_inter_pred_luma[7] = ih264_inter_pred_luma_horz_qpel_vert_qpel_avx2;
ps_codec->apf_inter_pred_luma[13] = ih264_inter_pred_luma_horz_qpel_vert_qpel_avx2;
ps_codec->apf_inter_pred_luma[15] = ih264_inter_pred_luma_horz_qpel_vert_qpel_avx2;
ps_codec->pf_ihadamard_scaling_4x4 = ih264_ihadamard_scaling_4x4_avx2;
ps_codec->pf_inter_pred_chroma = ih264_inter_pred_chroma_avx2;
ps_codec->apf_inter_pred_luma[0] = ih264_inter_pred_luma_copy_avx2;
ps_codec->pf_iquant_itrans_recon_luma_4x4 = ih264_iquant_itrans_recon_4x4_avx2;
ps_codec->pf_weighted_bi_pred_luma = ih264_weighted_bi_pred_luma_avx2;
ps_codec->pf_weighted_bi_pred_chroma = ih264_weighted_bi_pred_chroma_avx2;
return;
}