mirror of
https://github.com/ittiam-systems/libavc.git
synced 2026-04-02 20:30:48 +07:00
avx2 intrinsics support for libavc(sw decoder)
Some checks failed
CMake / build (push) Has been cancelled
Some checks failed
CMake / build (push) Has been cancelled
This patch includes handwritten AVX2 intrinsics to optimize the libavc software decoder by reducing CPU-cycles overhead in the module libcodec2_soft_avcdec. Playing 1024-resolution video on the Gallery app with the HW decoder disabled: cpu-cycles overhead (%) reduced by ~15%. Loading of video thumbnails in the Gallery/Photos app is faster (we pushed approximately 30+ videos as part of the use case): cpu-cycles overhead (%) reduced by ~10%. This patch is related to s/w video decoding. Signed-off-by: Priyanka Bose <priyanka.bose@intel.corp-partner.google.com>
This commit is contained in:
parent
9331596eef
commit
ca7461442c
19 changed files with 2817 additions and 8 deletions
57
Android.bp
57
Android.bp
|
|
@ -236,6 +236,56 @@ cc_defaults {
|
|||
},
|
||||
}
|
||||
|
||||
cc_library_static {
|
||||
name: "libavc_avx2",
|
||||
defaults: ["libavc_dec_defaults"],
|
||||
visibility: ["//visibility:private"],
|
||||
export_include_dirs: [
|
||||
"common",
|
||||
"decoder",
|
||||
],
|
||||
arch: {
|
||||
x86: {
|
||||
cflags: [
|
||||
"-mavx2",
|
||||
"-mfma",
|
||||
],
|
||||
srcs: [
|
||||
"common/x86/ih264_ihadamard_scaling_avx2.c",
|
||||
"common/x86/ih264_deblk_chroma_avx2.c",
|
||||
"common/x86/ih264_deblk_luma_avx2.c",
|
||||
"common/x86/ih264_iquant_itrans_recon_avx2.c",
|
||||
"common/x86/ih264_weighted_pred_avx2.c",
|
||||
"common/x86/ih264_inter_pred_filters_avx2.c",
|
||||
"decoder/x86/ih264d_function_selector_avx2.c",
|
||||
],
|
||||
},
|
||||
x86_64: {
|
||||
cflags: [
|
||||
"-mavx2",
|
||||
"-mfma",
|
||||
],
|
||||
srcs: [
|
||||
"common/x86/ih264_ihadamard_scaling_avx2.c",
|
||||
"common/x86/ih264_deblk_chroma_avx2.c",
|
||||
"common/x86/ih264_deblk_luma_avx2.c",
|
||||
"common/x86/ih264_iquant_itrans_recon_avx2.c",
|
||||
"common/x86/ih264_weighted_pred_avx2.c",
|
||||
"common/x86/ih264_inter_pred_filters_avx2.c",
|
||||
"decoder/x86/ih264d_function_selector_avx2.c",
|
||||
],
|
||||
},
|
||||
},
|
||||
sanitize: {
|
||||
blocklist: "libavc_blocklist.txt",
|
||||
},
|
||||
apex_available: [
|
||||
"//apex_available:platform", // used by libstagefright_soft_avcdec
|
||||
"com.android.media.swcodec",
|
||||
],
|
||||
min_sdk_version: "29",
|
||||
}
|
||||
|
||||
cc_library_static {
|
||||
name: "libavcdec",
|
||||
defaults: ["libavc_dec_defaults"],
|
||||
|
|
@ -380,6 +430,10 @@ cc_library_static {
|
|||
"decoder/x86/ih264d_function_selector_sse42.c",
|
||||
"decoder/x86/ih264d_function_selector_ssse3.c",
|
||||
],
|
||||
whole_static_libs: [
|
||||
"libavc_avx2",
|
||||
],
|
||||
|
||||
},
|
||||
|
||||
x86_64: {
|
||||
|
|
@ -400,6 +454,9 @@ cc_library_static {
|
|||
"decoder/x86/ih264d_function_selector_sse42.c",
|
||||
"decoder/x86/ih264d_function_selector_ssse3.c",
|
||||
],
|
||||
whole_static_libs: [
|
||||
"libavc_avx2",
|
||||
],
|
||||
},
|
||||
},
|
||||
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@ function(libavc_add_compile_options)
|
|||
elseif("${SYSTEM_PROCESSOR}" STREQUAL "aarch32")
|
||||
add_compile_options(-march=armv7-a -mfpu=neon)
|
||||
else()
|
||||
add_compile_options(-msse4.2 -mno-avx)
|
||||
add_compile_options(-msse4.2 -mavx2 -mfma)
|
||||
endif()
|
||||
add_compile_options(-Wdeclaration-after-statement)
|
||||
|
||||
|
|
@ -45,8 +45,8 @@ function(libavc_add_definitions)
|
|||
elseif("${SYSTEM_PROCESSOR}" STREQUAL "aarch32")
|
||||
add_definitions(-DARMV7 -DDEFAULT_ARCH=D_ARCH_ARM_A9Q)
|
||||
else()
|
||||
add_definitions(-DX86 -DX86_LINUX=1 -DDISABLE_AVX2
|
||||
-DDEFAULT_ARCH=D_ARCH_X86_SSE42)
|
||||
add_definitions(-DX86 -DX86_LINUX=1
|
||||
-DDEFAULT_ARCH=D_ARCH_X86_SSE42)
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
|
|
|
|||
|
|
@ -108,7 +108,13 @@ else()
|
|||
"${AVC_ROOT}/common/x86/ih264_mem_fns_ssse3.c"
|
||||
"${AVC_ROOT}/common/x86/ih264_padding_ssse3.c"
|
||||
"${AVC_ROOT}/common/x86/ih264_resi_trans_quant_sse42.c"
|
||||
"${AVC_ROOT}/common/x86/ih264_weighted_pred_sse42.c")
|
||||
"${AVC_ROOT}/common/x86/ih264_weighted_pred_sse42.c"
|
||||
"${AVC_ROOT}/common/x86/ih264_ihadamard_scaling_avx2.c"
|
||||
"${AVC_ROOT}/common/x86/ih264_deblk_chroma_avx2.c"
|
||||
"${AVC_ROOT}/common/x86/ih264_deblk_luma_avx2.c"
|
||||
"${AVC_ROOT}/common/x86/ih264_iquant_itrans_recon_avx2.c"
|
||||
"${AVC_ROOT}/common/x86/ih264_weighted_pred_avx2.c"
|
||||
"${AVC_ROOT}/common/x86/ih264_inter_pred_filters_avx2.c")
|
||||
|
||||
include_directories(${AVC_ROOT}/common/x86)
|
||||
endif()
|
||||
|
|
|
|||
|
|
@ -159,4 +159,12 @@ ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_horz_bslt4_ssse3;
|
|||
ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_vert_bslt4_mbaff_ssse3;
|
||||
ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_horz_bslt4_mbaff_ssse3;
|
||||
|
||||
|
||||
/* AVX2 */
|
||||
ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_vert_bslt4_avx2;
|
||||
ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_horz_bslt4_avx2;
|
||||
ih264_deblk_edge_bslt4_ft ih264_deblk_luma_horz_bslt4_avx2;
|
||||
ih264_deblk_edge_bslt4_ft ih264_deblk_luma_vert_bslt4_avx2;
|
||||
|
||||
|
||||
#endif /* _IH264_DEBLK_EDGE_FILTERS_H_ */
|
||||
|
|
|
|||
|
|
@ -138,4 +138,11 @@ ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_qpel_vert_hpel_ssse3;
|
|||
ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_hpel_vert_qpel_ssse3;
|
||||
ih264_inter_pred_chroma_ft ih264_inter_pred_chroma_ssse3;
|
||||
|
||||
/* AVX2 Intrinsic Declarations */
|
||||
|
||||
ih264_inter_pred_chroma_ft ih264_inter_pred_chroma_avx2;
|
||||
ih264_inter_pred_luma_ft ih264_inter_pred_luma_copy_avx2;
|
||||
ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_qpel_vert_qpel_avx2;
|
||||
|
||||
|
||||
#endif /* _IH264_INTER_PRED_FILTERS_H_ */
|
||||
|
|
|
|||
|
|
@ -231,4 +231,10 @@ ih264_ihadamard_scaling_ft ih264_ihadamard_scaling_4x4_sse42;
|
|||
ih264_hadamard_quant_ft ih264_hadamard_quant_4x4_sse42;
|
||||
ih264_hadamard_quant_ft ih264_hadamard_quant_2x2_uv_sse42;
|
||||
|
||||
/*AVX2 Declarations*/
|
||||
ih264_ihadamard_scaling_ft ih264_ihadamard_scaling_4x4_avx2;
|
||||
ih264_ihadamard_scaling_ft ih264_ihadamard_scaling_2x2_uv_avx2;
|
||||
ih264_iquant_itrans_recon_ft ih264_iquant_itrans_recon_4x4_avx2;
|
||||
|
||||
|
||||
#endif /* _IH264_TRANS_QUANT_ITRANS_IQUANT_H_ */
|
||||
|
|
|
|||
|
|
@ -106,5 +106,10 @@ ih264_weighted_pred_ft ih264_weighted_pred_chroma_sse42;
|
|||
ih264_weighted_bi_pred_ft ih264_weighted_bi_pred_luma_sse42;
|
||||
ih264_weighted_bi_pred_ft ih264_weighted_bi_pred_chroma_sse42;
|
||||
|
||||
|
||||
/* AVX2 */
|
||||
ih264_weighted_bi_pred_ft ih264_weighted_bi_pred_luma_avx2;
|
||||
ih264_weighted_bi_pred_ft ih264_weighted_bi_pred_chroma_avx2;
|
||||
|
||||
#endif /* _IH264_WEIGHTED_PRED_H_ */
|
||||
|
||||
|
|
|
|||
386
common/x86/ih264_deblk_chroma_avx2.c
Normal file
386
common/x86/ih264_deblk_chroma_avx2.c
Normal file
|
|
@ -0,0 +1,386 @@
|
|||
/******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2015 The Android Open Source Project
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at:
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*****************************************************************************
|
||||
* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
|
||||
*/
|
||||
/*****************************************************************************/
|
||||
|
||||
/*****************************************************************************/
|
||||
/* File Includes */
|
||||
/*****************************************************************************/
|
||||
|
||||
/* System include files */
|
||||
#include <stdio.h>
|
||||
|
||||
#ifdef __ANDROID__
|
||||
#include "log/log.h"
|
||||
#include <cutils/log.h>
|
||||
#endif
|
||||
|
||||
/* User include files */
|
||||
#include "ih264_typedefs.h"
|
||||
#include "ih264_platform_macros.h"
|
||||
#include "ih264_deblk_edge_filters.h"
|
||||
#include "ih264_macros.h"
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <immintrin.h>
|
||||
|
||||
|
||||
|
||||
/*****************************************************************************/
|
||||
/* */
|
||||
/* Function Name : ih264_deblk_chroma_vert_bslt4_avx2() */
|
||||
/* */
|
||||
/* Description : This function performs filtering of a chroma block */
|
||||
/* vertical edge when the boundary strength is less than 4 */
|
||||
/* in high profile. */
|
||||
/* */
|
||||
/* Inputs : pu1_src - pointer to the src sample q0 of U */
|
||||
/* src_strd - source stride */
|
||||
/* alpha_cb - alpha value for the boundary in U */
|
||||
/* beta_cb - beta value for the boundary in U */
|
||||
/* alpha_cr - alpha value for the boundary in V */
|
||||
/* beta_cr - beta value for the boundary in V */
|
||||
/* u4_bs - packed Boundary strength array */
|
||||
/* pu1_cliptab_cb - tc0_table for U */
|
||||
/* pu1_cliptab_cr - tc0_table for V */
|
||||
/* */
|
||||
/* Globals : None */
|
||||
/* */
|
||||
/* Processing : This operation is described in Sec. 8.7.2.3 under the */
|
||||
/* title "Filtering process for edges for bS less than 4" */
|
||||
/* in ITU T Rec H.264 with alpha and beta values different */
|
||||
/* in U and V. */
|
||||
/* */
|
||||
/* Outputs : None */
|
||||
/* */
|
||||
/* Returns : None */
|
||||
/* */
|
||||
/* Issues : None */
|
||||
/* */
|
||||
/* Revision History: */
|
||||
/* */
|
||||
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
|
||||
/* 12 02 2015 Naveen Kumar P Initial version */
|
||||
/* 15 09 2020 Priyanka Bose AVX2 Intel Intrinsics Support */
|
||||
/*****************************************************************************/
|
||||
|
||||
void ih264_deblk_chroma_vert_bslt4_avx2(UWORD8 *pu1_src,
|
||||
WORD32 src_strd,
|
||||
WORD32 alpha_cb,
|
||||
WORD32 beta_cb,
|
||||
WORD32 alpha_cr,
|
||||
WORD32 beta_cr,
|
||||
UWORD32 u4_bs,
|
||||
const UWORD8 *pu1_cliptab_cb,
|
||||
const UWORD8 *pu1_cliptab_cr)
|
||||
{
|
||||
UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
|
||||
UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
|
||||
WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
|
||||
WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
|
||||
__m128i linea, lineb, linec, lined, linee, linef, lineg, lineh;
|
||||
__m256i lineab, linecd, lineef, linegh, lineae, linebf, linecg, linedh;
|
||||
__m256i temp1, temp2, temp3, temp4;
|
||||
__m256i t1,t3, t2,t4,pq0_uv_32x8,pq1_uv_32x8,tmp1,tmp2,p0_uv_8x32,q0_uv_8x32;
|
||||
|
||||
__m256i pq0_uv_8x32, pq1_uv_8x32, p1_uv_8x32,pq0_uv_8x32_1,pq0_uv_8x32_2;
|
||||
__m256i flag_bs, flag1, flag2;
|
||||
__m256i diff, diff1, alpha_cbcr_32x8, beta_cbcr_32x8, in_macro;
|
||||
__m256i zero = _mm256_setzero_si256();
|
||||
__m256i C0_uv_8x32;
|
||||
__m256i p0_uv_8x32_1, p0_uv_8x32_2, q0_uv_8x32_1, q0_uv_8x32_2,p0_uv_32x8_1,q0_uv_32x8_1;
|
||||
|
||||
u1_Bs0 = (u4_bs >> 24) & 0xff;
|
||||
u1_Bs1 = (u4_bs >> 16) & 0xff;
|
||||
u1_Bs2 = (u4_bs >> 8) & 0xff;
|
||||
u1_Bs3 = (u4_bs >> 0) & 0xff;
|
||||
|
||||
flag_bs = _mm256_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3,u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3,
|
||||
u1_Bs2, u1_Bs2, u1_Bs2, u1_Bs2, u1_Bs2, u1_Bs2,u1_Bs2, u1_Bs2,
|
||||
u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
|
||||
u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0,u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
|
||||
flag_bs = _mm256_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s
|
||||
flag_bs = _mm256_xor_si256(flag_bs, _mm256_set1_epi8(0xFF)); //Invert for required mask
|
||||
|
||||
/* Load and transpose the pixel values */
|
||||
lineab = _mm256_loadu2_m128i((__m128i *)(pu1_src_uv - 4 + src_strd), (__m128i *)(pu1_src_uv - 4));
|
||||
linecd = _mm256_loadu2_m128i((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), (__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
|
||||
lineef = _mm256_loadu2_m128i((__m128i *)(pu1_src_uv - 4 + 5 * src_strd), (__m128i *)(pu1_src_uv - 4 + 4 * src_strd));
|
||||
linegh = _mm256_loadu2_m128i((__m128i *)(pu1_src_uv - 4 + 7 * src_strd), (__m128i *)(pu1_src_uv - 4 + 6 * src_strd));
|
||||
|
||||
temp1 = _mm256_unpacklo_epi64(lineab, zero); //a0 -- a7 000.. b0..b7 000
|
||||
temp2 = _mm256_unpacklo_epi64(linecd, zero);
|
||||
temp3 = _mm256_unpacklo_epi64(lineef, zero); //e0 -- e7 000.. f0..f7 000
|
||||
temp4 = _mm256_unpacklo_epi64(linegh, zero);
|
||||
|
||||
temp1 = _mm256_unpacklo_epi16(temp1, temp2); //a0 a1 c0 c1 -- a6 a7 c6 c7 b0 b1 d0 d1.. b6 b7 d6 d7
|
||||
temp2 = _mm256_unpacklo_epi16(temp3, temp4); //e0 e1 g0 g1 f0 f1 h0 h1
|
||||
|
||||
t2 = _mm256_permute2f128_si256(temp1, temp2, 0x20);
|
||||
t3 = _mm256_permute2f128_si256(temp1, temp2, 0x31);
|
||||
|
||||
tmp1 = _mm256_unpacklo_epi16(t2, t3); //a0 a1 b0 b1 c0 c1 d0 d1 -a2 a3 b2 b3 .... e0 e1 f0 f1 g0 g1 h0 h1 -e2 e3..
|
||||
tmp2 = _mm256_unpackhi_epi16(t2, t3); //a4 a5 b4 b5 -a6 a7 b6 b7
|
||||
|
||||
|
||||
temp1 = _mm256_unpacklo_epi8(tmp1,zero); // a0 0 a1 0 b0 0 b1 0 c0 0 c1 0 d0 0 d1 0 - e0 0 e1 0 .. => p1
|
||||
temp2 = _mm256_unpackhi_epi8(tmp1,zero); // a2 0 a3 0 => p0
|
||||
temp3 = _mm256_unpacklo_epi8(tmp2,zero); //a4 0 a5 0 => q0
|
||||
temp4 = _mm256_unpackhi_epi8(tmp2,zero); //a6 0 a7 0 => q1
|
||||
|
||||
pq1_uv_32x8 = _mm256_packus_epi16(temp1,temp4); // 0213
|
||||
pq0_uv_32x8 = _mm256_packus_epi16(temp2,temp3); //0213
|
||||
|
||||
diff = _mm256_subs_epi16(temp2, temp3); //Condn 1 (p0 -q0) - set (3), set(3)
|
||||
diff = _mm256_abs_epi16(diff);
|
||||
alpha_cbcr_32x8 = _mm256_set1_epi32(alpha_cbcr);
|
||||
flag1 = _mm256_cmpgt_epi16(alpha_cbcr_32x8, diff);
|
||||
|
||||
diff = _mm256_subs_epi16(temp4, temp3); //Condtn 2 (q1 -q0)
|
||||
diff = _mm256_abs_epi16(diff);
|
||||
beta_cbcr_32x8 = _mm256_set1_epi32(beta_cbcr);
|
||||
flag1 = _mm256_and_si256(flag1, _mm256_cmpgt_epi16(beta_cbcr_32x8, diff));
|
||||
|
||||
|
||||
diff = _mm256_subs_epi16(temp1, temp2); //Condtn 3 (p1 -p0)
|
||||
diff = _mm256_abs_epi16(diff);
|
||||
flag1 = _mm256_and_si256(flag1, _mm256_cmpgt_epi16(beta_cbcr_32x8, diff));
|
||||
|
||||
diff = _mm256_subs_epi16(temp3, temp2); //(q0 -p0)
|
||||
diff = _mm256_slli_epi16(diff, 2);
|
||||
|
||||
diff1 = _mm256_subs_epi16(temp1, temp4); //(p1 -q1)
|
||||
diff = _mm256_add_epi16(diff, diff1);
|
||||
|
||||
diff = _mm256_add_epi16(diff, _mm256_set1_epi16(4));
|
||||
in_macro = _mm256_srai_epi16(diff, 3);
|
||||
|
||||
|
||||
C0_uv_8x32 = _mm256_set_epi16(pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
|
||||
pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
|
||||
pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0],
|
||||
pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0],
|
||||
pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
|
||||
pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
|
||||
pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
|
||||
pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2]);
|
||||
|
||||
C0_uv_8x32 = _mm256_add_epi16(C0_uv_8x32, _mm256_set1_epi16(1));
|
||||
|
||||
in_macro = _mm256_min_epi16(C0_uv_8x32, in_macro); //CLIP3
|
||||
C0_uv_8x32 = _mm256_subs_epi16(zero, C0_uv_8x32);
|
||||
in_macro = _mm256_max_epi16(C0_uv_8x32, in_macro);
|
||||
|
||||
p0_uv_8x32_1 = _mm256_add_epi16(temp2, in_macro);
|
||||
q0_uv_8x32_1 = _mm256_sub_epi16(temp3, in_macro);
|
||||
|
||||
|
||||
flag1 = _mm256_and_si256(flag1, flag_bs);
|
||||
flag1 = _mm256_packs_epi16(flag1, flag1); // 0213
|
||||
|
||||
pq0_uv_8x32 = _mm256_packus_epi16(p0_uv_8x32_1,q0_uv_8x32_1); //0213
|
||||
|
||||
pq0_uv_8x32_1 = _mm256_and_si256(pq0_uv_32x8,
|
||||
_mm256_xor_si256(flag1, _mm256_set1_epi8(0xFF)));
|
||||
pq0_uv_8x32_2 = _mm256_and_si256(pq0_uv_8x32, flag1);
|
||||
pq0_uv_32x8 = _mm256_add_epi8(pq0_uv_8x32_1, pq0_uv_8x32_2);
|
||||
|
||||
|
||||
t1 = _mm256_unpacklo_epi16(pq1_uv_32x8, pq0_uv_32x8); // temp1 temp3
|
||||
t2 = _mm256_unpackhi_epi16(pq1_uv_32x8, pq0_uv_32x8); // temp2 temp4
|
||||
|
||||
t4 = _mm256_shufflelo_epi16(t2, _MM_SHUFFLE(2, 3, 0, 1)); // pshuflw
|
||||
t4 = _mm256_shufflehi_epi16(t4, _MM_SHUFFLE(2, 3, 0, 1));
|
||||
|
||||
lineae = _mm256_unpacklo_epi32(t1, t4); // temp1 temp3
|
||||
linecg = _mm256_unpackhi_epi32(t1, t4); // temp2 temp4
|
||||
|
||||
linea = _mm256_castsi256_si128(lineae);
|
||||
lineb = _mm256_castsi256_si128(_mm256_srli_si256(lineae, 8));
|
||||
lineae = _mm256_permute2f128_si256(lineae, lineae, 0x1);
|
||||
linee = _mm256_castsi256_si128(lineae);
|
||||
linef = _mm256_castsi256_si128(_mm256_srli_si256(lineae, 8));
|
||||
|
||||
|
||||
linec = _mm256_castsi256_si128(linecg);
|
||||
lined = _mm256_castsi256_si128(_mm256_srli_si256(linecg, 8));
|
||||
linecg = _mm256_permute2f128_si256(linecg, linecg, 0x1);
|
||||
lineg = _mm256_castsi256_si128(linecg);
|
||||
lineh = _mm256_castsi256_si128(_mm256_srli_si256(linecg, 8));
|
||||
|
||||
_mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
|
||||
_mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
|
||||
_mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
|
||||
_mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);
|
||||
_mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd), linee);
|
||||
_mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd), linef);
|
||||
_mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd), lineg);
|
||||
_mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd), lineh);
|
||||
|
||||
}
|
||||
|
||||
/*****************************************************************************/
|
||||
/* */
|
||||
/* Function Name : ih264_deblk_chroma_horz_bslt4_avx2() */
|
||||
/* */
|
||||
/* Description : This function performs filtering of a chroma block */
|
||||
/* horizontal edge when the boundary strength is less than */
|
||||
/* 4 in high profile. */
|
||||
/* */
|
||||
/* Inputs : pu1_src - pointer to the src sample q0 of U */
|
||||
/* src_strd - source stride */
|
||||
/* alpha_cb - alpha value for the boundary in U */
|
||||
/* beta_cb - beta value for the boundary in U */
|
||||
/* alpha_cr - alpha value for the boundary in V */
|
||||
/* beta_cr - beta value for the boundary in V */
|
||||
/* u4_bs - packed Boundary strength array */
|
||||
/* pu1_cliptab_cb - tc0_table for U */
|
||||
/* pu1_cliptab_cr - tc0_table for V */
|
||||
/* */
|
||||
/* Globals : None */
|
||||
/* */
|
||||
/* Processing : This operation is described in Sec. 8.7.2.3 under the */
|
||||
/* title "Filtering process for edges for bS less than 4" */
|
||||
/* in ITU T Rec H.264 with alpha and beta values different */
|
||||
/* in U and V. */
|
||||
/* */
|
||||
/* Outputs : None */
|
||||
/* */
|
||||
/* Returns : None */
|
||||
/* */
|
||||
/* Issues : None */
|
||||
/* */
|
||||
/* Revision History: */
|
||||
/* */
|
||||
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
|
||||
/* 12 02 2015 Naveen Kumar P Initial version */
|
||||
/* 12 10 2020 Priyanka Bose AVX2 Intel Intrinsics Support */
|
||||
/*****************************************************************************/
|
||||
void ih264_deblk_chroma_horz_bslt4_avx2 (UWORD8 *pu1_src,
|
||||
WORD32 src_strd,
|
||||
WORD32 alpha_cb,
|
||||
WORD32 beta_cb,
|
||||
WORD32 alpha_cr,
|
||||
WORD32 beta_cr,
|
||||
UWORD32 u4_bs,
|
||||
const UWORD8 *pu1_cliptab_cb,
|
||||
const UWORD8 *pu1_cliptab_cr)
|
||||
{
|
||||
UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
|
||||
WORD16 i16_posP1, i16_posP0, i16_posQ1;
|
||||
UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
|
||||
|
||||
UWORD8 *pu1_HorzPixelUV; /*! < Pointer to the first pixel of the boundary */
|
||||
WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
|
||||
WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
|
||||
__m256i p0q0_uv_32x8,p1q1_uv_32x8;
|
||||
__m256i temp1,temp2,temp3,temp4;
|
||||
__m256i flag_bs, flag1, flag2;
|
||||
__m256i diff, diff1, alpha_cbcr_32x8, beta_cbcr_32x8, in_macro;
|
||||
__m256i zero = _mm256_setzero_si256();
|
||||
__m256i C0_uv_8x32;
|
||||
__m256i p0q0_uv_8x32_1, p0q0_uv_8x32_2,res1,res2,p0_uv_8x32_1,q0_uv_8x32_1;
|
||||
|
||||
pu1_HorzPixelUV = pu1_src_uv - (src_strd << 1);
|
||||
|
||||
i16_posQ1 = src_strd;
|
||||
i16_posP0 = src_strd;
|
||||
i16_posP1 = 0;
|
||||
|
||||
u1_Bs0 = (u4_bs >> 24) & 0xff;
|
||||
u1_Bs1 = (u4_bs >> 16) & 0xff;
|
||||
u1_Bs2 = (u4_bs >> 8) & 0xff;
|
||||
u1_Bs3 = (u4_bs >> 0) & 0xff;
|
||||
|
||||
flag_bs = _mm256_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3,
|
||||
u1_Bs2, u1_Bs2, u1_Bs2, u1_Bs2,
|
||||
u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3,
|
||||
u1_Bs2, u1_Bs2,u1_Bs2, u1_Bs2,
|
||||
u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
|
||||
u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0,
|
||||
u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
|
||||
u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
|
||||
flag_bs = _mm256_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s
|
||||
flag_bs = _mm256_xor_si256(flag_bs, _mm256_set1_epi8(0xFF)); //Invert for required mask
|
||||
|
||||
p0q0_uv_32x8 = _mm256_loadu2_m128i((__m128i *)(pu1_src_uv), (__m128i *)(pu1_HorzPixelUV + i16_posP0));
|
||||
p1q1_uv_32x8 = _mm256_loadu2_m128i((__m128i *)(pu1_src_uv + i16_posQ1), (__m128i *)(pu1_HorzPixelUV + i16_posP1));
|
||||
|
||||
res1 = _mm256_permute4x64_epi64(p0q0_uv_32x8,0xD8);
|
||||
res2 = _mm256_permute4x64_epi64(p1q1_uv_32x8,0xD8);
|
||||
|
||||
temp3 = _mm256_unpacklo_epi8(res1, zero); //p0 l 0 h 0
|
||||
temp4 = _mm256_unpackhi_epi8(res1, zero); //q0
|
||||
temp1 = _mm256_unpacklo_epi8(res2, zero); //p1
|
||||
temp2 = _mm256_unpackhi_epi8(res2, zero); //q1
|
||||
|
||||
diff = _mm256_subs_epi16(temp3, temp4); //Condn 1 //p0 l h - q0 l h
|
||||
diff = _mm256_abs_epi16(diff);
|
||||
alpha_cbcr_32x8 = _mm256_set1_epi32(alpha_cbcr);
|
||||
flag1 = _mm256_cmpgt_epi16(alpha_cbcr_32x8, diff);
|
||||
|
||||
diff = _mm256_subs_epi16(temp2, temp4); //Condtn 2
|
||||
diff = _mm256_abs_epi16(diff);
|
||||
beta_cbcr_32x8 = _mm256_set1_epi32(beta_cbcr);
|
||||
flag1 = _mm256_and_si256(flag1, _mm256_cmpgt_epi16(beta_cbcr_32x8, diff));
|
||||
|
||||
diff = _mm256_subs_epi16(temp1, temp3); //Condtn 3
|
||||
diff = _mm256_abs_epi16(diff);
|
||||
flag1 = _mm256_and_si256(flag1, _mm256_cmpgt_epi16(beta_cbcr_32x8, diff));
|
||||
|
||||
diff = _mm256_subs_epi16(temp4, temp3);
|
||||
diff = _mm256_slli_epi16(diff, 2);
|
||||
diff1 = _mm256_subs_epi16(temp1, temp2);
|
||||
diff = _mm256_add_epi16(diff, diff1);
|
||||
diff = _mm256_add_epi16(diff, _mm256_set1_epi16(4));
|
||||
in_macro = _mm256_srai_epi16(diff, 3);
|
||||
|
||||
C0_uv_8x32 = _mm256_set_epi16(
|
||||
pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
|
||||
pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
|
||||
pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
|
||||
pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
|
||||
pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
|
||||
pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
|
||||
pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0],
|
||||
pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]);
|
||||
|
||||
C0_uv_8x32 = _mm256_add_epi16(C0_uv_8x32, _mm256_set1_epi16(1));
|
||||
|
||||
in_macro = _mm256_min_epi16(C0_uv_8x32, in_macro); //CLIP3
|
||||
C0_uv_8x32 = _mm256_subs_epi16(zero, C0_uv_8x32);
|
||||
in_macro = _mm256_max_epi16(C0_uv_8x32, in_macro);
|
||||
|
||||
p0_uv_8x32_1 = _mm256_add_epi16(temp3, in_macro);
|
||||
q0_uv_8x32_1 = _mm256_sub_epi16(temp4, in_macro);
|
||||
|
||||
p0q0_uv_8x32_2 = _mm256_packus_epi16(p0_uv_8x32_1,q0_uv_8x32_1);
|
||||
flag1 = _mm256_packs_epi16(flag1, flag1);
|
||||
flag1 = _mm256_and_si256(flag1, flag_bs); //Final flag (BS condition + other 3 conditions)
|
||||
|
||||
p0q0_uv_8x32_1 = _mm256_and_si256(res1,
|
||||
_mm256_xor_si256(flag1, _mm256_set1_epi8(0xFF)));
|
||||
p0q0_uv_8x32_2 = _mm256_and_si256(p0q0_uv_8x32_2, flag1);
|
||||
p0q0_uv_8x32_1 = _mm256_add_epi8(p0q0_uv_8x32_1, p0q0_uv_8x32_2);
|
||||
p0q0_uv_8x32_1 = _mm256_permute4x64_epi64(p0q0_uv_8x32_1,0xD8);
|
||||
|
||||
_mm256_storeu2_m128i((__m128i *)(pu1_src_uv),(__m128i *)(pu1_HorzPixelUV + i16_posP0), p0q0_uv_8x32_1);
|
||||
|
||||
}
|
||||
275
common/x86/ih264_deblk_luma_avx2.c
Normal file
275
common/x86/ih264_deblk_luma_avx2.c
Normal file
|
|
@ -0,0 +1,275 @@
|
|||
/******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2015 The Android Open Source Project
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at:
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*****************************************************************************
|
||||
* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
|
||||
*/
|
||||
/*****************************************************************************/
|
||||
/* */
|
||||
/* File Name : ih264_deblk_luma_avx2.c */
|
||||
/* */
|
||||
/* Description : Contains function definitions for deblocking */
|
||||
/* */
|
||||
/* List of Functions : ih264_deblk_luma_horz_bslt4_avx2() */
|
||||
/* ih264_deblk_luma_vert_bslt4_avx2() */
|
||||
/* */
|
||||
/* Issues / Problems : None */
|
||||
/* */
|
||||
/* Revision History : */
|
||||
/* */
|
||||
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
|
||||
/* 12 02 2015 Naveen Kumar P Added luma deblocking ssse3 */
|
||||
/* intrinsics */
|
||||
/* 15 09 2020 Priyanka Bose AVX2 Intel Intrinsics Support */
|
||||
/*****************************************************************************/
|
||||
|
||||
/*****************************************************************************/
|
||||
/* File Includes */
|
||||
/*****************************************************************************/
|
||||
|
||||
/* System include files */
|
||||
#include <stdio.h>
|
||||
#ifdef __ANDROID__
|
||||
#include "log/log.h"
|
||||
#include <cutils/log.h>
|
||||
#endif
|
||||
|
||||
/* User include files */
|
||||
#include "ih264_typedefs.h"
|
||||
#include "ih264_platform_macros.h"
|
||||
#include "ih264_deblk_edge_filters.h"
|
||||
#include "ih264_macros.h"
|
||||
|
||||
|
||||
/*****************************************************************************/
|
||||
/* */
|
||||
/* Function Name : ih264_deblk_luma_horz_bslt4_avx2() */
|
||||
/* */
|
||||
/* Description : This function performs filtering of a luma block */
|
||||
/* horizontal edge when boundary strength is less than 4. */
|
||||
/* */
|
||||
/* Inputs : pu1_src - pointer to the src sample q0 */
|
||||
/* src_strd - source stride */
|
||||
/* alpha - alpha value for the boundary */
|
||||
/* beta - beta value for the boundary */
|
||||
/* u4_bs - packed Boundary strength array */
|
||||
/* pu1_cliptab - tc0_table */
|
||||
/* */
|
||||
/* Globals : None */
|
||||
/* */
|
||||
/* Processing : This operation is described in Sec. 8.7.2.3 under the */
|
||||
/* title "Filtering process for edges for bS less than 4" */
|
||||
/* in ITU T Rec H.264. */
|
||||
/* */
|
||||
/* Outputs : None */
|
||||
/* */
|
||||
/* Returns : None */
|
||||
/* */
|
||||
/* Issues : None */
|
||||
/* */
|
||||
/* Revision History: */
|
||||
/* */
|
||||
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
|
||||
/* 12 02 2015 Naveen Kumar P Initial version */
|
||||
/* 15 09 2020 Priyanka Bose AVX2 Intel Intrinsics Support */
|
||||
/*****************************************************************************/
|
||||
void ih264_deblk_luma_horz_bslt4_avx2(UWORD8 *pu1_src,
|
||||
WORD32 src_strd,
|
||||
WORD32 alpha,
|
||||
WORD32 beta,
|
||||
UWORD32 u4_bs,
|
||||
const UWORD8 *pu1_cliptab)
|
||||
{
|
||||
|
||||
WORD16 i16_posP2, i16_posP1, i16_posP0, i16_posQ1, i16_posQ2;
|
||||
UWORD8 *pu1_HorzPixel;
|
||||
__m256i zero = _mm256_setzero_si256();
|
||||
__m128i zero_128 = _mm_setzero_si128();
|
||||
__m128i Alpha_8x16,bs_flag_16x8b, C0_16x8, C0_8x16, C0_hi_8x16;
|
||||
__m256i Beta_8x32,in_macro_32x8,in_macro_1,in_macro_2,flag1_32x8,flag2_32x8;
|
||||
__m256i C_8x32,C0_8x32_res,temp1,temp2,temp3,temp4,res1,res2,q0p1_32x8,p0q1_32x8;
|
||||
__m128i p0_16x8,q0_16x8,temp1_128,temp2_128,flag1_16x8_128;
|
||||
__m256i const_val4_8x32,p0q0_32x8,p1q1_32x8,p2q2_32x8,q0p0_32x8;
|
||||
UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
|
||||
UWORD8 clip0, clip1, clip2, clip3;
|
||||
|
||||
pu1_HorzPixel = pu1_src - (src_strd << 2);
|
||||
|
||||
i16_posQ1 = src_strd;
|
||||
i16_posQ2 = X2(src_strd);
|
||||
i16_posP0 = X3(src_strd);
|
||||
i16_posP1 = X2(src_strd);
|
||||
i16_posP2 = src_strd;
|
||||
|
||||
p0q0_32x8 = _mm256_loadu2_m128i((__m128i *)(pu1_src), (__m128i *)(pu1_HorzPixel + i16_posP0)); //lower -p0 higher-q0
|
||||
p1q1_32x8 = _mm256_loadu2_m128i((__m128i *)(pu1_src + i16_posQ1), (__m128i *)(pu1_HorzPixel + i16_posP1)); //l= p1, h=q1
|
||||
p2q2_32x8 = _mm256_loadu2_m128i((__m128i *)(pu1_src + i16_posQ2), (__m128i *)(pu1_HorzPixel + i16_posP2));
|
||||
|
||||
u1_Bs0 = (u4_bs >> 24) & 0xff;
|
||||
u1_Bs1 = (u4_bs >> 16) & 0xff;
|
||||
u1_Bs2 = (u4_bs >> 8) & 0xff;
|
||||
u1_Bs3 = (u4_bs >> 0) & 0xff;
|
||||
clip0 = pu1_cliptab[u1_Bs0];
|
||||
clip1 = pu1_cliptab[u1_Bs1];
|
||||
clip2 = pu1_cliptab[u1_Bs2];
|
||||
clip3 = pu1_cliptab[u1_Bs3];
|
||||
|
||||
Alpha_8x16 = _mm_set1_epi16(alpha);
|
||||
Beta_8x32 = _mm256_set1_epi16(beta);
|
||||
|
||||
bs_flag_16x8b = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3,
|
||||
u1_Bs2, u1_Bs2, u1_Bs2, u1_Bs2,
|
||||
u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
|
||||
u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
|
||||
|
||||
C0_16x8 = _mm_set_epi8(clip3, clip3, clip3, clip3, clip2, clip2, clip2,
|
||||
clip2, clip1, clip1, clip1, clip1, clip0, clip0,
|
||||
clip0, clip0);
|
||||
|
||||
bs_flag_16x8b = _mm_cmpeq_epi8(bs_flag_16x8b, zero_128);
|
||||
bs_flag_16x8b = _mm_xor_si128(bs_flag_16x8b, _mm_set1_epi8(0xFF)); //Invert for required mask
|
||||
C0_8x16 = _mm_unpacklo_epi8(C0_16x8, zero_128);
|
||||
C0_hi_8x16 = _mm_unpackhi_epi8(C0_16x8, zero_128);
|
||||
C0_8x32_res = _mm256_set_m128i(C0_hi_8x16,C0_8x16);
|
||||
|
||||
//Cond1 (ABS(p0 - q0) < alpha)
|
||||
p0_16x8 = _mm256_castsi256_si128(p0q0_32x8);
|
||||
q0p0_32x8 = _mm256_permute2x128_si256(p0q0_32x8, p0q0_32x8, 0x1);
|
||||
q0_16x8 = _mm256_castsi256_si128(p0q0_32x8);
|
||||
temp1_128 = _mm_subs_epu8(q0_16x8, p0_16x8);
|
||||
temp2_128 = _mm_subs_epu8(p0_16x8, q0_16x8);
|
||||
temp1_128 = _mm_add_epi8(temp1_128, temp2_128);
|
||||
|
||||
temp2_128 = _mm_unpacklo_epi8(temp1_128, zero_128);
|
||||
temp1_128 = _mm_unpackhi_epi8(temp1_128, zero_128);
|
||||
|
||||
temp2_128 = _mm_cmpgt_epi16(Alpha_8x16, temp2_128);
|
||||
temp1_128 = _mm_cmpgt_epi16(Alpha_8x16, temp1_128);
|
||||
flag1_16x8_128 = _mm_packs_epi16(temp2_128, temp1_128);
|
||||
flag1_16x8_128 = _mm_and_si128(flag1_16x8_128, bs_flag_16x8b);
|
||||
|
||||
flag1_32x8 = _mm256_set_m128i(flag1_16x8_128,flag1_16x8_128);
|
||||
|
||||
//Cond2 (ABS(q1 - q0) < beta) & Cond3 (ABS(p1 - p0) < beta)
|
||||
temp1 = _mm256_subs_epu8(p0q0_32x8, p1q1_32x8);
|
||||
temp2 = _mm256_subs_epu8(p1q1_32x8, p0q0_32x8);
|
||||
temp1 = _mm256_add_epi8(temp1, temp2);
|
||||
|
||||
temp2 = _mm256_unpacklo_epi8(temp1, zero);
|
||||
temp1 = _mm256_unpackhi_epi8(temp1, zero);
|
||||
|
||||
temp2 = _mm256_cmpgt_epi16(Beta_8x32, temp2);
|
||||
temp1 = _mm256_cmpgt_epi16(Beta_8x32, temp1);
|
||||
|
||||
flag2_32x8 = _mm256_packs_epi16(temp2, temp1);
|
||||
|
||||
//!((ABS(p0 - q0) < alpha) || (ABS(q1 - q0) < beta) || (ABS(p1 - p0) < beta))
|
||||
flag1_32x8 = _mm256_and_si256(flag1_32x8, flag2_32x8);
|
||||
|
||||
//(ABS(p2 - p0) < beta) & (ABS(q2 - q0) < beta)
|
||||
temp1 = _mm256_subs_epu8(p0q0_32x8, p2q2_32x8);
|
||||
temp2 = _mm256_subs_epu8(p2q2_32x8, p0q0_32x8);
|
||||
temp1 = _mm256_add_epi8(temp1, temp2);
|
||||
|
||||
temp2 = _mm256_unpacklo_epi8(temp1, zero);
|
||||
temp1 = _mm256_unpackhi_epi8(temp1, zero);
|
||||
temp2 = _mm256_cmpgt_epi16(Beta_8x32, temp2);
|
||||
temp1 = _mm256_cmpgt_epi16(Beta_8x32, temp1);
|
||||
|
||||
flag2_32x8 = _mm256_packs_epi16(temp2, temp1);
|
||||
flag2_32x8 = _mm256_and_si256(flag1_32x8, flag2_32x8);
|
||||
|
||||
temp2 = _mm256_subs_epi16(zero, temp2);
|
||||
temp1 = _mm256_subs_epi16(zero, temp1);
|
||||
|
||||
temp3 = _mm256_permute2x128_si256(temp2,temp1,0x20); // low adding
|
||||
temp4 = _mm256_permute2x128_si256(temp2,temp1,0x31); //high adding
|
||||
temp2 = _mm256_add_epi16(temp3,temp4);
|
||||
C_8x32 = _mm256_add_epi16(C0_8x32_res, temp2); //
|
||||
const_val4_8x32 = _mm256_set1_epi16(4);
|
||||
|
||||
res1 = _mm256_permute4x64_epi64(q0p0_32x8, 0xD8);
|
||||
res2 = _mm256_permute4x64_epi64(p1q1_32x8, 0xD8);
|
||||
|
||||
temp3 = _mm256_subs_epi16(_mm256_unpacklo_epi8(res1, zero),
|
||||
_mm256_unpackhi_epi8(res1, zero));
|
||||
temp4 = _mm256_subs_epi16(_mm256_unpacklo_epi8(res2, zero),
|
||||
_mm256_unpackhi_epi8(res2, zero));
|
||||
|
||||
temp1 = _mm256_slli_epi16(temp3, 2);
|
||||
temp1 = _mm256_add_epi16(temp1, temp4);
|
||||
temp1 = _mm256_add_epi16(temp1, const_val4_8x32);
|
||||
in_macro_32x8 = _mm256_srai_epi16(temp1, 3);
|
||||
|
||||
in_macro_32x8 = _mm256_min_epi16(C_8x32, in_macro_32x8); //CLIP3
|
||||
C_8x32 = _mm256_subs_epi16(zero, C_8x32);
|
||||
in_macro_32x8 = _mm256_max_epi16(C_8x32, in_macro_32x8); //CLIP3
|
||||
|
||||
temp3 = _mm256_unpacklo_epi8(res1, zero); //q0
|
||||
temp4 = _mm256_unpackhi_epi8(res1, zero); //p0
|
||||
|
||||
temp1 = _mm256_add_epi16(temp4, in_macro_32x8);
|
||||
temp2 = _mm256_sub_epi16(temp3, in_macro_32x8);
|
||||
|
||||
temp1 = _mm256_packus_epi16(temp2, temp1); // Suffle needed
|
||||
|
||||
temp1 = _mm256_and_si256(temp1, flag1_32x8); //q0 p0
|
||||
|
||||
temp2 = _mm256_and_si256(res1,
|
||||
_mm256_xor_si256(flag1_32x8, _mm256_set1_epi16(0xFFFF)));
|
||||
|
||||
temp1 = _mm256_add_epi8(temp1, temp2);
|
||||
temp1 = _mm256_permute4x64_epi64(temp1, 0xD8);
|
||||
_mm256_storeu2_m128i((__m128i *)(pu1_HorzPixel + i16_posP0),(__m128i *)(pu1_src),temp1);
|
||||
|
||||
//if(Ap < Beta) if(Aq < Beta)
|
||||
temp1 = _mm256_avg_epu16(_mm256_unpacklo_epi8(res1, zero),
|
||||
_mm256_unpackhi_epi8(res1, zero));
|
||||
|
||||
temp2 = _mm256_slli_epi16(_mm256_unpacklo_epi8(p1q1_32x8, zero), 1);
|
||||
temp3 = _mm256_subs_epi16(_mm256_unpacklo_epi8(p2q2_32x8, zero), temp2);
|
||||
|
||||
temp2 = _mm256_slli_epi16(_mm256_unpackhi_epi8(p1q1_32x8, zero), 1);
|
||||
temp2 = _mm256_subs_epi16(_mm256_unpackhi_epi8(p2q2_32x8, zero), temp2);
|
||||
|
||||
temp4 = _mm256_permute2x128_si256(temp3, temp2, 0x20); //p0 q0
|
||||
temp3 = _mm256_permute2x128_si256(temp3, temp2, 0x31);
|
||||
temp4 = _mm256_add_epi16(temp1, temp4); //p
|
||||
in_macro_1 = _mm256_srai_epi16(temp4, 1);
|
||||
temp3 = _mm256_add_epi16(temp1, temp3); //q
|
||||
in_macro_2 = _mm256_srai_epi16(temp3, 1);
|
||||
|
||||
in_macro_1 = _mm256_min_epi16(C0_8x32_res, in_macro_1); //CLIP3
|
||||
C0_8x32_res = _mm256_subs_epi16(zero, C0_8x32_res);
|
||||
in_macro_1 = _mm256_max_epi16(C0_8x32_res, in_macro_1); //CLIP3
|
||||
|
||||
in_macro_2 = _mm256_max_epi16(C0_8x32_res, in_macro_2); //CLIP3
|
||||
C0_8x32_res = _mm256_subs_epi16(zero, C0_8x32_res);
|
||||
in_macro_2 = _mm256_min_epi16(C0_8x32_res, in_macro_2); //CLIP3
|
||||
|
||||
temp1 = _mm256_unpacklo_epi8(res2, zero);
|
||||
temp2 = _mm256_unpackhi_epi8(res2, zero);
|
||||
|
||||
temp1 = _mm256_add_epi16(temp1, in_macro_1);
|
||||
temp2 = _mm256_add_epi16(temp2, in_macro_2);
|
||||
temp1 = _mm256_packus_epi16(temp1, temp2); // pl ph ql qh
|
||||
temp1 = _mm256_and_si256(temp1, flag2_32x8);
|
||||
temp2 = _mm256_and_si256(res2,_mm256_xor_si256(flag2_32x8, _mm256_set1_epi16(0xFFFF)));
|
||||
temp1 = _mm256_add_epi8(temp1, temp2);
|
||||
temp1 = _mm256_permute4x64_epi64(temp1, 0xD8);
|
||||
_mm256_storeu2_m128i((__m128i *)(pu1_src + i16_posQ1),(__m128i *)(pu1_HorzPixel + i16_posP1),temp1);
|
||||
|
||||
}
|
||||
184
common/x86/ih264_ihadamard_scaling_avx2.c
Normal file
184
common/x86/ih264_ihadamard_scaling_avx2.c
Normal file
|
|
@ -0,0 +1,184 @@
|
|||
/******************************************************************************
|
||||
+ *
|
||||
+ * Copyright (C) 2015 The Android Open Source Project
|
||||
+ *
|
||||
+ * Licensed under the Apache License, Version 2.0 (the "License");
|
||||
+ * you may not use this file except in compliance with the License.
|
||||
+ * You may obtain a copy of the License at:
|
||||
+ *
|
||||
+ * http://www.apache.org/licenses/LICENSE-2.0
|
||||
+ *
|
||||
+ * Unless required by applicable law or agreed to in writing, software
|
||||
+ * distributed under the License is distributed on an "AS IS" BASIS,
|
||||
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
+ * See the License for the specific language governing permissions and
|
||||
+ * limitations under the License.
|
||||
+ *
|
||||
+ *****************************************************************************
|
||||
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
|
||||
+*/
|
||||
/**
|
||||
+ *******************************************************************************
|
||||
+ * @file
|
||||
+ * ih264_ihadamard_scaling_avx2.c
|
||||
+ *
|
||||
+ * @brief
|
||||
+ * Contains definition of functions for h264 inverse hadamard 4x4 transform and scaling
|
||||
+ *
|
||||
+ * @author
|
||||
+ * Priyanka
|
||||
+ *
|
||||
+ * @par List of Functions:
|
||||
+ * - ih264_ihadamard_scaling_4x4_avx2()
|
||||
+ *
|
||||
+ * @remarks
|
||||
+ *
|
||||
+ *******************************************************************************
|
||||
+ */
|
||||
/*****************************************************************************/
|
||||
/* File Includes */
|
||||
/*****************************************************************************/
|
||||
|
||||
/* User include files */
|
||||
#include "ih264_typedefs.h"
|
||||
#include "ih264_defs.h"
|
||||
#include "ih264_trans_macros.h"
|
||||
#include "ih264_macros.h"
|
||||
#include "ih264_trans_data.h"
|
||||
#include "ih264_size_defs.h"
|
||||
#include "ih264_structs.h"
|
||||
#include "ih264_trans_quant_itrans_iquant.h"
|
||||
#include <immintrin.h>
|
||||
|
||||
/*
|
||||
+ ********************************************************************************
|
||||
+ *
|
||||
+ * @brief This function performs a 4x4 inverse hadamard transform on the 4x4 DC coefficients
|
||||
+ * of a 16x16 intra prediction macroblock, and then performs scaling.
|
||||
+ * prediction buffer
|
||||
+ *
|
||||
+ * @par Description:
|
||||
+ * The DC coefficients pass through a 2-stage inverse hadamard transform.
|
||||
+ * This inverse transformed content is scaled to based on Qp value.
|
||||
+ *
|
||||
+ * @param[in] pi2_src
|
||||
+ * input 4x4 block of DC coefficients
|
||||
+ *
|
||||
+ * @param[out] pi2_out
|
||||
+ * output 4x4 block
|
||||
+ *
|
||||
+ * @param[in] pu2_iscal_mat
|
||||
+ * pointer to scaling list
|
||||
+ *
|
||||
+ * @param[in] pu2_weigh_mat
|
||||
+ * pointer to weight matrix
|
||||
+ *
|
||||
+ * @param[in] u4_qp_div_6
|
||||
+ * Floor (qp/6)
|
||||
+ *
|
||||
+ * @param[in] pi4_tmp
|
||||
+ * temporary buffer of size 1*16
|
||||
+ *
|
||||
+ * @returns none
|
||||
+ *
|
||||
+ * @remarks none
|
||||
+ *
|
||||
+ *******************************************************************************
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#ifdef __ANDROID__
|
||||
#include "log/log.h"
|
||||
#include <cutils/log.h>
|
||||
#endif
|
||||
|
||||
|
||||
void ih264_ihadamard_scaling_4x4_avx2(WORD16* pi2_src,
|
||||
WORD16* pi2_out,
|
||||
const UWORD16 *pu2_iscal_mat,
|
||||
const UWORD16 *pu2_weigh_mat,
|
||||
UWORD32 u4_qp_div_6,
|
||||
WORD32* pi4_tmp)
|
||||
{
|
||||
__m256i src,r0_r1,r2_r3,r3_r2,r1_r3,r0_r2;
|
||||
__m256i src_r0_r1, src_r2_r3;
|
||||
__m256i temp0, temp1,tmp0, tmp1, tmp2, tmp3;
|
||||
__m256i add_rshift = _mm256_set1_epi32((u4_qp_div_6 < 6) ? (1 << (5 - u4_qp_div_6)) : 0);
|
||||
__m256i mult_val = _mm256_set1_epi32(pu2_iscal_mat[0] * pu2_weigh_mat[0]);
|
||||
__m256i zero = _mm256_setzero_si256();
|
||||
|
||||
__m128i t0 ,t1;
|
||||
UNUSED (pi4_tmp);
|
||||
|
||||
src_r0_r1 = _mm256_loadu_si256((__m256i *) (pi2_src)); //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row
|
||||
|
||||
temp0 = _mm256_unpacklo_epi64(src_r0_r1, zero);
|
||||
temp1 = _mm256_unpackhi_epi64(src_r0_r1, zero); // b0 b1 b2.. d0 d1...
|
||||
temp0 = _mm256_unpacklo_epi16(temp0, temp1);
|
||||
tmp0 = _mm256_permute2x128_si256(temp0,zero,0x20); //tmp0 tmp3
|
||||
tmp1 = _mm256_permute2x128_si256(temp0,zero,0x31); //tmp1 tmp2
|
||||
|
||||
temp0 = _mm256_unpacklo_epi32(tmp0, tmp1); //a0 c0 a1 c1 a2 c2 a3 c3 a0 c0 a1 c1 b0 d0 b1 c1
|
||||
temp1 = _mm256_unpackhi_epi32(tmp0, tmp1);
|
||||
|
||||
temp1 = _mm256_shuffle_epi32(temp1,0b01001110);
|
||||
tmp0 = _mm256_add_epi16(temp0, temp1);
|
||||
tmp1 = _mm256_sub_epi16(temp0, temp1);
|
||||
|
||||
temp0 = _mm256_unpacklo_epi64(tmp0, tmp1);
|
||||
temp1 = _mm256_unpackhi_epi64(tmp0, tmp1);
|
||||
tmp0 = _mm256_add_epi16(temp0, temp1);
|
||||
tmp1 = _mm256_sub_epi16(temp0, temp1);
|
||||
|
||||
temp0 = _mm256_unpacklo_epi32(tmp0, tmp1); //a0 c0 a1 c1 a2 c2 a3 c3 a0 c0 a1 c1 b0 d0 b1 c1
|
||||
temp1 = _mm256_unpackhi_epi32(tmp0, tmp1);
|
||||
|
||||
temp0 = _mm256_unpacklo_epi64(tmp0, tmp1); //a0 c0 a1 c1 a2 c2 a3 c3 a0 c0 a1 c1 b0 d0 b1 c1
|
||||
temp1 = _mm256_unpackhi_epi64(tmp0, tmp1);
|
||||
|
||||
tmp0 = _mm256_unpacklo_epi16(temp0, temp1); //a0 c0 a1 c1 a2 c2 a3 c3 a0 c0 a1 c1 b0 d0 b1 c1
|
||||
tmp1 = _mm256_unpackhi_epi16(temp0, temp1);
|
||||
|
||||
temp0 = _mm256_unpacklo_epi32(tmp0, tmp1); //a0 c0 a1 c1 a2 c2 a3 c3 a0 c0 a1 c1 b0 d0 b1 c1
|
||||
temp1 = _mm256_unpackhi_epi32(tmp0, tmp1);
|
||||
|
||||
temp1 = _mm256_shuffle_epi32(temp1, _MM_SHUFFLE(1, 0, 3, 2));
|
||||
tmp0 = _mm256_add_epi16(temp0, temp1);
|
||||
tmp1 = _mm256_sub_epi16(temp0, temp1);
|
||||
temp0 = _mm256_unpacklo_epi64(tmp0, tmp1);
|
||||
temp1 = _mm256_unpackhi_epi64(tmp0, tmp1);
|
||||
tmp0 = _mm256_add_epi16(temp0, temp1);
|
||||
tmp1 = _mm256_sub_epi16(temp0, temp1);
|
||||
|
||||
temp0 = _mm256_unpacklo_epi64(tmp0, tmp1);
|
||||
temp1 = _mm256_unpackhi_epi64(tmp0, tmp1);
|
||||
|
||||
|
||||
r0_r1 =_mm256_cvtepi16_epi32(_mm256_castsi256_si128(temp0));
|
||||
r2_r3 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(temp1));
|
||||
|
||||
src_r0_r1 = _mm256_mullo_epi32(r0_r1, mult_val);
|
||||
src_r2_r3 = _mm256_mullo_epi32(r2_r3, mult_val);
|
||||
|
||||
//Scaling
|
||||
if(u4_qp_div_6 >= 6)
|
||||
{
|
||||
src_r0_r1 = _mm256_slli_epi32(src_r0_r1, u4_qp_div_6 - 6);
|
||||
src_r2_r3 = _mm256_slli_epi32(src_r2_r3, u4_qp_div_6 - 6);
|
||||
}
|
||||
else
|
||||
{
|
||||
temp0 = _mm256_add_epi32(src_r0_r1, add_rshift);
|
||||
temp1 = _mm256_add_epi32(src_r2_r3, add_rshift);
|
||||
src_r0_r1 = _mm256_srai_epi32(temp0, 6 - u4_qp_div_6);
|
||||
src_r2_r3 = _mm256_srai_epi32(temp1, 6 - u4_qp_div_6);
|
||||
}
|
||||
|
||||
src = _mm256_packs_epi32(src_r0_r1, src_r2_r3);
|
||||
_mm256_storeu_si256((__m256i *) (&pi2_out[0]), src);
|
||||
}
|
||||
|
||||
959
common/x86/ih264_inter_pred_filters_avx2.c
Normal file
959
common/x86/ih264_inter_pred_filters_avx2.c
Normal file
|
|
@ -0,0 +1,959 @@
|
|||
/******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2015 The Android Open Source Project
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at:
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*****************************************************************************
|
||||
* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
|
||||
*/
|
||||
/*****************************************************************************/
|
||||
/*****************************************************************************/
|
||||
/*****************************************************************************/
|
||||
/* File Includes */
|
||||
/*****************************************************************************/
|
||||
|
||||
#ifdef __ANDROID__
|
||||
#include "log/log.h"
|
||||
#include <cutils/log.h>
|
||||
#endif
|
||||
|
||||
#include <immintrin.h>
|
||||
#include "ih264_typedefs.h"
|
||||
#include "ih264_macros.h"
|
||||
#include "ih264_platform_macros.h"
|
||||
#include "ih264_inter_pred_filters.h"
|
||||
|
||||
/*****************************************************************************/
|
||||
/* Constant Data variables */
|
||||
/*****************************************************************************/
|
||||
|
||||
/* coefficients for 6 tap filtering*/
|
||||
//const WORD32 ih264_g_six_tap[3] ={1,-5,20};
|
||||
/*****************************************************************************/
|
||||
/* Function definitions . */
|
||||
/*****************************************************************************/
|
||||
/*****************************************************************************/
|
||||
/* */
|
||||
/* Function Name : ih264_inter_pred_luma_copy_avx2 */
|
||||
/* */
|
||||
/* Description : This function copies the contents of ht x wd block from */
|
||||
/* source to destination. (ht,wd) can be (4,4), (8,4), */
|
||||
/* (4,8), (8,8), (16,8), (8,16) or (16,16). */
|
||||
/* */
|
||||
/* Inputs : puc_src - pointer to source */
|
||||
/* puc_dst - pointer to destination */
|
||||
/* src_strd - stride for source */
|
||||
/* dst_strd - stride for destination */
|
||||
/* ht - height of the block */
|
||||
/* wd - width of the block */
|
||||
/* */
|
||||
/* Issues : None */
|
||||
/* */
|
||||
/* Revision History: */
|
||||
/* */
|
||||
/* DD MM YYYY Author(s) Changes */
|
||||
/* 13 02 2015 Kaushik Initial Version */
|
||||
/* Senthoor */
|
||||
/* 15 09 2020 Priyanka Bose AVX2 Intel Intrinsics Support */
|
||||
/*****************************************************************************/
|
||||
void ih264_inter_pred_luma_copy_avx2(UWORD8 *pu1_src,
|
||||
UWORD8 *pu1_dst,
|
||||
WORD32 src_strd,
|
||||
WORD32 dst_strd,
|
||||
WORD32 ht,
|
||||
WORD32 wd,
|
||||
UWORD8* pu1_tmp,
|
||||
WORD32 dydx)
|
||||
{
|
||||
WORD32 src_strd2, src_strd3, src_strd4, dst_strd2, dst_strd3, dst_strd4;
|
||||
UNUSED(pu1_tmp);
|
||||
UNUSED(dydx);
|
||||
|
||||
src_strd2 = src_strd << 1;
|
||||
dst_strd2 = dst_strd << 1;
|
||||
src_strd4 = src_strd << 2;
|
||||
dst_strd4 = dst_strd << 2;
|
||||
src_strd3 = src_strd2 + src_strd;
|
||||
dst_strd3 = dst_strd2 + dst_strd;
|
||||
if(wd == 4)
|
||||
{
|
||||
do
|
||||
{
|
||||
*((WORD32 *)(pu1_dst)) = *((WORD32 *)(pu1_src));
|
||||
*((WORD32 *)(pu1_dst + dst_strd)) = *((WORD32 *)(pu1_src + src_strd));
|
||||
*((WORD32 *)(pu1_dst + dst_strd2)) = *((WORD32 *)(pu1_src + src_strd2));
|
||||
*((WORD32 *)(pu1_dst + dst_strd3)) = *((WORD32 *)(pu1_src + src_strd3));
|
||||
|
||||
ht -= 4;
|
||||
pu1_src += src_strd4;
|
||||
pu1_dst += dst_strd4;
|
||||
}
|
||||
while(ht > 0);
|
||||
}
|
||||
else if(wd == 8)
|
||||
{
|
||||
__m128i y_0_16x8b, y_1_16x8b, y_2_16x8b, y_3_16x8b;
|
||||
do
|
||||
{
|
||||
|
||||
y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
|
||||
y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
|
||||
y_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd2));
|
||||
y_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd3));
|
||||
|
||||
_mm_storel_epi64((__m128i *)pu1_dst, y_0_16x8b);
|
||||
_mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);
|
||||
_mm_storel_epi64((__m128i *)(pu1_dst + dst_strd2), y_2_16x8b);
|
||||
_mm_storel_epi64((__m128i *)(pu1_dst + dst_strd3), y_3_16x8b);
|
||||
|
||||
ht -= 4;
|
||||
pu1_src += src_strd4;
|
||||
pu1_dst += dst_strd4;
|
||||
}
|
||||
while(ht > 0);
|
||||
}
|
||||
else // wd == 16
|
||||
{
|
||||
__m256i y_0_16x8b, y_1_16x8b, y_2_16x8b, y_3_16x8b;
|
||||
WORD32 src_strd5, src_strd6, src_strd7, src_strd8;
|
||||
WORD32 dst_strd5, dst_strd6, dst_strd7, dst_strd8;
|
||||
|
||||
__m256i y_4_16x8b, y_5_16x8b, y_6_16x8b, y_7_16x8b,y_0_1,y_2_3,y_4_5,y_6_7;
|
||||
|
||||
|
||||
src_strd5 = src_strd2 + src_strd3;
|
||||
dst_strd5 = dst_strd2 + dst_strd3;
|
||||
src_strd6 = src_strd3 << 1;
|
||||
dst_strd6 = dst_strd3 << 1;
|
||||
src_strd7 = src_strd3 + src_strd4;
|
||||
dst_strd7 = dst_strd3 + dst_strd4;
|
||||
src_strd8 = src_strd << 3;
|
||||
dst_strd8 = dst_strd << 3;
|
||||
|
||||
do
|
||||
{
|
||||
|
||||
y_0_1 = _mm256_loadu2_m128i((__m128i *)(pu1_src + src_strd),(__m128i *)pu1_src);
|
||||
y_2_3 = _mm256_loadu2_m128i((__m128i *)(pu1_src + src_strd3),(__m128i *)(pu1_src + src_strd2));
|
||||
y_4_5 = _mm256_loadu2_m128i((__m128i *)(pu1_src + src_strd5),(__m128i *)(pu1_src + src_strd4));
|
||||
y_6_7 = _mm256_loadu2_m128i((__m128i *)(pu1_src + src_strd7),(__m128i *)(pu1_src + src_strd6));
|
||||
|
||||
_mm256_storeu2_m128i((__m128i *)(pu1_dst + dst_strd),(__m128i *)pu1_dst,y_0_1);
|
||||
_mm256_storeu2_m128i((__m128i *)(pu1_dst + dst_strd3),(__m128i *)(pu1_dst + dst_strd2),y_2_3);
|
||||
_mm256_storeu2_m128i((__m128i *)(pu1_dst + dst_strd5),(__m128i *)(pu1_dst + dst_strd4),y_4_5);
|
||||
_mm256_storeu2_m128i((__m128i *)(pu1_dst + dst_strd7),(__m128i *)(pu1_dst + dst_strd6),y_6_7);
|
||||
|
||||
ht -= 8;
|
||||
pu1_src += src_strd8;
|
||||
pu1_dst += dst_strd8;
|
||||
}
|
||||
while(ht > 0);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/*****************************************************************************/
|
||||
/* */
|
||||
/* Function Name : ih264_inter_pred_chroma_avx2 */
|
||||
/* */
|
||||
/* Description : This function implements a four-tap 2D filter as */
|
||||
/* mentioned in sec. 8.4.2.2.2 titled "Chroma sample */
|
||||
/* "interpolation process". (ht,wd) can be (2,2), (4,2), */
|
||||
/* (2,4), (4,4), (8,4), (4,8) or (8,8). */
|
||||
/* */
|
||||
/* Inputs : puc_src - pointer to source */
|
||||
/* puc_dst - pointer to destination */
|
||||
/* src_strd - stride for source */
|
||||
/* dst_strd - stride for destination */
|
||||
/* dx - x position of destination value */
|
||||
/* dy - y position of destination value */
|
||||
/* ht - height of the block */
|
||||
/* wd - width of the block */
|
||||
/* */
|
||||
/* Issues : None */
|
||||
/* */
|
||||
/* Revision History: */
|
||||
/* */
|
||||
/* DD MM YYYY Author(s) Changes */
|
||||
/* 13 02 2015 Kaushik Initial Version */
|
||||
/* Senthoor */
|
||||
/* 15 09 2020 Priyanka Bose AVX2 Intel Intrinsics Support */
|
||||
/*****************************************************************************/
|
||||
void ih264_inter_pred_chroma_avx2(UWORD8 *pu1_src,
|
||||
UWORD8 *pu1_dst,
|
||||
WORD32 src_strd,
|
||||
WORD32 dst_strd,
|
||||
WORD32 dx,
|
||||
WORD32 dy,
|
||||
WORD32 ht,
|
||||
WORD32 wd)
|
||||
{
|
||||
|
||||
WORD32 i, j, A, B, C, D;
|
||||
i = 8 - dx;
|
||||
j = 8 - dy;
|
||||
|
||||
A = i * j;
|
||||
B = dx * j;
|
||||
C = i * dy;
|
||||
D = dx * dy;
|
||||
if(wd == 2)
|
||||
{
|
||||
WORD32 tmp1, tmp2, tmp3, tmp4;
|
||||
|
||||
do
|
||||
{
|
||||
//U
|
||||
tmp1 = A * pu1_src[0] + B * pu1_src[2] + C * pu1_src[src_strd] + D * pu1_src[src_strd + 2];
|
||||
tmp2 = A * pu1_src[2] + B * pu1_src[4] + C * pu1_src[src_strd + 2] + D * pu1_src[src_strd + 4];
|
||||
//V
|
||||
tmp3 = A * pu1_src[1] + B * pu1_src[3] + C * pu1_src[src_strd + 1] + D * pu1_src[src_strd + 3];
|
||||
tmp4 = A * pu1_src[3] + B * pu1_src[5] + C * pu1_src[src_strd + 3] + D * pu1_src[src_strd + 5];
|
||||
|
||||
tmp1 = (tmp1 + 32) >> 6;
|
||||
tmp2 = (tmp2 + 32) >> 6;
|
||||
tmp3 = (tmp3 + 32) >> 6;
|
||||
tmp4 = (tmp4 + 32) >> 6;
|
||||
|
||||
pu1_dst[0] = CLIP_U8(tmp1);
|
||||
pu1_dst[2] = CLIP_U8(tmp2);
|
||||
pu1_dst[1] = CLIP_U8(tmp3);
|
||||
pu1_dst[3] = CLIP_U8(tmp4);
|
||||
|
||||
pu1_src += src_strd;
|
||||
pu1_dst += dst_strd;
|
||||
|
||||
tmp1 = A * pu1_src[0] + B * pu1_src[2] + C * pu1_src[src_strd] + D * pu1_src[src_strd + 2];
|
||||
tmp2 = A * pu1_src[2] + B * pu1_src[4] + C * pu1_src[src_strd + 2] + D * pu1_src[src_strd + 4];
|
||||
tmp3 = A * pu1_src[1] + B * pu1_src[3] + C * pu1_src[src_strd + 1] + D * pu1_src[src_strd + 3];
|
||||
tmp4 = A * pu1_src[3] + B * pu1_src[5] + C * pu1_src[src_strd + 3] + D * pu1_src[src_strd + 5];
|
||||
|
||||
tmp1 = (tmp1 + 32) >> 6;
|
||||
tmp2 = (tmp2 + 32) >> 6;
|
||||
tmp3 = (tmp3 + 32) >> 6;
|
||||
tmp4 = (tmp4 + 32) >> 6;
|
||||
|
||||
pu1_dst[0] = CLIP_U8(tmp1);
|
||||
pu1_dst[2] = CLIP_U8(tmp2);
|
||||
pu1_dst[1] = CLIP_U8(tmp3);
|
||||
pu1_dst[3] = CLIP_U8(tmp4);
|
||||
|
||||
ht -= 2;
|
||||
pu1_src += src_strd;
|
||||
pu1_dst += dst_strd;
|
||||
}
|
||||
while(ht > 0);
|
||||
|
||||
}
|
||||
else if(wd == 4)
|
||||
{
|
||||
WORD32 AB, CD;
|
||||
|
||||
__m256i coeffAB_32x8b, coeffCD_32x8b, round_add32_8x32b;
|
||||
__m256i const_shuff_32x8b;
|
||||
|
||||
__m256i src_r23_32x8b,src_r12_32x8b,res12_AB_8x32b,res12_CD_8x32b,res1_8x32b;
|
||||
__m128i res1,src_r1_16x8b_128;
|
||||
__m128i const_shuff_16x8b_128;
|
||||
|
||||
AB = (B << 8) + A;
|
||||
CD = (D << 8) + C;
|
||||
|
||||
coeffAB_32x8b = _mm256_set1_epi16(AB);
|
||||
coeffCD_32x8b = _mm256_set1_epi16(CD);
|
||||
|
||||
round_add32_8x32b = _mm256_set1_epi16(32);
|
||||
const_shuff_16x8b_128 = _mm_setr_epi32(0x03010200, 0x05030402, 0x07050604, 0x09070806);
|
||||
const_shuff_32x8b = _mm256_setr_epi32(0x03010200, 0x05030402, 0x07050604, 0x09070806,0x03010200, 0x05030402, 0x07050604, 0x09070806);
|
||||
|
||||
|
||||
src_r1_16x8b_128 = _mm_loadu_si128((__m128i *)pu1_src);
|
||||
src_r1_16x8b_128 = _mm_shuffle_epi8(src_r1_16x8b_128, const_shuff_16x8b_128);
|
||||
pu1_src += src_strd;
|
||||
do
|
||||
{
|
||||
|
||||
src_r23_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src + src_strd),(__m128i *)pu1_src);
|
||||
src_r23_32x8b = _mm256_shuffle_epi8(src_r23_32x8b, const_shuff_32x8b);
|
||||
src_r12_32x8b = _mm256_set_m128i(_mm256_castsi256_si128(src_r23_32x8b),src_r1_16x8b_128);
|
||||
|
||||
res12_AB_8x32b = _mm256_maddubs_epi16(src_r12_32x8b, coeffAB_32x8b);
|
||||
res12_CD_8x32b = _mm256_maddubs_epi16(src_r23_32x8b, coeffCD_32x8b);
|
||||
|
||||
res1_8x32b = _mm256_add_epi16(res12_AB_8x32b, res12_CD_8x32b);
|
||||
res1_8x32b = _mm256_add_epi16(res1_8x32b, round_add32_8x32b);
|
||||
|
||||
res1_8x32b = _mm256_srai_epi16(res1_8x32b, 6);
|
||||
|
||||
res1_8x32b = _mm256_packus_epi16(res1_8x32b,res1_8x32b);
|
||||
res1_8x32b = _mm256_permute4x64_epi64(res1_8x32b, 0xD8);
|
||||
res1 = _mm256_castsi256_si128(res1_8x32b);
|
||||
_mm_storel_epi64((__m128i *)pu1_dst, res1);
|
||||
|
||||
res1 = _mm_srli_si128(res1, 8);
|
||||
_mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res1);
|
||||
|
||||
src_r1_16x8b_128 = _mm256_castsi256_si128(_mm256_permute2x128_si256(src_r23_32x8b,src_r23_32x8b,0x1));;
|
||||
|
||||
ht -= 2;
|
||||
pu1_src += src_strd << 1;
|
||||
pu1_dst += dst_strd << 1;
|
||||
|
||||
}
|
||||
while(ht > 0);
|
||||
}
|
||||
else // wd == 8
|
||||
{
|
||||
|
||||
WORD32 AB, CD;
|
||||
__m256i src_r1lh_32x8b, src_r2lh_32x8b;
|
||||
|
||||
__m256i res_lh_AB_8x32b, res_lh_CD_8x32b,res_1lh_8x32b,res_2lh_8x32b;
|
||||
__m256i res_lh_8x32b, res_l_8x32b,res_h_8x32b, res_32x8b;
|
||||
|
||||
__m256i coeffAB_32x8b, coeffCD_32x8b, round_add32_8x32b;
|
||||
__m256i const_shuff_32x8b;
|
||||
__m256i zero = _mm256_setzero_si256();
|
||||
__m128i res_1,res_2;
|
||||
|
||||
AB = (B << 8) + A;
|
||||
CD = (D << 8) + C;
|
||||
|
||||
coeffAB_32x8b = _mm256_set1_epi16(AB);
|
||||
coeffCD_32x8b = _mm256_set1_epi16(CD);
|
||||
|
||||
round_add32_8x32b = _mm256_set1_epi16(32);
|
||||
|
||||
const_shuff_32x8b = _mm256_setr_epi32(0x03010200, 0x05030402, 0x07050604, 0x09070806,0x03010200, 0x05030402, 0x07050604, 0x09070806);
|
||||
|
||||
src_r1lh_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src + 8),(__m128i *)pu1_src);
|
||||
src_r1lh_32x8b = _mm256_shuffle_epi8(src_r1lh_32x8b, const_shuff_32x8b);
|
||||
pu1_src += src_strd;
|
||||
|
||||
do
|
||||
{
|
||||
//row 1
|
||||
|
||||
src_r2lh_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src + 8),(__m128i *)pu1_src);
|
||||
src_r2lh_32x8b = _mm256_shuffle_epi8(src_r2lh_32x8b, const_shuff_32x8b);
|
||||
res_lh_AB_8x32b = _mm256_maddubs_epi16(src_r1lh_32x8b, coeffAB_32x8b);
|
||||
|
||||
res_lh_CD_8x32b = _mm256_maddubs_epi16(src_r2lh_32x8b, coeffCD_32x8b);
|
||||
res_lh_8x32b = _mm256_add_epi16(res_lh_AB_8x32b, round_add32_8x32b);
|
||||
res_lh_8x32b = _mm256_add_epi16(res_lh_8x32b, res_lh_CD_8x32b);
|
||||
res_1lh_8x32b = _mm256_srai_epi16(res_lh_8x32b, 6);
|
||||
|
||||
pu1_src += src_strd;
|
||||
//row 2
|
||||
src_r1lh_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src + 8),(__m128i *)pu1_src);
|
||||
src_r1lh_32x8b = _mm256_shuffle_epi8(src_r1lh_32x8b, const_shuff_32x8b);
|
||||
|
||||
res_lh_AB_8x32b = _mm256_maddubs_epi16(src_r2lh_32x8b, coeffAB_32x8b);
|
||||
res_lh_CD_8x32b = _mm256_maddubs_epi16(src_r1lh_32x8b, coeffCD_32x8b);
|
||||
|
||||
res_lh_8x32b = _mm256_add_epi16(res_lh_AB_8x32b, round_add32_8x32b);
|
||||
res_lh_8x32b = _mm256_add_epi16(res_lh_8x32b, res_lh_CD_8x32b);
|
||||
|
||||
res_2lh_8x32b = _mm256_srai_epi16(res_lh_8x32b, 6);
|
||||
|
||||
res_1lh_8x32b = _mm256_packus_epi16(res_1lh_8x32b, res_2lh_8x32b);
|
||||
res_1lh_8x32b = _mm256_permute4x64_epi64(res_1lh_8x32b, 0xD8);
|
||||
_mm256_storeu2_m128i((__m128i *)(pu1_dst + dst_strd),(__m128i *)(pu1_dst),res_1lh_8x32b);
|
||||
|
||||
pu1_src += src_strd;
|
||||
pu1_dst += dst_strd;
|
||||
pu1_dst += dst_strd;
|
||||
|
||||
//row 3
|
||||
src_r2lh_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src + 8),(__m128i *)pu1_src);
|
||||
src_r2lh_32x8b = _mm256_shuffle_epi8(src_r2lh_32x8b, const_shuff_32x8b);
|
||||
|
||||
res_lh_AB_8x32b = _mm256_maddubs_epi16(src_r1lh_32x8b, coeffAB_32x8b);
|
||||
res_lh_CD_8x32b = _mm256_maddubs_epi16(src_r2lh_32x8b, coeffCD_32x8b);
|
||||
|
||||
res_lh_8x32b = _mm256_add_epi16(res_lh_AB_8x32b, round_add32_8x32b);
|
||||
res_lh_8x32b = _mm256_add_epi16(res_lh_8x32b, res_lh_CD_8x32b);
|
||||
|
||||
res_1lh_8x32b = _mm256_srai_epi16(res_lh_8x32b, 6);
|
||||
pu1_src += src_strd;
|
||||
|
||||
//row 1
|
||||
src_r1lh_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src + 8),(__m128i *)pu1_src);
|
||||
src_r1lh_32x8b = _mm256_shuffle_epi8(src_r1lh_32x8b, const_shuff_32x8b);
|
||||
|
||||
res_lh_AB_8x32b = _mm256_maddubs_epi16(src_r2lh_32x8b, coeffAB_32x8b);
|
||||
res_lh_CD_8x32b = _mm256_maddubs_epi16(src_r1lh_32x8b, coeffCD_32x8b);
|
||||
|
||||
res_lh_8x32b = _mm256_add_epi16(res_lh_AB_8x32b, round_add32_8x32b);
|
||||
res_lh_8x32b = _mm256_add_epi16(res_lh_8x32b, res_lh_CD_8x32b);
|
||||
|
||||
res_2lh_8x32b = _mm256_srai_epi16(res_lh_8x32b, 6);
|
||||
|
||||
res_1lh_8x32b = _mm256_packus_epi16(res_1lh_8x32b, res_2lh_8x32b);
|
||||
res_1lh_8x32b = _mm256_permute4x64_epi64(res_1lh_8x32b, 0xD8);
|
||||
_mm256_storeu2_m128i((__m128i *)(pu1_dst + dst_strd),(__m128i *)(pu1_dst),res_1lh_8x32b);
|
||||
|
||||
|
||||
ht -= 4;
|
||||
pu1_src += src_strd;
|
||||
pu1_dst += dst_strd;
|
||||
pu1_dst += dst_strd;
|
||||
}
|
||||
while(ht > 0);
|
||||
}
|
||||
}
|
||||
|
||||
/*****************************************************************************/
|
||||
/* */
|
||||
/* Function Name : ih264_inter_pred_luma_horz_qpel_vert_qpel_avx2 */
|
||||
/* */
|
||||
/* Description : This function implements a six-tap filter vertically and */
|
||||
/* horizontally on ht x wd block separately and averages */
|
||||
/* the two sets of values to calculate values at (1/4,1/4), */
|
||||
/* (1/4, 3/4), (3/4, 1/4) or (3/4, 3/4) as mentioned in */
|
||||
/* sec. 8.4.2.2.1 titled "Luma sample interpolation */
|
||||
/* process". (ht,wd) can be (4,4), (8,4), (4,8), (8,8), */
|
||||
/* (16,8), (8,16) or (16,16). */
|
||||
/* */
|
||||
/* Inputs : puc_src - pointer to source */
|
||||
/* puc_dst - pointer to destination */
|
||||
/* src_strd - stride for source */
|
||||
/* dst_strd - stride for destination */
|
||||
/* ht - height of the block */
|
||||
/* wd - width of the block */
|
||||
/* pu1_tmp - pointer to temporary buffer */
|
||||
/* dydx - x and y reference offset for q-pel */
|
||||
/* calculations */
|
||||
/* */
|
||||
/* Issues : None */
|
||||
/* */
|
||||
/* Revision History: */
|
||||
/* */
|
||||
/* DD MM YYYY Author(s) Changes */
|
||||
/* 13 02 2015 Kaushik Initial Version */
|
||||
/* Senthoor */
|
||||
/* 15 09 2020 Priyanka Bose AVX2 Intel Intrinsics Support */
|
||||
/*****************************************************************************/
|
||||
void ih264_inter_pred_luma_horz_qpel_vert_qpel_avx2(UWORD8 *pu1_src,
|
||||
UWORD8 *pu1_dst,
|
||||
WORD32 src_strd,
|
||||
WORD32 dst_strd,
|
||||
WORD32 ht,
|
||||
WORD32 wd,
|
||||
UWORD8* pu1_tmp,
|
||||
WORD32 dydx)
|
||||
{
|
||||
WORD32 ht_temp;
|
||||
UWORD8 *pu1_pred_vert,*pu1_pred_horiz;
|
||||
UWORD8 *pu1_tmp1, *pu1_tmp2;
|
||||
WORD32 x_offset, y_offset;
|
||||
|
||||
__m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
|
||||
__m128i const_val16_8x16b;
|
||||
__m256i coeff0_1_32x8b, coeff2_3_32x8b, coeff4_5_32x8b,coeff_32x8b;
|
||||
__m256i const_val16_8x32b;
|
||||
__m256i zero = _mm256_setzero_si256();
|
||||
pu1_tmp1 = pu1_tmp;
|
||||
|
||||
dydx &= 0xf;
|
||||
ht_temp = ht;
|
||||
x_offset = dydx & 0x3;
|
||||
y_offset = dydx >> 2;
|
||||
pu1_tmp2 = pu1_tmp1;
|
||||
|
||||
pu1_pred_vert = pu1_src + (x_offset >> 1) - 2*src_strd;
|
||||
pu1_pred_horiz = pu1_src + (y_offset >> 1) * src_strd - 2;
|
||||
//the filter input starts from x[-2] (till x[3])
|
||||
|
||||
coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
|
||||
coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
|
||||
coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5
|
||||
//c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
|
||||
const_val16_8x16b = _mm_set1_epi16(16);
|
||||
coeff_32x8b = _mm256_set_m128i(coeff2_3_16x8b,coeff0_1_16x8b);
|
||||
coeff0_1_32x8b = _mm256_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
|
||||
coeff2_3_32x8b = _mm256_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
|
||||
coeff4_5_32x8b = _mm256_set1_epi32(0x01FB01FB); //c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5
|
||||
//c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
|
||||
const_val16_8x32b = _mm256_set1_epi16(16);
|
||||
|
||||
if(wd == 4)
|
||||
{
|
||||
//vertical q-pel filter
|
||||
{
|
||||
__m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b, src_r4_16x8b;
|
||||
__m128i src_r5_16x8b, src_r6_16x8b;
|
||||
__m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b;
|
||||
|
||||
__m128i res_r0r1_16x8b, res_t1_8x16b, res_t2_8x16b, res_t3_8x16b;
|
||||
|
||||
//epilogue: Load all the pred rows except sixth and seventh row for the
|
||||
//first and second row processing.
|
||||
src_r0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
|
||||
pu1_pred_vert = pu1_pred_vert + src_strd;
|
||||
|
||||
src_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
|
||||
pu1_pred_vert = pu1_pred_vert + src_strd;
|
||||
src_r0_16x8b = _mm_unpacklo_epi32(src_r0_16x8b, src_r1_16x8b);
|
||||
|
||||
src_r2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
|
||||
pu1_pred_vert = pu1_pred_vert + src_strd;
|
||||
src_r1_16x8b = _mm_unpacklo_epi32(src_r1_16x8b, src_r2_16x8b);
|
||||
|
||||
src_r3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
|
||||
pu1_pred_vert = pu1_pred_vert + src_strd;
|
||||
src_r2_16x8b = _mm_unpacklo_epi32(src_r2_16x8b, src_r3_16x8b);
|
||||
|
||||
src_r4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
|
||||
pu1_pred_vert = pu1_pred_vert + src_strd;
|
||||
src_r3_16x8b = _mm_unpacklo_epi32(src_r3_16x8b, src_r4_16x8b);
|
||||
|
||||
//Core Loop: Process all the rows.
|
||||
do
|
||||
{
|
||||
src_r5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
|
||||
src_r4_16x8b = _mm_unpacklo_epi32(src_r4_16x8b, src_r5_16x8b);
|
||||
|
||||
src_r6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert + src_strd));
|
||||
src_r5_16x8b = _mm_unpacklo_epi32(src_r5_16x8b, src_r6_16x8b);
|
||||
|
||||
src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
|
||||
src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
|
||||
src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);
|
||||
|
||||
res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
|
||||
res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
|
||||
res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
|
||||
|
||||
res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
|
||||
res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
|
||||
res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);
|
||||
|
||||
res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
|
||||
res_r0r1_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);
|
||||
|
||||
_mm_storel_epi64((__m128i *)pu1_tmp1, res_r0r1_16x8b);
|
||||
|
||||
src_r0_16x8b = src_r2_16x8b;
|
||||
src_r1_16x8b = src_r3_16x8b;
|
||||
src_r2_16x8b = src_r4_16x8b;
|
||||
src_r3_16x8b = src_r5_16x8b;
|
||||
src_r4_16x8b = src_r6_16x8b;
|
||||
|
||||
ht_temp -= 2;
|
||||
pu1_pred_vert += src_strd << 1;
|
||||
pu1_tmp1 += 8;
|
||||
}
|
||||
while(ht_temp > 0);
|
||||
}
|
||||
|
||||
//horizontal q-pel filter
|
||||
{
|
||||
__m128i res_r0r1_16x8b_128, src_r0r1_vpel_16x8b,res_16x8b;
|
||||
__m256i src_r0r1_sht_32x8b, src_r0r1_32x8b,src_r0r1_t1_32x8b;
|
||||
|
||||
__m256i res_r0r1_t1_8x32b, res_r0r1_t2_8x32b, res_r0r1_t3_8x32b;
|
||||
__m256i res_r0r1_32x8b;
|
||||
|
||||
//Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
|
||||
//Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
|
||||
|
||||
do
|
||||
{
|
||||
src_r0r1_vpel_16x8b = _mm_loadl_epi64((__m128i *)pu1_tmp2);
|
||||
src_r0r1_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_pred_horiz + src_strd), (__m128i *)pu1_pred_horiz);
|
||||
|
||||
src_r0r1_sht_32x8b = _mm256_srli_si256(src_r0r1_32x8b, 1);
|
||||
src_r0r1_32x8b = _mm256_unpacklo_epi8(src_r0r1_32x8b, src_r0r1_sht_32x8b);
|
||||
|
||||
src_r0r1_t1_32x8b = _mm256_unpacklo_epi64(src_r0r1_32x8b, zero); //a0 a1 a1 a2 a2 a3 a3 a4 b0 b1 b1 b2 b2 b3 b3 b4
|
||||
res_r0r1_t1_8x32b = _mm256_maddubs_epi16(src_r0r1_t1_32x8b, coeff0_1_32x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
|
||||
//b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
|
||||
|
||||
src_r0r1_32x8b = _mm256_srli_si256(src_r0r1_32x8b, 4); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0
|
||||
//b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 0 0 0 0
|
||||
|
||||
src_r0r1_t1_32x8b = _mm256_unpacklo_epi64(src_r0r1_32x8b, zero); //a2 a3 a3 a4 a4 a5 a5 a6 b2 b3 b3 b4 b4 b5 b5 b6
|
||||
res_r0r1_t2_8x32b = _mm256_maddubs_epi16(src_r0r1_t1_32x8b, coeff2_3_32x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
|
||||
//b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3
|
||||
|
||||
src_r0r1_32x8b = _mm256_srli_si256(src_r0r1_32x8b, 4); //a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0 0 0 0 0
|
||||
//b4 b5 b5 b6 b6 b7 b7 b8 0 0 0 0 0 0 0 0
|
||||
|
||||
src_r0r1_t1_32x8b = _mm256_unpacklo_epi64(src_r0r1_32x8b, zero); //a4 a5 a5 a6 a6 a7 a7 a8 b4 b5 b5 b6 b6 b7 b7 b8
|
||||
res_r0r1_t3_8x32b = _mm256_maddubs_epi16(src_r0r1_t1_32x8b, coeff4_5_32x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
|
||||
//b4*c4+b5*c5 b5*c4+b6*c5 b4*c6+b7*c5 b7*c4+b8*c5
|
||||
|
||||
res_r0r1_t1_8x32b = _mm256_add_epi16(res_r0r1_t1_8x32b, res_r0r1_t2_8x32b);
|
||||
res_r0r1_t3_8x32b = _mm256_add_epi16(const_val16_8x32b, res_r0r1_t3_8x32b);
|
||||
res_r0r1_t1_8x32b = _mm256_add_epi16(res_r0r1_t1_8x32b, res_r0r1_t3_8x32b); //a0*c0+a1*c1+a2*c2+a3*c3+a4*a4+a5*c5 + 15;
|
||||
//a1*c0+a2*c1+a2*c2+a3*c3+a5*a4+a6*c5 + 15;
|
||||
//a2*c0+a3*c1+a4*c2+a5*c3+a6*a4+a7*c5 + 15;
|
||||
//a3*c0+a4*c1+a5*c2+a6*c3+a6*a4+a8*c5 + 15;
|
||||
//b0*c0+b1*c1+b2*c2+b3*c3+b4*b4+b5*c5 + 15;
|
||||
//b1*c0+b2*c1+b2*c2+b3*c3+b5*b4+b6*c5 + 15;
|
||||
//b2*c0+b3*c1+b4*c2+b5*c3+b6*b4+b7*c5 + 15;
|
||||
//b3*c0+b4*c1+b5*c2+b6*c3+b6*b4+b8*c5 + 15;
|
||||
res_r0r1_t1_8x32b = _mm256_srai_epi16(res_r0r1_t1_8x32b, 5); //shifting right by 5 bits.
|
||||
|
||||
res_r0r1_16x8b_128 = _mm256_castsi256_si128(_mm256_packus_epi16(res_r0r1_t1_8x32b,
|
||||
res_r0r1_t1_8x32b));
|
||||
|
||||
res_16x8b = _mm_avg_epu8(res_r0r1_16x8b_128,src_r0r1_vpel_16x8b);
|
||||
|
||||
*((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(res_16x8b);
|
||||
res_16x8b = _mm_srli_si128(res_16x8b, 4);
|
||||
*((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(res_16x8b);
|
||||
|
||||
ht -= 2;
|
||||
pu1_pred_horiz += src_strd << 1;
|
||||
pu1_tmp2 += 8;
|
||||
pu1_dst += dst_strd << 1;
|
||||
}
|
||||
while(ht > 0);
|
||||
}
|
||||
}
|
||||
else if(wd == 8)
|
||||
{
|
||||
//vertical q-pel filter
|
||||
{
|
||||
__m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b;
|
||||
__m128i src_r4_16x8b, src_r5_16x8b, src_r6_16x8b;
|
||||
__m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b;
|
||||
|
||||
__m128i res_16x8b, res_t1_8x16b, res_t2_8x16b, res_t3_8x16b;
|
||||
|
||||
//epilogue: Load all the pred rows except sixth and seventh row for the
|
||||
//first and second row processing.
|
||||
src_r0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
|
||||
pu1_pred_vert = pu1_pred_vert + src_strd;
|
||||
|
||||
src_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
|
||||
pu1_pred_vert = pu1_pred_vert + src_strd;
|
||||
src_r0_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);
|
||||
|
||||
src_r2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
|
||||
pu1_pred_vert = pu1_pred_vert + src_strd;
|
||||
src_r1_16x8b = _mm_unpacklo_epi64(src_r1_16x8b, src_r2_16x8b);
|
||||
|
||||
src_r3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
|
||||
pu1_pred_vert = pu1_pred_vert + src_strd;
|
||||
src_r2_16x8b = _mm_unpacklo_epi64(src_r2_16x8b, src_r3_16x8b);
|
||||
|
||||
src_r4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
|
||||
pu1_pred_vert = pu1_pred_vert + src_strd;
|
||||
src_r3_16x8b = _mm_unpacklo_epi64(src_r3_16x8b, src_r4_16x8b);
|
||||
|
||||
//Core Loop: Process all the rows.
|
||||
do
|
||||
{
|
||||
src_r5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
|
||||
src_r4_16x8b = _mm_unpacklo_epi64(src_r4_16x8b, src_r5_16x8b);
|
||||
|
||||
src_r6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert + src_strd));
|
||||
src_r5_16x8b = _mm_unpacklo_epi64(src_r5_16x8b, src_r6_16x8b);
|
||||
|
||||
src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
|
||||
src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
|
||||
src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);
|
||||
|
||||
res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
|
||||
res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
|
||||
res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
|
||||
|
||||
res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
|
||||
res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
|
||||
res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);
|
||||
|
||||
res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
|
||||
res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);
|
||||
|
||||
_mm_storel_epi64((__m128i *)(pu1_tmp1), res_16x8b);
|
||||
|
||||
src_r0r1_16x8b = _mm_unpackhi_epi8(src_r0_16x8b, src_r1_16x8b);
|
||||
src_r2r3_16x8b = _mm_unpackhi_epi8(src_r2_16x8b, src_r3_16x8b);
|
||||
src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b);
|
||||
|
||||
res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
|
||||
res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
|
||||
res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
|
||||
|
||||
res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
|
||||
res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
|
||||
res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);
|
||||
|
||||
res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
|
||||
res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);
|
||||
|
||||
_mm_storel_epi64((__m128i *)(pu1_tmp1 + 8), res_16x8b);
|
||||
|
||||
src_r0_16x8b = src_r2_16x8b;
|
||||
src_r1_16x8b = src_r3_16x8b;
|
||||
src_r2_16x8b = src_r4_16x8b;
|
||||
src_r3_16x8b = src_r5_16x8b;
|
||||
src_r4_16x8b = src_r6_16x8b;
|
||||
|
||||
ht_temp -= 2;
|
||||
pu1_pred_vert += src_strd << 1;
|
||||
pu1_tmp1 += 16;
|
||||
}
|
||||
while(ht_temp > 0);
|
||||
}
|
||||
|
||||
//horizontal q-pel filter
|
||||
{
|
||||
|
||||
__m256i src_r0r1_32x8b, src_r0r1_sht_32x8b,src_r0r1_t1_32x8b,res_32x8b;
|
||||
__m128i src_r0r1_vpel_128,res_16x8b_128;
|
||||
__m256i src_r0r1_vpel_32x8b,res_r0r1_t1_8x32b, res_r0r1_t2_8x32b, res_r0r1_t3_8x32b;
|
||||
|
||||
//Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
|
||||
//Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
|
||||
|
||||
do
|
||||
{
|
||||
src_r0r1_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_pred_horiz + src_strd), //a2 a3 a4 a5 a6 a7 a8....a15 0 or
|
||||
(__m128i *)pu1_pred_horiz); //a3 a4 a5 a6 a7 a8 a9....a15 0
|
||||
//b2 b3 b4 b5 b6 b7 b8....b15 0 or
|
||||
//b3 b4 b5 b6 b7 b8 b9....b15 0
|
||||
|
||||
src_r0r1_vpel_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_tmp2 + 8),
|
||||
(__m128i *)(pu1_tmp2));
|
||||
src_r0r1_vpel_32x8b = _mm256_unpacklo_epi64(src_r0r1_vpel_32x8b,zero);
|
||||
|
||||
src_r0r1_sht_32x8b = _mm256_srli_si256(src_r0r1_32x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
|
||||
//b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0
|
||||
|
||||
src_r0r1_t1_32x8b = _mm256_unpacklo_epi8(src_r0r1_32x8b, src_r0r1_sht_32x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
|
||||
|
||||
res_r0r1_t1_8x32b = _mm256_maddubs_epi16(src_r0r1_t1_32x8b, coeff0_1_32x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
|
||||
//a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
|
||||
|
||||
//b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1
|
||||
|
||||
src_r0r1_32x8b = _mm256_srli_si256(src_r0r1_32x8b, 2); //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
|
||||
//b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0
|
||||
|
||||
src_r0r1_sht_32x8b = _mm256_srli_si256(src_r0r1_sht_32x8b, 2); //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0
|
||||
|
||||
src_r0r1_t1_32x8b = _mm256_unpacklo_epi8(src_r0r1_32x8b, src_r0r1_sht_32x8b); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
|
||||
//b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 a8 a9 a9 a10
|
||||
|
||||
res_r0r1_t2_8x32b = _mm256_maddubs_epi16(src_r0r1_t1_32x8b, coeff2_3_32x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
|
||||
//a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
|
||||
//b2*c2+b3*c3 b3*c2+b4*c3 b2*c4+b5*c3 b5*c2+b6*c3
|
||||
//b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3
|
||||
|
||||
src_r0r1_32x8b = _mm256_srli_si256(src_r0r1_32x8b, 2); //a4 a5 a6 a7 a8 a9....a15 0 0 0 0
|
||||
|
||||
src_r0r1_sht_32x8b = _mm256_srli_si256(src_r0r1_sht_32x8b, 2); //a5 a6 a7 a8 a9....a15 0 0 0 0 0
|
||||
//b5 b6 b7 b8 b9....b15 0 0 0 0 0
|
||||
|
||||
src_r0r1_t1_32x8b = _mm256_unpacklo_epi8(src_r0r1_32x8b, src_r0r1_sht_32x8b); //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
|
||||
//b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12
|
||||
|
||||
res_r0r1_t3_8x32b = _mm256_maddubs_epi16(src_r0r1_t1_32x8b, coeff4_5_32x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
|
||||
//a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
|
||||
//b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
|
||||
|
||||
res_r0r1_t1_8x32b = _mm256_add_epi16(res_r0r1_t1_8x32b, res_r0r1_t2_8x32b);
|
||||
res_r0r1_t3_8x32b = _mm256_add_epi16(const_val16_8x32b, res_r0r1_t3_8x32b);
|
||||
res_r0r1_t1_8x32b = _mm256_add_epi16(res_r0r1_t1_8x32b, res_r0r1_t3_8x32b);
|
||||
res_r0r1_t1_8x32b = _mm256_srai_epi16(res_r0r1_t1_8x32b, 5); //shifting right by 5 bits.
|
||||
|
||||
res_32x8b = _mm256_packus_epi16(res_r0r1_t1_8x32b, res_r0r1_t1_8x32b);
|
||||
|
||||
res_32x8b = _mm256_avg_epu8(res_32x8b,src_r0r1_vpel_32x8b);
|
||||
|
||||
_mm_storel_epi64((__m128i *)(pu1_dst), _mm256_castsi256_si128(res_32x8b));
|
||||
_mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), _mm256_castsi256_si128(_mm256_permute2x128_si256(res_32x8b,res_32x8b,0x1)));
|
||||
|
||||
ht -= 2;
|
||||
pu1_pred_horiz += src_strd << 1;
|
||||
pu1_dst += dst_strd << 1;
|
||||
pu1_tmp2 += 16;
|
||||
}
|
||||
while(ht > 0);
|
||||
}
|
||||
}
|
||||
else // wd == 16
|
||||
{
|
||||
//vertical q-pel filter
|
||||
{
|
||||
__m256i src_r0r2_32x8b, src_r1r3_32x8b, src_r2r4_32x8b,src_32x8b;
|
||||
__m128i src_r2_16x8b,src_r4_16x8b,src_r5_16x8b,src_r6_16x8b,src_r4r5_16x8b;
|
||||
|
||||
__m256i res_t1_8x32b;
|
||||
__m128i res_t0_8x16b,res_t3_8x16b,res_t1_8x16b,res_16x8b;
|
||||
|
||||
//epilogue: Load all the pred rows except sixth and seventh row for the
|
||||
//first and second row processing.
|
||||
src_r0r2_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_pred_vert + 2 * src_strd ),
|
||||
(__m128i *)(pu1_pred_vert));
|
||||
|
||||
src_r2_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert + 2 * src_strd));
|
||||
src_r1r3_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_pred_vert + 3 * src_strd ),
|
||||
(__m128i *)(pu1_pred_vert + src_strd));
|
||||
src_r4_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert + 4 * src_strd));
|
||||
pu1_pred_vert = pu1_pred_vert + (5 * src_strd ) ;
|
||||
|
||||
|
||||
//Core Loop: Process all the rows.
|
||||
do
|
||||
{
|
||||
src_r5_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert));
|
||||
src_r6_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert + src_strd));
|
||||
src_r2r4_32x8b = _mm256_set_m128i(src_r4_16x8b,src_r2_16x8b);
|
||||
|
||||
src_32x8b = _mm256_unpacklo_epi8(src_r0r2_32x8b, src_r1r3_32x8b);
|
||||
src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);
|
||||
|
||||
res_t1_8x32b = _mm256_maddubs_epi16(src_32x8b, coeff_32x8b);
|
||||
res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
|
||||
|
||||
res_t1_8x16b = _mm_add_epi16(_mm256_castsi256_si128(res_t1_8x32b), _mm256_extracti128_si256 (res_t1_8x32b,0x1));
|
||||
res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
|
||||
res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
|
||||
res_t0_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits
|
||||
|
||||
|
||||
src_32x8b = _mm256_unpackhi_epi8(src_r0r2_32x8b, src_r1r3_32x8b);
|
||||
src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b);
|
||||
|
||||
res_t1_8x32b = _mm256_maddubs_epi16(src_32x8b, coeff_32x8b);
|
||||
res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
|
||||
|
||||
res_t1_8x16b = _mm_add_epi16(_mm256_castsi256_si128(res_t1_8x32b),_mm256_extracti128_si256 (res_t1_8x32b,0x1));
|
||||
res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
|
||||
res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
|
||||
res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
|
||||
|
||||
res_16x8b = _mm_packus_epi16(res_t0_8x16b, res_t1_8x16b);
|
||||
_mm_storeu_si128((__m128i *)(pu1_tmp1), res_16x8b);
|
||||
|
||||
src_32x8b = _mm256_unpacklo_epi8(src_r1r3_32x8b, src_r2r4_32x8b);
|
||||
src_r4r5_16x8b = _mm_unpacklo_epi8(src_r5_16x8b, src_r6_16x8b);
|
||||
|
||||
res_t1_8x32b = _mm256_maddubs_epi16(src_32x8b, coeff_32x8b);
|
||||
res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
|
||||
|
||||
res_t1_8x16b = _mm_add_epi16(_mm256_castsi256_si128(res_t1_8x32b), _mm256_extracti128_si256 (res_t1_8x32b,0x1));
|
||||
res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
|
||||
res_t0_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits
|
||||
|
||||
|
||||
src_32x8b = _mm256_unpackhi_epi8(src_r1r3_32x8b, src_r2r4_32x8b);
|
||||
src_r4r5_16x8b = _mm_unpackhi_epi8(src_r5_16x8b, src_r6_16x8b);
|
||||
|
||||
res_t1_8x32b = _mm256_maddubs_epi16(src_32x8b, coeff_32x8b);
|
||||
res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
|
||||
|
||||
res_t1_8x16b = _mm_add_epi16(_mm256_castsi256_si128(res_t1_8x32b), _mm256_extracti128_si256 (res_t1_8x32b,0x1));
|
||||
res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
|
||||
res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
|
||||
res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits
|
||||
|
||||
|
||||
res_16x8b = _mm_packus_epi16(res_t0_8x16b, res_t1_8x16b);
|
||||
_mm_storeu_si128((__m128i *)(pu1_tmp1+16), res_16x8b);
|
||||
src_r0r2_32x8b = src_r2r4_32x8b;
|
||||
src_r1r3_32x8b = _mm256_set_m128i(src_r5_16x8b,_mm256_extracti128_si256(src_r1r3_32x8b,0x1));
|
||||
src_r2_16x8b = src_r4_16x8b;
|
||||
src_r4_16x8b = src_r6_16x8b;
|
||||
|
||||
ht_temp -= 2;
|
||||
pu1_pred_vert += src_strd << 1;
|
||||
pu1_tmp1 += 32;
|
||||
}
|
||||
while(ht_temp > 0);
|
||||
}
|
||||
//horizontal q-pel filter
|
||||
{
|
||||
|
||||
__m256i src_r0r1_32x8b,src_r0r1_sht_32x8b,src_r0r1_t1_32x8b,res_32x8b;
|
||||
__m128i src_vpel_16x8b,res_16x8b_128;
|
||||
|
||||
__m256i res_r0r1_t1_8x32b, res_r0r1_t2_8x32b, res_r0r1_t3_8x32b;
|
||||
|
||||
//Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
|
||||
//Row0 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
|
||||
//b0 is same a8. Similarly other bn pixels are same as a(n+8) pixels.
|
||||
|
||||
do
|
||||
{
|
||||
src_r0r1_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_pred_horiz + 8),
|
||||
(__m128i *)(pu1_pred_horiz)); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
|
||||
//b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15
|
||||
src_vpel_16x8b = _mm_loadu_si128((__m128i *)(pu1_tmp2));
|
||||
|
||||
src_r0r1_sht_32x8b = _mm256_srli_si256(src_r0r1_32x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
|
||||
//b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0
|
||||
|
||||
src_r0r1_t1_32x8b = _mm256_unpacklo_epi8(src_r0r1_32x8b, src_r0r1_sht_32x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
|
||||
//b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8
|
||||
|
||||
res_r0r1_t1_8x32b = _mm256_maddubs_epi16(src_r0r1_t1_32x8b, coeff0_1_32x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
|
||||
//a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
|
||||
//b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
|
||||
//b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1
|
||||
|
||||
src_r0r1_32x8b = _mm256_srli_si256(src_r0r1_32x8b, 2); //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
|
||||
//b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0
|
||||
|
||||
src_r0r1_sht_32x8b = _mm256_srli_si256(src_r0r1_sht_32x8b, 2); //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0
|
||||
//b3 b4 b5 b6 b7 b8 b9....b15 0 0 0
|
||||
|
||||
src_r0r1_t1_32x8b = _mm256_unpacklo_epi8(src_r0r1_32x8b, src_r0r1_sht_32x8b); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
|
||||
//b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 a8 a9 a9 a10
|
||||
|
||||
res_r0r1_t2_8x32b = _mm256_maddubs_epi16(src_r0r1_t1_32x8b, coeff2_3_32x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
|
||||
//a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
|
||||
//b2*c2+b3*c3 b3*c2+b4*c3 b2*c4+b5*c3 b5*c2+b6*c3
|
||||
//b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3
|
||||
|
||||
src_r0r1_32x8b = _mm256_srli_si256(src_r0r1_32x8b, 2); //a4 a5 a6 a7 a8 a9....a15 0 0 0 0
|
||||
//b4 b5 b6 b7 b8 b9....b15 0 0 0 0
|
||||
|
||||
src_r0r1_sht_32x8b = _mm256_srli_si256(src_r0r1_sht_32x8b, 2); //a5 a6 a7 a8 a9....a15 0 0 0 0 0
|
||||
//b5 b6 b7 b8 b9....b15 0 0 0 0 0
|
||||
|
||||
src_r0r1_t1_32x8b = _mm256_unpacklo_epi8(src_r0r1_32x8b, src_r0r1_sht_32x8b); //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
|
||||
//b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12
|
||||
|
||||
res_r0r1_t3_8x32b = _mm256_maddubs_epi16(src_r0r1_t1_32x8b, coeff4_5_32x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
|
||||
//a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
|
||||
//b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
|
||||
//b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5
|
||||
res_r0r1_t1_8x32b = _mm256_add_epi16(res_r0r1_t1_8x32b, res_r0r1_t2_8x32b);
|
||||
res_r0r1_t3_8x32b = _mm256_add_epi16(const_val16_8x32b, res_r0r1_t3_8x32b);
|
||||
res_r0r1_t1_8x32b = _mm256_add_epi16(res_r0r1_t1_8x32b, res_r0r1_t3_8x32b);
|
||||
res_r0r1_t1_8x32b = _mm256_srai_epi16(res_r0r1_t1_8x32b, 5); //shifting right by 5 bits.
|
||||
|
||||
|
||||
res_32x8b = _mm256_packus_epi16(res_r0r1_t1_8x32b, res_r0r1_t1_8x32b);
|
||||
res_16x8b_128 = _mm256_castsi256_si128(_mm256_permute4x64_epi64(res_32x8b, 0xD8));
|
||||
res_16x8b_128 = _mm_avg_epu8(res_16x8b_128, src_vpel_16x8b);
|
||||
_mm_storeu_si128((__m128i *)(pu1_dst), res_16x8b_128);
|
||||
|
||||
ht --;
|
||||
pu1_pred_horiz += src_strd;
|
||||
pu1_dst += dst_strd;
|
||||
pu1_tmp2 += 16;
|
||||
}
|
||||
while(ht > 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
303
common/x86/ih264_iquant_itrans_recon_avx2.c
Normal file
303
common/x86/ih264_iquant_itrans_recon_avx2.c
Normal file
|
|
@ -0,0 +1,303 @@
|
|||
/******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2015 The Android Open Source Project
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at:
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*****************************************************************************
|
||||
* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
|
||||
*/
|
||||
/**
|
||||
*******************************************************************************
|
||||
* @file
|
||||
* ih264_iquant_itrans_recon_avx2.c
|
||||
*
|
||||
* @brief
|
||||
* Contains function definitions for inverse quantization, inverse
|
||||
* transform and reconstruction
|
||||
*
|
||||
* @author
|
||||
* Priyanka Bose
|
||||
*
|
||||
* @par List of Functions:
|
||||
* - ih264_iquant_itrans_recon_4x4_avx2()
|
||||
*
|
||||
* @remarks
|
||||
* None
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
/* User include files */
|
||||
#include "ih264_typedefs.h"
|
||||
#include "ih264_defs.h"
|
||||
#include "ih264_trans_macros.h"
|
||||
#include "ih264_macros.h"
|
||||
#include "ih264_platform_macros.h"
|
||||
#include "ih264_trans_data.h"
|
||||
#include "ih264_size_defs.h"
|
||||
#include "ih264_structs.h"
|
||||
#include "ih264_trans_quant_itrans_iquant.h"
|
||||
#include <immintrin.h>
|
||||
|
||||
|
||||
/*
|
||||
********************************************************************************
|
||||
*
|
||||
* @brief This function reconstructs a 4x4 sub block from quantized residue and
|
||||
* prediction buffer
|
||||
*
|
||||
* @par Description:
|
||||
* The quantized residue is first inverse quantized, then inverse transformed.
|
||||
* This inverse transformed content is added to the prediction buffer to recon-
|
||||
* struct the end output
|
||||
*
|
||||
* @param[in] pi2_src
|
||||
* quantized 4x4 block
|
||||
*
|
||||
* @param[in] pu1_pred
|
||||
* prediction 4x4 block
|
||||
*
|
||||
* @param[out] pu1_out
|
||||
* reconstructed 4x4 block
|
||||
*
|
||||
* @param[in] src_strd
|
||||
* quantization buffer stride
|
||||
*
|
||||
* @param[in] pred_strd,
|
||||
* Prediction buffer stride
|
||||
*
|
||||
* @param[in] out_strd
|
||||
* recon buffer Stride
|
||||
*
|
||||
* @param[in] pu2_scaling_list
|
||||
* pointer to scaling list
|
||||
*
|
||||
* @param[in] pu2_norm_adjust
|
||||
* pointer to inverse scale matrix
|
||||
*
|
||||
* @param[in] u4_qp_div_6
|
||||
* Floor (qp/6)
|
||||
*
|
||||
* @param[in] pi4_tmp
|
||||
* temporary buffer of size 1*16
|
||||
*
|
||||
* @returns none
|
||||
*
|
||||
* @remarks none
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
void ih264_iquant_itrans_recon_4x4_avx2(WORD16 *pi2_src,
|
||||
UWORD8 *pu1_pred,
|
||||
UWORD8 *pu1_out,
|
||||
WORD32 pred_strd,
|
||||
WORD32 out_strd,
|
||||
const UWORD16 *pu2_iscal_mat,
|
||||
const UWORD16 *pu2_weigh_mat,
|
||||
UWORD32 u4_qp_div_6,
|
||||
WORD16 *pi2_tmp,
|
||||
WORD32 iq_start_idx,
|
||||
WORD16 *pi2_dc_ld_addr)
|
||||
{
|
||||
|
||||
UWORD32 *pu4_out = (UWORD32 *) pu1_out;
|
||||
__m256i src_r0_r1, src_r2_r3;
|
||||
__m256i src_r0, src_r1, src_r2, src_r3;
|
||||
__m256i scalemat_r0_r1, scalemat_r2_r3;
|
||||
__m128i pred_r0, pred_r1, pred_r2, pred_r3;
|
||||
__m256i sign_reg, dequant_r0_r1, dequant_r2_r3;
|
||||
__m256i zero_8x32b = _mm256_setzero_si256(); // all bits reset to zero
|
||||
__m256i temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
|
||||
__m256i resq_r0, resq_r1, resq_r2, resq_r3;
|
||||
__m256i add_rshift = _mm256_set1_epi32((u4_qp_div_6 < 4) ? (1 << (3 - u4_qp_div_6)) : 0);
|
||||
__m256i value_32 = _mm256_set1_epi32(32);
|
||||
__m128i value_32_128 = _mm_set1_epi32(32);
|
||||
__m128i r0,r1,r2,r3,t0,t1,t2,t3,t4,t5,t6,t7,sign_reg_128, de_r0_r1,de_r2_r3,r0_r1,r2_r3,scale_r0_r1,scale_r2_r3;
|
||||
__m128i zero_8x16b_128 = _mm_setzero_si128();
|
||||
UNUSED (pi2_tmp);
|
||||
|
||||
/*************************************************************/
|
||||
/* Dequantization of coefficients. Will be replaced by SIMD */
|
||||
/* operations on platform */
|
||||
/*************************************************************/
|
||||
src_r0_r1 = _mm256_loadu_si256((__m256i *) (pi2_src)); //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row
|
||||
scalemat_r0_r1 = _mm256_loadu_si256((__m256i *) (pu2_iscal_mat)); //b00 b01 b02 b03 b10 b11 b12 b13 -- the scaling matrix 0th,1st row
|
||||
dequant_r0_r1 = _mm256_loadu_si256((__m256i *) (pu2_weigh_mat)); //q00 q01 q02 q03 q10 q11 q12 q13 -- all 16 bits
|
||||
|
||||
temp0 = _mm256_mullo_epi16(scalemat_r0_r1, dequant_r0_r1);
|
||||
temp4 = _mm256_unpacklo_epi16(temp0, zero_8x32b); // b00*q00 0 b01*q01 0 b02*q02 0 b03*q03 0 -- 16 bit long
|
||||
temp5 = _mm256_unpackhi_epi16(temp0, zero_8x32b); // b10*q10 0 b11*q11 0 b12*q12 0 b13*q13 0 -- 16 bit long
|
||||
|
||||
src_r0 = _mm256_unpacklo_epi16(src_r0_r1, zero_8x32b); // a00 0 a01 0 a02 0 a03 0 -- 16 bit long
|
||||
src_r1 = _mm256_unpackhi_epi16(src_r0_r1, zero_8x32b); // a10 0 a11 0 a12 0 a13 0 -- 16 bit long
|
||||
|
||||
temp0 = _mm256_madd_epi16(src_r0, temp4); //a00*b00*q00 a10*b10*q10 a20*b20*q20 a30*b30 q30 -- 32 bits long
|
||||
temp1 = _mm256_madd_epi16(src_r1, temp5);
|
||||
|
||||
if (u4_qp_div_6 >= 4) {
|
||||
resq_r0 = _mm256_slli_epi32(temp0, u4_qp_div_6 - 4);
|
||||
resq_r1 = _mm256_slli_epi32(temp1, u4_qp_div_6 - 4);
|
||||
} else {
|
||||
temp4 = _mm256_add_epi32(temp0, add_rshift);
|
||||
temp5 = _mm256_add_epi32(temp1, add_rshift);
|
||||
resq_r0 = _mm256_srai_epi32(temp0, 4 - u4_qp_div_6);
|
||||
resq_r1 = _mm256_srai_epi32(temp1, 4 - u4_qp_div_6);
|
||||
}
|
||||
|
||||
if (iq_start_idx == 1)
|
||||
resq_r0 = _mm256_insert_epi32(resq_r0,(WORD32)pi2_dc_ld_addr[0],0);
|
||||
/* Perform Inverse transform */
|
||||
|
||||
/* Perform Inverse transform */
|
||||
/*-------------------------------------------------------------*/
|
||||
/* IDCT [ Horizontal transformation ] */
|
||||
/*-------------------------------------------------------------*/
|
||||
// Matrix transpose
|
||||
/*
|
||||
* a0 a1 a2 a3
|
||||
* b0 b1 b2 b3
|
||||
* c0 c1 c2 c3
|
||||
* d0 d1 d2 d3
|
||||
*/
|
||||
|
||||
temp0 = _mm256_unpacklo_epi32(resq_r0, resq_r1); //a0 c0 a1 c1 b0 d0 b1 d1
|
||||
temp1 = _mm256_unpackhi_epi32(resq_r0, resq_r1); //a2 c2
|
||||
|
||||
resq_r0 = _mm256_permute2f128_si256(temp0, temp1, 0x20);
|
||||
resq_r1 = _mm256_permute2f128_si256(temp0, temp1, 0x31);
|
||||
|
||||
temp0 = _mm256_unpacklo_epi64(resq_r0, resq_r1); // w0 w2
|
||||
temp1 = _mm256_unpackhi_epi64(resq_r0, resq_r1); // w1 w3
|
||||
|
||||
resq_r0 = _mm256_permute2f128_si256(temp0, temp1, 0x20);
|
||||
resq_r1 = _mm256_permute2f128_si256(temp0, temp1, 0x31);
|
||||
|
||||
r0 = _mm256_extracti128_si256(resq_r0, 0x0);
|
||||
r1 = _mm256_extracti128_si256(resq_r0, 0x1);
|
||||
r2 = _mm256_extracti128_si256(resq_r1, 0x0);
|
||||
r3 = _mm256_extracti128_si256(resq_r1, 0x1);
|
||||
|
||||
//Transform starts -- horizontal transform
|
||||
/*------------------------------------------------------------------*/
|
||||
/* z0 = w0 + w2 */
|
||||
t0 = _mm_add_epi32(r0, r2);
|
||||
/* z1 = w0 - w2 */
|
||||
t1 = _mm_sub_epi32(r0, r2);
|
||||
/* z2 = (w1 >> 1) - w3 */
|
||||
t2 = _mm_srai_epi32(r1, 1); //(w1>>1)
|
||||
t2 = _mm_sub_epi32(t2, r3); //(w1>>1) - w3
|
||||
/* z3 = w1 + (w3 >> 1) */
|
||||
t3 = _mm_srai_epi32(r3, 1); //(w3>>1) + w1
|
||||
t3 = _mm_add_epi32(t3, r1);
|
||||
/*----------------------------------------------------------*/
|
||||
/* x0 = z0 + z3 */
|
||||
r0 = _mm_add_epi32(t0, t3);
|
||||
/* x1 = z1 + z2 */
|
||||
r1 = _mm_add_epi32(t1, t2);
|
||||
/* x2 = z1 - z2 */
|
||||
r2 = _mm_sub_epi32(t1, t2);
|
||||
/* x3 = z0 - z3 */
|
||||
r3 = _mm_sub_epi32(t0, t3);
|
||||
|
||||
t1 = _mm_unpacklo_epi32(r0, r1); //a0 a1 b0 b1
|
||||
t3 = _mm_unpacklo_epi32(r2, r3); //a2 a3 b2 b3
|
||||
t2 = _mm_unpackhi_epi32(r0, r1); //c0 c1 d0 d1
|
||||
t4 = _mm_unpackhi_epi32(r2, r3); //c2 c3 d2 d3
|
||||
r0 = _mm_unpacklo_epi64(t1, t3); //a0 a1 a2 a3
|
||||
r1 = _mm_unpackhi_epi64(t1, t3); //b0 b1 b2 b3
|
||||
r2 = _mm_unpacklo_epi64(t2, t4); //c0 c1 c2 c3
|
||||
r3 = _mm_unpackhi_epi64(t2, t4); //d0 d1 d2 d3
|
||||
|
||||
//Transform ends -- horizontal transform
|
||||
|
||||
//Load pred buffer
|
||||
pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0])); //p00 p01 p02 p03 0 0 0 0 0 0 0 0 -- all 8 bits
|
||||
pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[pred_strd])); //p10 p11 p12 p13 0 0 0 0 0 0 0 0 -- all 8 bits
|
||||
pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * pred_strd])); //p20 p21 p22 p23 0 0 0 0 0 0 0 0 -- all 8 bits
|
||||
pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * pred_strd])); //p30 p31 p32 p33 0 0 0 0 0 0 0 0 -- all 8 bits
|
||||
|
||||
pred_r0 = _mm_cvtepu8_epi32(pred_r0); //p00 p01 p02 p03 -- all 32 bits
|
||||
pred_r1 = _mm_cvtepu8_epi32(pred_r1); //p10 p11 p12 p13 -- all 32 bits ///Need to look
|
||||
pred_r2 = _mm_cvtepu8_epi32(pred_r2); //p20 p21 p22 p23 -- all 32 bits
|
||||
pred_r3 = _mm_cvtepu8_epi32(pred_r3); //p30 p31 p32 p33 -- all 32 bits
|
||||
|
||||
/*--------------------------------------------------------------*/
|
||||
/* IDCT [ Vertical transformation] and Xij = (xij + 32)>>6 */
|
||||
/* */
|
||||
/* Add the prediction and store it back to same buffer */
|
||||
/*--------------------------------------------------------------*/
|
||||
|
||||
|
||||
t0 = _mm_add_epi32(r0, r2);
|
||||
/* z1j = y0j - y2j */
|
||||
t1 = _mm_sub_epi32(r0, r2);
|
||||
/* z2j = (y1j>>1) - y3j */
|
||||
t2 = _mm_srai_epi32(r1, 1); //(y1j>>1)
|
||||
t2 = _mm_sub_epi32(t2, r3);
|
||||
/* z3j = y1j + (y3j>>1) */
|
||||
t3 = _mm_srai_epi32(r3, 1); //(y3j>>1)
|
||||
t3 = _mm_add_epi32(r1, t3);
|
||||
|
||||
|
||||
t4 = _mm_add_epi32(t0, t3);
|
||||
t4 = _mm_add_epi32(t4, value_32_128);
|
||||
t4 = _mm_srai_epi32(t4, 6);
|
||||
t4 = _mm_add_epi32(t4, pred_r0);
|
||||
|
||||
t5 = _mm_add_epi32(t1, t2);
|
||||
t5 = _mm_add_epi32(t5, value_32_128);
|
||||
t5 = _mm_srai_epi32(t5, 6);
|
||||
t5 = _mm_add_epi32(t5, pred_r1);
|
||||
|
||||
t6 = _mm_sub_epi32(t1, t2);
|
||||
t6 = _mm_add_epi32(t6, value_32_128);
|
||||
t6 = _mm_srai_epi32(t6, 6);
|
||||
t6 = _mm_add_epi32(t6, pred_r2);
|
||||
|
||||
t7 = _mm_sub_epi32(t0, t3);
|
||||
t7 = _mm_add_epi32(t7, value_32_128);
|
||||
t7 = _mm_srai_epi32(t7, 6);
|
||||
t7 = _mm_add_epi32(t7, pred_r3);
|
||||
|
||||
|
||||
// 32-bit to 16-bit conversion
|
||||
t0 = _mm_packs_epi32(t4, t5);
|
||||
t1 = _mm_packs_epi32(t6, t7);
|
||||
|
||||
|
||||
//Clipping the results to 8 bits
|
||||
sign_reg_128 = _mm_cmpgt_epi16(t0, zero_8x16b_128); // sign check
|
||||
t0 = _mm_and_si128(t0, sign_reg_128);
|
||||
sign_reg_128 = _mm_cmpgt_epi16(t1, zero_8x16b_128);
|
||||
t1 = _mm_and_si128(t1, sign_reg_128);
|
||||
|
||||
r0 = _mm_packus_epi16(t0, t1);
|
||||
r1 = _mm_srli_si128(r0, 4);
|
||||
r2 = _mm_srli_si128(r1, 4);
|
||||
r3 = _mm_srli_si128(r2, 4);
|
||||
|
||||
*pu4_out = _mm_cvtsi128_si32(r0);
|
||||
pu1_out += out_strd;
|
||||
pu4_out = (UWORD32 *) (pu1_out);
|
||||
*(pu4_out) = _mm_cvtsi128_si32(r1);
|
||||
pu1_out += out_strd;
|
||||
pu4_out = (UWORD32 *) (pu1_out);
|
||||
*(pu4_out) = _mm_cvtsi128_si32(r2);
|
||||
pu1_out += out_strd;
|
||||
pu4_out = (UWORD32 *) (pu1_out);
|
||||
*(pu4_out) = _mm_cvtsi128_si32(r3);
|
||||
}
|
||||
501
common/x86/ih264_weighted_pred_avx2.c
Normal file
501
common/x86/ih264_weighted_pred_avx2.c
Normal file
|
|
@ -0,0 +1,501 @@
|
|||
/******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2015 The Android Open Source Project
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at:
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*****************************************************************************
|
||||
* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
|
||||
*/
|
||||
/*****************************************************************************/
|
||||
/*****************************************************************************/
|
||||
/* File Includes */
|
||||
/*****************************************************************************/
|
||||
|
||||
#include <immintrin.h>
|
||||
#include "ih264_typedefs.h"
|
||||
#include "ih264_macros.h"
|
||||
#include "ih264_platform_macros.h"
|
||||
#include "ih264_weighted_pred.h"
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
|
||||
/*****************************************************************************/
|
||||
/* */
|
||||
/* Function Name : ih264_weighted_bi_pred_luma_avx2 */
|
||||
/* */
|
||||
/* Description : This function performs the weighted biprediction as */
|
||||
/* described in sec 8.4.2.3.2 titled "Weighted sample */
|
||||
/* prediction process" for luma. The function gets two */
|
||||
/* ht x wd blocks, weights them, adds them, rounds off the */
|
||||
/* sum, offsets it, saturates it to unsigned 8-bit and */
|
||||
/* stores it in the destination block. (ht,wd) can be */
|
||||
/* (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16). */
|
||||
/* */
|
||||
/* Inputs : pu1_src1 - Pointer to source 1 */
|
||||
/* pu1_src2 - Pointer to source 2 */
|
||||
/* pu1_dst - Pointer to destination */
|
||||
/* src_strd1 - stride for source 1 */
|
||||
/* src_strd2 - stride for source 2 */
|
||||
/* dst_strd - stride for destination */
|
||||
/* log_wd - number of bits to be rounded off */
|
||||
/* wt1 - weight value for source 1 */
|
||||
/* wt2 - weight value for source 2 */
|
||||
/* ofst1 - offset value for source 1 */
|
||||
/* ofst2 - offset value for source 2 */
|
||||
/* ht - height of the block */
|
||||
/* wd - width of the block */
|
||||
/* */
|
||||
/* Issues : None */
|
||||
/* */
|
||||
/* Revision History: */
|
||||
/* */
|
||||
/* DD MM YYYY Author(s) Changes */
|
||||
/* 04 02 2015 Kaushik Initial Version */
|
||||
/* Senthoor */
|
||||
/* 15 09 2020 Priyanka Bose AVX2 Intel Intrinsics Support */
|
||||
/*****************************************************************************/
|
||||
void ih264_weighted_bi_pred_luma_avx2(UWORD8 *pu1_src1,
|
||||
UWORD8 *pu1_src2,
|
||||
UWORD8 *pu1_dst,
|
||||
WORD32 src_strd1,
|
||||
WORD32 src_strd2,
|
||||
WORD32 dst_strd,
|
||||
WORD32 log_wd,
|
||||
WORD32 wt1,
|
||||
WORD32 wt2,
|
||||
WORD32 ofst1,
|
||||
WORD32 ofst2,
|
||||
WORD32 ht,
|
||||
WORD32 wd)
|
||||
{
|
||||
|
||||
__m256i wt1_8x32b, wt2_8x32b;
|
||||
__m256i ofst_8x32b, round_8x32b;
|
||||
__m256i zero;
|
||||
zero = _mm256_set1_epi8(0);
|
||||
|
||||
WORD32 ofst;
|
||||
WORD32 round_val, shft;
|
||||
|
||||
wt1 = (WORD16)(wt1 & 0xffff);
|
||||
wt2 = (WORD16)(wt2 & 0xffff);
|
||||
round_val = 1 << log_wd;
|
||||
shft = log_wd + 1;
|
||||
ofst1 = (WORD8)(ofst1 & 0xff);
|
||||
ofst2 = (WORD8)(ofst2 & 0xff);
|
||||
ofst = (ofst1 + ofst2 + 1) >> 1;
|
||||
|
||||
wt1_8x32b = _mm256_set1_epi16(wt1);
|
||||
wt2_8x32b = _mm256_set1_epi16(wt2);
|
||||
round_8x32b = _mm256_set1_epi16(round_val);
|
||||
ofst_8x32b = _mm256_set1_epi16(ofst);
|
||||
|
||||
|
||||
if(wd == 4)
|
||||
{
|
||||
__m128i y1_2_16x8b, y1_3_16x8b;
|
||||
__m128i y2_2_16x8b, y2_3_16x8b;
|
||||
|
||||
__m256i y1_02_32x8b,y1_13_32x8b,y2_02_32x8b,y2_13_32x8b,y1_0_32x8b,y2_0_32x8b,y1_0_8x32b,y2_1_8x32b,y2_0_8x32b;
|
||||
__m128i y1_0_16x8b_128,y2_0_16x8b_128,y1_1_16x8b_128,y1_2_16x8b_128,y1_3_16x8b_128;
|
||||
|
||||
do
|
||||
{
|
||||
y1_02_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src1 + (src_strd1 << 1)), (__m128i *)(pu1_src1));
|
||||
y1_13_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src1 + src_strd1 * 3), (__m128i *)(pu1_src1 + src_strd1));
|
||||
|
||||
y2_02_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src2 + (src_strd2 << 1)), (__m128i *)(pu1_src2));
|
||||
y2_13_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src2 + src_strd2 * 3), (__m128i *)(pu1_src2 + src_strd2));
|
||||
|
||||
y1_02_32x8b = _mm256_unpacklo_epi64(y1_02_32x8b, zero);
|
||||
y1_13_32x8b = _mm256_unpacklo_epi64(y1_13_32x8b, zero);
|
||||
y2_02_32x8b = _mm256_unpacklo_epi64(y2_02_32x8b, zero);
|
||||
y2_13_32x8b = _mm256_unpacklo_epi64(y2_13_32x8b, zero);
|
||||
|
||||
y1_0_32x8b = _mm256_unpacklo_epi32(y1_02_32x8b, y1_13_32x8b);
|
||||
y2_0_32x8b = _mm256_unpacklo_epi32(y2_02_32x8b, y2_13_32x8b);
|
||||
y1_0_16x8b_128 = _mm256_castsi256_si128(_mm256_permute4x64_epi64(y1_0_32x8b, 0xD8));
|
||||
y2_0_16x8b_128 = _mm256_castsi256_si128(_mm256_permute4x64_epi64(y2_0_32x8b, 0xD8));
|
||||
|
||||
y1_0_8x32b = _mm256_cvtepu8_epi16(y1_0_16x8b_128); // 8 to 16
|
||||
y2_0_8x32b = _mm256_cvtepu8_epi16(y2_0_16x8b_128);
|
||||
|
||||
y1_0_8x32b = _mm256_mullo_epi16(y1_0_8x32b, wt1_8x32b);
|
||||
y2_0_8x32b = _mm256_mullo_epi16(y2_0_8x32b, wt2_8x32b);
|
||||
|
||||
y1_0_8x32b = _mm256_adds_epi16(y1_0_8x32b, y2_0_8x32b);
|
||||
|
||||
y1_0_8x32b = _mm256_srai_epi16(y1_0_8x32b, shft);
|
||||
|
||||
y1_0_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_0_8x32b);
|
||||
|
||||
y1_0_16x8b_128 = _mm256_castsi256_si128(_mm256_packus_epi16(y1_0_8x32b, y1_0_8x32b));
|
||||
y1_2_16x8b_128 = _mm_srli_si128(y1_0_16x8b_128, 4);
|
||||
y1_1_16x8b_128 = _mm_srli_si128(y1_0_16x8b_128, 8);
|
||||
y1_3_16x8b_128 = _mm_srli_si128(y1_0_16x8b_128, 12);
|
||||
|
||||
*((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y1_0_16x8b_128);
|
||||
*((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y1_1_16x8b_128);
|
||||
*((WORD32 *)(pu1_dst + (dst_strd << 1))) = _mm_cvtsi128_si32(y1_2_16x8b_128);
|
||||
*((WORD32 *)(pu1_dst + dst_strd * 3)) = _mm_cvtsi128_si32(y1_3_16x8b_128);
|
||||
|
||||
ht -= 4;
|
||||
pu1_src1 += src_strd1 << 2;
|
||||
pu1_src2 += src_strd2 << 2;
|
||||
pu1_dst += dst_strd << 2;
|
||||
}
|
||||
while(ht > 0);
|
||||
}
|
||||
else if(wd == 8)
|
||||
{
|
||||
__m128i y1_0_16x8b_128,y2_0_16x8b_128,y1_2_16x8b_128,y1_1_16x8b_128,y1_3_16x8b_128;
|
||||
__m256i y1_02_32x8b,y1_13_32x8b,y2_02_32x8b,y2_13_32x8b,y1_0_32x8b,y2_0_32x8b,y1_0_8x32b;
|
||||
__m256i y1_1_8x32b,y2_0_8x32b,y2_1_8x32b;
|
||||
|
||||
do
|
||||
{
|
||||
|
||||
y1_02_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src1 + (src_strd1 << 1)), (__m128i *)(pu1_src1));
|
||||
y1_13_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src1 + src_strd1 * 3), (__m128i *)(pu1_src1 + src_strd1));
|
||||
|
||||
y2_02_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src2 + (src_strd2 << 1)), (__m128i *)(pu1_src2));
|
||||
y2_13_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src2 + src_strd2 * 3), (__m128i *)(pu1_src2 + src_strd2));
|
||||
|
||||
y1_02_32x8b = _mm256_unpacklo_epi64(y1_02_32x8b, zero);
|
||||
y1_13_32x8b = _mm256_unpacklo_epi64(y1_13_32x8b, zero);
|
||||
y2_02_32x8b = _mm256_unpacklo_epi64(y2_02_32x8b, zero);
|
||||
y2_13_32x8b = _mm256_unpacklo_epi64(y2_13_32x8b, zero);
|
||||
|
||||
y1_0_32x8b = _mm256_unpacklo_epi64(y1_02_32x8b, y1_13_32x8b);
|
||||
y2_0_32x8b = _mm256_unpacklo_epi64(y2_02_32x8b, y2_13_32x8b);
|
||||
|
||||
y1_0_8x32b = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(y1_0_32x8b));
|
||||
y1_0_16x8b_128 = _mm256_castsi256_si128(_mm256_permute2x128_si256(y1_0_32x8b,y1_0_32x8b,0x1));
|
||||
y1_1_8x32b = _mm256_cvtepu8_epi16(y1_0_16x8b_128);
|
||||
|
||||
y2_0_8x32b = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(y2_0_32x8b));
|
||||
y2_0_16x8b_128 = _mm256_castsi256_si128(_mm256_permute2x128_si256(y2_0_32x8b,y2_0_32x8b,0x1));
|
||||
y2_1_8x32b = _mm256_cvtepu8_epi16(y2_0_16x8b_128);
|
||||
|
||||
|
||||
y1_0_8x32b = _mm256_mullo_epi16(y1_0_8x32b, wt1_8x32b);
|
||||
y2_0_8x32b = _mm256_mullo_epi16(y2_0_8x32b, wt2_8x32b);
|
||||
y1_1_8x32b = _mm256_mullo_epi16(y1_1_8x32b, wt1_8x32b);
|
||||
y2_1_8x32b = _mm256_mullo_epi16(y2_1_8x32b, wt2_8x32b);
|
||||
|
||||
y1_0_8x32b = _mm256_adds_epi16(y1_0_8x32b, y2_0_8x32b);
|
||||
y1_1_8x32b = _mm256_adds_epi16(y1_1_8x32b, y2_1_8x32b);
|
||||
|
||||
y1_0_8x32b = _mm256_srai_epi16(y1_0_8x32b, shft);
|
||||
y1_1_8x32b = _mm256_srai_epi16(y1_1_8x32b, shft);
|
||||
|
||||
y1_0_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_0_8x32b);
|
||||
y1_1_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_1_8x32b);
|
||||
|
||||
y1_0_32x8b = _mm256_packus_epi16(y1_0_8x32b, y1_1_8x32b);
|
||||
y1_0_16x8b_128 = _mm256_castsi256_si128(y1_0_32x8b);
|
||||
y1_2_16x8b_128 = _mm256_castsi256_si128(_mm256_srli_si256(y1_0_32x8b, 8));
|
||||
|
||||
y1_0_32x8b = _mm256_permute2x128_si256(y1_0_32x8b,y1_0_32x8b,1);
|
||||
y1_1_16x8b_128 = _mm256_castsi256_si128(y1_0_32x8b);
|
||||
y1_3_16x8b_128 = _mm256_castsi256_si128(_mm256_srli_si256(y1_0_32x8b, 8));
|
||||
|
||||
_mm_storel_epi64((__m128i *)pu1_dst, y1_0_16x8b_128);
|
||||
_mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b_128);
|
||||
_mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd << 1)), y1_2_16x8b_128);
|
||||
_mm_storel_epi64((__m128i *)(pu1_dst + dst_strd * 3), y1_3_16x8b_128);
|
||||
|
||||
ht -= 4;
|
||||
pu1_src1 += src_strd1 << 2;
|
||||
pu1_src2 += src_strd2 << 2;
|
||||
pu1_dst += dst_strd << 2;
|
||||
|
||||
}
|
||||
while(ht > 0);
|
||||
}
|
||||
else // wd == 16
|
||||
{
|
||||
__m256i y1_0L_8x32b, y1_0H_8x32b, y1_1L_8x32b, y1_1H_8x32b;
|
||||
__m256i y2_0L_8x32b, y2_0H_8x32b, y2_1L_8x32b, y2_1H_8x32b;
|
||||
|
||||
__m256i zero_32x8b,y1_0_32x8b,y2_0_32x8b;
|
||||
zero_32x8b = _mm256_set1_epi8(0);
|
||||
|
||||
do
|
||||
{
|
||||
|
||||
y1_0_32x8b = _mm256_loadu_si256((__m256i *)pu1_src1);
|
||||
y2_0_32x8b = _mm256_loadu_si256((__m256i *)pu1_src2);
|
||||
|
||||
y1_0L_8x32b = _mm256_unpacklo_epi8(y1_0_32x8b, zero_32x8b);
|
||||
y1_0H_8x32b = _mm256_unpackhi_epi8(y1_0_32x8b, zero_32x8b);
|
||||
|
||||
y2_0L_8x32b = _mm256_unpacklo_epi8(y2_0_32x8b,zero_32x8b);
|
||||
y2_0H_8x32b = _mm256_unpackhi_epi8(y2_0_32x8b, zero_32x8b);
|
||||
|
||||
y1_0L_8x32b = _mm256_mullo_epi16(y1_0L_8x32b, wt1_8x32b);
|
||||
y1_0H_8x32b = _mm256_mullo_epi16(y1_0H_8x32b, wt1_8x32b);
|
||||
|
||||
y2_0L_8x32b = _mm256_mullo_epi16(y2_0L_8x32b, wt2_8x32b);
|
||||
y2_0H_8x32b = _mm256_mullo_epi16(y2_0H_8x32b, wt2_8x32b);
|
||||
|
||||
y1_0L_8x32b = _mm256_adds_epi16(y1_0L_8x32b, y2_0L_8x32b);
|
||||
y1_0H_8x32b = _mm256_adds_epi16(y1_0H_8x32b, y2_0H_8x32b);
|
||||
|
||||
y1_0L_8x32b = _mm256_adds_epi16(round_8x32b, y1_0L_8x32b);
|
||||
y1_0H_8x32b = _mm256_adds_epi16(round_8x32b, y1_0H_8x32b);
|
||||
|
||||
y1_0L_8x32b = _mm256_srai_epi16(y1_0L_8x32b, shft);
|
||||
y1_0H_8x32b = _mm256_srai_epi16(y1_0H_8x32b, shft);
|
||||
|
||||
y1_0L_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_0L_8x32b);
|
||||
y1_0H_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_0H_8x32b);
|
||||
|
||||
y1_0_32x8b = _mm256_packus_epi16(y1_0L_8x32b, y1_0H_8x32b);
|
||||
|
||||
_mm256_storeu_si256((__m256i *)pu1_dst, y1_0_32x8b);
|
||||
|
||||
ht -= 2;
|
||||
pu1_src1 += src_strd1 << 1;
|
||||
pu1_src2 += src_strd2 << 1;
|
||||
pu1_dst += dst_strd << 1;
|
||||
}
|
||||
while(ht > 0);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*****************************************************************************/
|
||||
/* */
|
||||
/* Function Name : ih264_weighted_bi_pred_chroma_avx2 */
|
||||
/* */
|
||||
/* Description : This function performs the weighted biprediction as */
|
||||
/* described in sec 8.4.2.3.2 titled "Weighted sample */
|
||||
/* prediction process" for chroma. The function gets two */
|
||||
/* ht x wd blocks, weights them, adds them, rounds off the */
|
||||
/* sum, offsets it, saturates it to unsigned 8-bit and */
|
||||
/* stores it in the destination block. (ht,wd) can be */
|
||||
/* (2,2), (4,2), (2,4), (4,4), (8,4), (4,8) or (8,8). */
|
||||
/* */
|
||||
/* Inputs : pu1_src1 - Pointer to source 1 */
|
||||
/* pu1_src2 - Pointer to source 2 */
|
||||
/* pu1_dst - Pointer to destination */
|
||||
/* src_strd1 - stride for source 1 */
|
||||
/* src_strd2 - stride for source 2 */
|
||||
/* dst_strd - stride for destination */
|
||||
/* log_wd - number of bits to be rounded off */
|
||||
/* wt1 - weight values for u and v in source 1 */
|
||||
/* wt2 - weight values for u and v in source 2 */
|
||||
/* ofst1 - offset value for u and v in source 1 */
|
||||
/* ofst2 - offset value for u and v in source 2 */
|
||||
/* ht - height of the block */
|
||||
/* wd - width of the block */
|
||||
/* */
|
||||
/* Issues : None */
|
||||
/* */
|
||||
/* Revision History: */
|
||||
/* */
|
||||
/* DD MM YYYY Author(s) Changes */
|
||||
/* 04 02 2015 Kaushik Initial Version */
|
||||
/* Senthoor */
|
||||
/* 15 09 2020 Priyanka Bose AVX2 Intel Intrinsics Support */
|
||||
/*****************************************************************************/
|
||||
void ih264_weighted_bi_pred_chroma_avx2(UWORD8 *pu1_src1,
|
||||
UWORD8 *pu1_src2,
|
||||
UWORD8 *pu1_dst,
|
||||
WORD32 src_strd1,
|
||||
WORD32 src_strd2,
|
||||
WORD32 dst_strd,
|
||||
WORD32 log_wd,
|
||||
WORD32 wt1,
|
||||
WORD32 wt2,
|
||||
WORD32 ofst1,
|
||||
WORD32 ofst2,
|
||||
WORD32 ht,
|
||||
WORD32 wd)
|
||||
{
|
||||
|
||||
__m128i y1_0_16x8b, y1_1_16x8b;
|
||||
__m128i y2_0_16x8b, y2_1_16x8b;
|
||||
|
||||
__m128i wt1_8x16b, wt2_8x16b;
|
||||
__m128i ofst_8x16b, round_8x16b;
|
||||
|
||||
WORD32 ofst1_u, ofst2_u, ofst_u;
|
||||
WORD32 ofst1_v, ofst2_v, ofst_v;
|
||||
WORD32 round_val, shft, ofst_val,ofst_val_256;
|
||||
|
||||
round_val = 1 << log_wd;
|
||||
shft = log_wd + 1;
|
||||
|
||||
ofst1_u = (WORD8)(ofst1 & 0xff);
|
||||
ofst1_v = (WORD8)(ofst1 >> 8);
|
||||
ofst2_u = (WORD8)(ofst2 & 0xff);
|
||||
ofst2_v = (WORD8)(ofst2 >> 8);
|
||||
|
||||
wt1_8x16b = _mm_set1_epi32(wt1);
|
||||
wt2_8x16b = _mm_set1_epi32(wt2);
|
||||
|
||||
ofst_u = (ofst1_u + ofst2_u + 1) >> 1;
|
||||
ofst_v = (ofst1_v + ofst2_v + 1) >> 1;
|
||||
ofst_val = (ofst_u & 0xffff) | (ofst_v << 16);
|
||||
ofst_val_256 = (ofst_u & 0xffff) | (ofst_v << 16);
|
||||
|
||||
round_8x16b = _mm_set1_epi16(round_val);
|
||||
ofst_8x16b = _mm_set1_epi32(ofst_val);
|
||||
|
||||
if(wd == 2)
|
||||
{
|
||||
__m128i y1_0_8x16b, y2_0_8x16b;
|
||||
|
||||
do
|
||||
{
|
||||
y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1); //Loading 64 bits from diff location
|
||||
y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
|
||||
|
||||
y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
|
||||
y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
|
||||
|
||||
y1_0_16x8b = _mm_unpacklo_epi32(y1_0_16x8b, y1_1_16x8b);
|
||||
y2_0_16x8b = _mm_unpacklo_epi32(y2_0_16x8b, y2_1_16x8b);
|
||||
|
||||
y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
|
||||
y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
|
||||
|
||||
y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
|
||||
y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);
|
||||
|
||||
y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
|
||||
y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);
|
||||
|
||||
y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
|
||||
y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);
|
||||
|
||||
y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_0_8x16b);
|
||||
y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 4);
|
||||
|
||||
*((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y1_0_16x8b);
|
||||
*((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y1_1_16x8b);
|
||||
|
||||
ht -= 2;
|
||||
pu1_src1 += src_strd1 << 1;
|
||||
pu1_src2 += src_strd2 << 1;
|
||||
pu1_dst += dst_strd << 1;
|
||||
}
|
||||
while(ht > 0);
|
||||
}
|
||||
else if(wd == 4)
|
||||
{
|
||||
__m128i y1_0_8x16b, y1_1_8x16b;
|
||||
__m128i y2_0_8x16b, y2_1_8x16b;
|
||||
|
||||
do
|
||||
{
|
||||
y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1); //Loading 64 bits from diff location
|
||||
y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
|
||||
|
||||
y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
|
||||
y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
|
||||
|
||||
y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
|
||||
y1_1_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);
|
||||
|
||||
y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
|
||||
y2_1_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);
|
||||
|
||||
y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
|
||||
y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);
|
||||
y1_1_8x16b = _mm_mullo_epi16(y1_1_8x16b, wt1_8x16b);
|
||||
y2_1_8x16b = _mm_mullo_epi16(y2_1_8x16b, wt2_8x16b);
|
||||
|
||||
y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
|
||||
y1_1_8x16b = _mm_adds_epi16(y1_1_8x16b, y2_1_8x16b);
|
||||
|
||||
y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);
|
||||
y1_1_8x16b = _mm_adds_epi16(round_8x16b, y1_1_8x16b);
|
||||
|
||||
y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
|
||||
y1_1_8x16b = _mm_srai_epi16(y1_1_8x16b, shft);
|
||||
|
||||
y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);
|
||||
y1_1_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1_8x16b);
|
||||
|
||||
y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_1_8x16b);
|
||||
y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 8);
|
||||
|
||||
_mm_storel_epi64((__m128i *)pu1_dst, y1_0_16x8b);
|
||||
_mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);
|
||||
|
||||
ht -= 2;
|
||||
pu1_src1 += src_strd1 << 1;
|
||||
pu1_src2 += src_strd2 << 1;
|
||||
pu1_dst += dst_strd << 1;
|
||||
}
|
||||
while(ht > 0);
|
||||
}
|
||||
else // wd == 8
|
||||
{
|
||||
__m256i y1_0L_8x32b, y1_0H_8x32b, y1_1L_8x32b, y1_1H_8x32b;
|
||||
__m256i y2_0L_8x32b, y2_0H_8x32b, y2_1L_8x32b, y2_1H_8x32b;
|
||||
__m256i y1_0_32x8b,y2_0_32x8b,ofst_8x32b,round_8x32b;
|
||||
__m256i wt1_8x32b, wt2_8x32b;
|
||||
__m256i zero_32x8b;
|
||||
|
||||
wt1_8x32b = _mm256_set1_epi16(wt1);
|
||||
wt2_8x32b = _mm256_set1_epi16(wt2);
|
||||
round_8x32b = _mm256_set1_epi16(round_val);
|
||||
ofst_8x32b = _mm256_set1_epi32(ofst_val_256);
|
||||
zero_32x8b = _mm256_set1_epi8(0);
|
||||
|
||||
do
|
||||
{
|
||||
y1_0_32x8b = _mm256_loadu_si256((__m256i *)pu1_src1);
|
||||
y2_0_32x8b = _mm256_loadu_si256((__m256i *)pu1_src2);
|
||||
y1_0L_8x32b = _mm256_unpacklo_epi8(y1_0_32x8b, zero_32x8b);
|
||||
y1_0H_8x32b = _mm256_unpackhi_epi8(y1_0_32x8b, zero_32x8b);
|
||||
y2_0L_8x32b = _mm256_unpacklo_epi8(y2_0_32x8b, zero_32x8b);
|
||||
y2_0H_8x32b = _mm256_unpackhi_epi8(y2_0_32x8b, zero_32x8b);
|
||||
y1_0L_8x32b = _mm256_mullo_epi16(y1_0L_8x32b, wt1_8x32b);
|
||||
y1_0H_8x32b = _mm256_mullo_epi16(y1_0H_8x32b, wt1_8x32b);
|
||||
|
||||
y2_0L_8x32b = _mm256_mullo_epi16(y2_0L_8x32b, wt2_8x32b);
|
||||
y2_0H_8x32b = _mm256_mullo_epi16(y2_0H_8x32b, wt2_8x32b);
|
||||
|
||||
y1_0L_8x32b = _mm256_adds_epi16(y1_0L_8x32b, y2_0L_8x32b);
|
||||
y1_0H_8x32b = _mm256_adds_epi16(y1_0H_8x32b, y2_0H_8x32b);
|
||||
|
||||
y1_0L_8x32b = _mm256_adds_epi16(round_8x32b, y1_0L_8x32b);
|
||||
y1_0H_8x32b = _mm256_adds_epi16(round_8x32b, y1_0H_8x32b);
|
||||
|
||||
y1_0L_8x32b = _mm256_srai_epi16(y1_0L_8x32b, shft);
|
||||
y1_0H_8x32b = _mm256_srai_epi16(y1_0H_8x32b, shft);
|
||||
|
||||
y1_0L_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_0L_8x32b);
|
||||
y1_0H_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_0H_8x32b);
|
||||
|
||||
|
||||
y1_0_32x8b = _mm256_packus_epi16(y1_0L_8x32b, y1_0H_8x32b);
|
||||
_mm256_storeu_si256((__m256i *)pu1_dst, y1_0_32x8b);
|
||||
|
||||
ht -= 2;
|
||||
pu1_src1 += src_strd1 << 1;
|
||||
pu1_src2 += src_strd2 << 1;
|
||||
pu1_dst += dst_strd << 1;
|
||||
}
|
||||
while(ht > 0);
|
||||
}
|
||||
}
|
||||
|
|
@ -67,5 +67,6 @@ void ih264d_init_function_ptr_sse42(dec_struct_t *ps_codec);
|
|||
|
||||
void ih264d_init_function_ptr_a9q(dec_struct_t *ps_codec);
|
||||
void ih264d_init_function_ptr_av8(dec_struct_t *ps_codec);
|
||||
void ih264d_init_function_ptr_avx2(dec_struct_t *ps_codec);
|
||||
|
||||
#endif /* _IH264D_FUNCTION_SELECTOR_H_ */
|
||||
|
|
|
|||
|
|
@ -46,7 +46,8 @@ else()
|
|||
list(
|
||||
APPEND LIBAVCDEC_SRCS "${AVC_ROOT}/decoder/x86/ih264d_function_selector.c"
|
||||
"${AVC_ROOT}/decoder/x86/ih264d_function_selector_sse42.c"
|
||||
"${AVC_ROOT}/decoder/x86/ih264d_function_selector_ssse3.c")
|
||||
"${AVC_ROOT}/decoder/x86/ih264d_function_selector_ssse3.c"
|
||||
"${AVC_ROOT}/decoder/x86/ih264d_function_selector_avx2.c")
|
||||
endif()
|
||||
|
||||
add_library(libavcdec STATIC ${LIBAVC_COMMON_SRCS} ${LIBAVC_COMMON_ASMS}
|
||||
|
|
|
|||
|
|
@ -54,7 +54,8 @@ else()
|
|||
list(
|
||||
APPEND LIBMVCDEC_ASMS "${AVC_ROOT}/decoder/x86/ih264d_function_selector.c"
|
||||
"${AVC_ROOT}/decoder/x86/ih264d_function_selector_sse42.c"
|
||||
"${AVC_ROOT}/decoder/x86/ih264d_function_selector_ssse3.c")
|
||||
"${AVC_ROOT}/decoder/x86/ih264d_function_selector_ssse3.c"
|
||||
"${AVC_ROOT}/decoder/x86/ih264d_function_selector_avx2.c")
|
||||
endif()
|
||||
|
||||
add_library(libmvcdec STATIC ${LIBAVC_COMMON_SRCS} ${LIBAVC_COMMON_ASMS}
|
||||
|
|
|
|||
|
|
@ -95,7 +95,8 @@ else()
|
|||
"${AVC_ROOT}/decoder/x86/svc/isvcd_iquant_itrans_residual_sse42.c"
|
||||
"${AVC_ROOT}/decoder/x86/svc/isvcd_iquant_itrans_sse42.c"
|
||||
"${AVC_ROOT}/decoder/x86/svc/isvcd_pred_residual_recon_sse42.c"
|
||||
"${AVC_ROOT}/decoder/x86/svc/isvcd_residual_resamp_sse42.c")
|
||||
"${AVC_ROOT}/decoder/x86/svc/isvcd_residual_resamp_sse42.c"
|
||||
"${AVC_ROOT}/decoder/x86/ih264d_function_selector_avx2.c")
|
||||
endif()
|
||||
|
||||
add_library(libsvcdec STATIC ${LIBAVC_COMMON_SRCS} ${LIBAVC_COMMON_ASMS}
|
||||
|
|
|
|||
|
|
@ -52,6 +52,7 @@
|
|||
#include "ih264_error.h"
|
||||
#include "ih264_trans_quant_itrans_iquant.h"
|
||||
#include "ih264_inter_pred_filters.h"
|
||||
#include "ih264_deblk_edge_filters.h"
|
||||
|
||||
#include "ih264d_structs.h"
|
||||
#include "ih264d_function_selector.h"
|
||||
|
|
@ -68,6 +69,11 @@ void ih264d_init_function_ptr(dec_struct_t *ps_codec)
|
|||
case ARCH_X86_SSSE3:
|
||||
ih264d_init_function_ptr_ssse3(ps_codec);
|
||||
break;
|
||||
case ARCH_X86_AVX2:
|
||||
ih264d_init_function_ptr_ssse3(ps_codec);
|
||||
ih264d_init_function_ptr_sse42(ps_codec);
|
||||
ih264d_init_function_ptr_avx2(ps_codec);
|
||||
break;
|
||||
case ARCH_X86_SSE42:
|
||||
default:
|
||||
ih264d_init_function_ptr_ssse3(ps_codec);
|
||||
|
|
@ -83,7 +89,7 @@ void ih264d_init_arch(dec_struct_t *ps_codec)
|
|||
#elif DEFAULT_ARCH == D_ARCH_X86_SSSE3
|
||||
ps_codec->e_processor_arch = ARCH_X86_SSSE3;
|
||||
#elif DEFAULT_ARCH == D_ARCH_X86_AVX2
|
||||
ps_codec->e_processor_arch = D_ARCH_X86_AVX2;
|
||||
ps_codec->e_processor_arch = ARCH_X86_AVX2;
|
||||
#else
|
||||
ps_codec->e_processor_arch = ARCH_X86_GENERIC;
|
||||
#endif
|
||||
|
|
|
|||
102
decoder/x86/ih264d_function_selector_avx2.c
Normal file
102
decoder/x86/ih264d_function_selector_avx2.c
Normal file
|
|
@ -0,0 +1,102 @@
|
|||
/******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2015 The Android Open Source Project
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at:
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*****************************************************************************
|
||||
* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
|
||||
*/
|
||||
/**
|
||||
*******************************************************************************
|
||||
* @file
|
||||
* ih264d_function_selector_avx2.c
|
||||
*
|
||||
* @brief
|
||||
* Contains functions to initialize function pointers of codec context
|
||||
*
|
||||
* @author
|
||||
* Ittiam
|
||||
*
|
||||
* @par List of Functions:
|
||||
* - ih264d_init_function_ptr_avx2
|
||||
*
|
||||
* @remarks
|
||||
* None
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
|
||||
/*****************************************************************************/
|
||||
/* File Includes */
|
||||
/*****************************************************************************/
|
||||
|
||||
/* System Include files */
|
||||
#include <stdio.h>
|
||||
#include <stddef.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
/* User Include files */
|
||||
#include "ih264_typedefs.h"
|
||||
#include "iv.h"
|
||||
#include "ivd.h"
|
||||
#include "ih264_defs.h"
|
||||
#include "ih264_size_defs.h"
|
||||
#include "ih264_error.h"
|
||||
#include "ih264_trans_quant_itrans_iquant.h"
|
||||
#include "ih264_inter_pred_filters.h"
|
||||
#include "ih264_trans_quant_itrans_iquant.h"
|
||||
#include "ih264_weighted_pred.h"
|
||||
#include "ih264d_structs.h"
|
||||
|
||||
#include "ih264_deblk_edge_filters.h"
|
||||
/**
|
||||
*******************************************************************************
|
||||
*
|
||||
* @brief Initialize the intra/inter/transform/deblk function pointers of
|
||||
* codec context
|
||||
*
|
||||
* @par Description: the current routine initializes the function pointers of
|
||||
* codec context basing on the architecture in use
|
||||
*
|
||||
* @param[in] ps_codec
|
||||
* Codec context pointer
|
||||
*
|
||||
* @returns none
|
||||
*
|
||||
* @remarks none
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
void ih264d_init_function_ptr_avx2(dec_struct_t *ps_codec)
{
    /* Override the previously-installed (SSSE3/SSE42) function pointers
       with the AVX2 variants implemented so far. Pointers not assigned
       here keep their earlier initialization.
       BUGFIX: removed the contradictory UNUSED(ps_codec) — the parameter
       is used on every line below. */
    ps_codec->pf_deblk_chroma_vert_bslt4 = ih264_deblk_chroma_vert_bslt4_avx2;
    ps_codec->pf_deblk_chroma_horz_bslt4 = ih264_deblk_chroma_horz_bslt4_avx2;
    ps_codec->pf_deblk_luma_horz_bslt4 = ih264_deblk_luma_horz_bslt4_avx2;
    /* NOTE(review): ih264_deblk_luma_vert_bslt4_avx2 exists but is not
       wired up here — confirm whether that is intentional. */

    /* Sub-pel positions 5, 7, 13, 15 all use the hpel+qpel diagonal path. */
    ps_codec->apf_inter_pred_luma[5] = ih264_inter_pred_luma_horz_qpel_vert_qpel_avx2;
    ps_codec->apf_inter_pred_luma[7] = ih264_inter_pred_luma_horz_qpel_vert_qpel_avx2;
    ps_codec->apf_inter_pred_luma[13] = ih264_inter_pred_luma_horz_qpel_vert_qpel_avx2;
    ps_codec->apf_inter_pred_luma[15] = ih264_inter_pred_luma_horz_qpel_vert_qpel_avx2;
    ps_codec->apf_inter_pred_luma[0] = ih264_inter_pred_luma_copy_avx2;
    ps_codec->pf_inter_pred_chroma = ih264_inter_pred_chroma_avx2;

    ps_codec->pf_ihadamard_scaling_4x4 = ih264_ihadamard_scaling_4x4_avx2;
    ps_codec->pf_iquant_itrans_recon_luma_4x4 = ih264_iquant_itrans_recon_4x4_avx2;

    ps_codec->pf_weighted_bi_pred_luma = ih264_weighted_bi_pred_luma_avx2;
    ps_codec->pf_weighted_bi_pred_chroma = ih264_weighted_bi_pred_chroma_avx2;
    return;
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue