diff --git a/Android.bp b/Android.bp index a8e8a4d..16f518a 100644 --- a/Android.bp +++ b/Android.bp @@ -236,6 +236,56 @@ cc_defaults { }, } +cc_library_static { + name: "libavc_avx2", + defaults: ["libavc_dec_defaults"], + visibility: ["//visibility:private"], + export_include_dirs: [ + "common", + "decoder", + ], + arch: { + x86: { + cflags: [ + "-mavx2", + "-mfma", + ], + srcs: [ + "common/x86/ih264_ihadamard_scaling_avx2.c", + "common/x86/ih264_deblk_chroma_avx2.c", + "common/x86/ih264_deblk_luma_avx2.c", + "common/x86/ih264_iquant_itrans_recon_avx2.c", + "common/x86/ih264_weighted_pred_avx2.c", + "common/x86/ih264_inter_pred_filters_avx2.c", + "decoder/x86/ih264d_function_selector_avx2.c", + ], + }, + x86_64: { + cflags: [ + "-mavx2", + "-mfma", + ], + srcs: [ + "common/x86/ih264_ihadamard_scaling_avx2.c", + "common/x86/ih264_deblk_chroma_avx2.c", + "common/x86/ih264_deblk_luma_avx2.c", + "common/x86/ih264_iquant_itrans_recon_avx2.c", + "common/x86/ih264_weighted_pred_avx2.c", + "common/x86/ih264_inter_pred_filters_avx2.c", + "decoder/x86/ih264d_function_selector_avx2.c", + ], + }, + }, + sanitize: { + blocklist: "libavc_blocklist.txt", + }, + apex_available: [ + "//apex_available:platform", // used by libstagefright_soft_avcdec + "com.android.media.swcodec", + ], + min_sdk_version: "29", +} + cc_library_static { name: "libavcdec", defaults: ["libavc_dec_defaults"], @@ -380,6 +430,10 @@ cc_library_static { "decoder/x86/ih264d_function_selector_sse42.c", "decoder/x86/ih264d_function_selector_ssse3.c", ], + whole_static_libs: [ + "libavc_avx2", + ], + }, x86_64: { @@ -400,6 +454,9 @@ cc_library_static { "decoder/x86/ih264d_function_selector_sse42.c", "decoder/x86/ih264d_function_selector_ssse3.c", ], + whole_static_libs: [ + "libavc_avx2", + ], }, }, diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 85f98ac..003db87 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -8,7 +8,7 @@ function(libavc_add_compile_options) elseif("${SYSTEM_PROCESSOR}" STREQUAL "aarch32") add_compile_options(-march=armv7-a -mfpu=neon) else() - add_compile_options(-msse4.2 -mno-avx) + add_compile_options(-msse4.2 -mavx2 -mfma) endif() add_compile_options(-Wdeclaration-after-statement) @@ -45,8 +45,8 @@ function(libavc_add_definitions) elseif("${SYSTEM_PROCESSOR}" STREQUAL "aarch32") add_definitions(-DARMV7 -DDEFAULT_ARCH=D_ARCH_ARM_A9Q) else() - add_definitions(-DX86 -DX86_LINUX=1 -DDISABLE_AVX2 - -DDEFAULT_ARCH=D_ARCH_X86_SSE42) + add_definitions(-DX86 -DX86_LINUX=1 + -DDEFAULT_ARCH=D_ARCH_X86_SSE42) endif() endfunction() diff --git a/common/common.cmake b/common/common.cmake index 4b3e8bb..a750f09 100644 --- a/common/common.cmake +++ b/common/common.cmake @@ -108,7 +108,13 @@ else() "${AVC_ROOT}/common/x86/ih264_mem_fns_ssse3.c" "${AVC_ROOT}/common/x86/ih264_padding_ssse3.c" "${AVC_ROOT}/common/x86/ih264_resi_trans_quant_sse42.c" - "${AVC_ROOT}/common/x86/ih264_weighted_pred_sse42.c") + "${AVC_ROOT}/common/x86/ih264_weighted_pred_sse42.c" + "${AVC_ROOT}/common/x86/ih264_ihadamard_scaling_avx2.c" + "${AVC_ROOT}/common/x86/ih264_deblk_chroma_avx2.c" + "${AVC_ROOT}/common/x86/ih264_deblk_luma_avx2.c" + "${AVC_ROOT}/common/x86/ih264_iquant_itrans_recon_avx2.c" + "${AVC_ROOT}/common/x86/ih264_weighted_pred_avx2.c" + "${AVC_ROOT}/common/x86/ih264_inter_pred_filters_avx2.c") include_directories(${AVC_ROOT}/common/x86) endif() diff --git a/common/ih264_deblk_edge_filters.h b/common/ih264_deblk_edge_filters.h index 455d4b0..9ab9d9d 100644 --- a/common/ih264_deblk_edge_filters.h +++ b/common/ih264_deblk_edge_filters.h @@ -159,4 +159,12 @@ ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_horz_bslt4_ssse3; ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_vert_bslt4_mbaff_ssse3; ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_horz_bslt4_mbaff_ssse3; + +/* AVX2 */ +ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_vert_bslt4_avx2; +ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_horz_bslt4_avx2; +ih264_deblk_edge_bslt4_ft ih264_deblk_luma_horz_bslt4_avx2; +ih264_deblk_edge_bslt4_ft ih264_deblk_luma_vert_bslt4_avx2; + + #endif /* _IH264_DEBLK_EDGE_FILTERS_H_ */ diff --git a/common/ih264_inter_pred_filters.h b/common/ih264_inter_pred_filters.h index f42222f..4ddc490 100644 --- a/common/ih264_inter_pred_filters.h +++ b/common/ih264_inter_pred_filters.h @@ -138,4 +138,11 @@ ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_qpel_vert_hpel_ssse3; ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_hpel_vert_qpel_ssse3; ih264_inter_pred_chroma_ft ih264_inter_pred_chroma_ssse3; +/* AVX2 Intrinsic Declarations */ + +ih264_inter_pred_chroma_ft ih264_inter_pred_chroma_avx2; +ih264_inter_pred_luma_ft ih264_inter_pred_luma_copy_avx2; +ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_qpel_vert_qpel_avx2; + + #endif /* _IH264_INTER_PRED_FILTERS_H_ */ diff --git a/common/ih264_trans_quant_itrans_iquant.h b/common/ih264_trans_quant_itrans_iquant.h index f629382..bd68490 100644 --- a/common/ih264_trans_quant_itrans_iquant.h +++ b/common/ih264_trans_quant_itrans_iquant.h @@ -231,4 +231,10 @@ ih264_ihadamard_scaling_ft ih264_ihadamard_scaling_4x4_sse42; ih264_hadamard_quant_ft ih264_hadamard_quant_4x4_sse42; ih264_hadamard_quant_ft ih264_hadamard_quant_2x2_uv_sse42; +/*AVX2 Declarations*/ +ih264_ihadamard_scaling_ft ih264_ihadamard_scaling_4x4_avx2; +ih264_ihadamard_scaling_ft ih264_ihadamard_scaling_2x2_uv_avx2; +ih264_iquant_itrans_recon_ft ih264_iquant_itrans_recon_4x4_avx2; + + #endif /* _IH264_TRANS_QUANT_ITRANS_IQUANT_H_ */ diff --git a/common/ih264_weighted_pred.h b/common/ih264_weighted_pred.h index 44a7aa6..5527a37 100644 --- a/common/ih264_weighted_pred.h +++ b/common/ih264_weighted_pred.h @@ -106,5 +106,10 @@ ih264_weighted_pred_ft ih264_weighted_pred_chroma_sse42; ih264_weighted_bi_pred_ft ih264_weighted_bi_pred_luma_sse42; ih264_weighted_bi_pred_ft ih264_weighted_bi_pred_chroma_sse42; + +/* AVX2 */ +ih264_weighted_bi_pred_ft ih264_weighted_bi_pred_luma_avx2; +ih264_weighted_bi_pred_ft ih264_weighted_bi_pred_chroma_avx2; + #endif /* _IH264_WEIGHTED_PRED_H_ */ diff --git a/common/x86/ih264_deblk_chroma_avx2.c b/common/x86/ih264_deblk_chroma_avx2.c new file mode 100644 index 0000000..a473dd8 --- /dev/null +++ b/common/x86/ih264_deblk_chroma_avx2.c @@ -0,0 +1,386 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/*****************************************************************************/ + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* System include files */ +#include + +#ifdef __ANDROID__ +#include "log/log.h" +#include +#endif + +/* User include files */ +#include "ih264_typedefs.h" +#include "ih264_platform_macros.h" +#include "ih264_deblk_edge_filters.h" +#include "ih264_macros.h" + +#include +#include +#include + + + +/*****************************************************************************/ +/* */ +/* Function Name : ih264_deblk_chroma_vert_bslt4_avx2() */ +/* */ +/* Description : This function performs filtering of a chroma block */ +/* vertical edge when the boundary strength is less than 4 */ +/* in high profile. */ +/* */ +/* Inputs : pu1_src - pointer to the src sample q0 of U */ +/* src_strd - source stride */ +/* alpha_cb - alpha value for the boundary in U */ +/* beta_cb - beta value for the boundary in U */ +/* alpha_cr - alpha value for the boundary in V */ +/* beta_cr - beta value for the boundary in V */ +/* u4_bs - packed Boundary strength array */ +/* pu1_cliptab_cb - tc0_table for U */ +/* pu1_cliptab_cr - tc0_table for V */ +/* */ +/* Globals : None */ +/* */ +/* Processing : This operation is described in Sec. 8.7.2.3 under the */ +/* title "Filtering process for edges for bS less than 4" */ +/* in ITU T Rec H.264 with alpha and beta values different */ +/* in U and V. */ +/* */ +/* Outputs : None */ +/* */ +/* Returns : None */ +/* */ +/* Issues : None */ +/* */ +/* Revision History: */ +/* */ +/* DD MM YYYY Author(s) Changes (Describe the changes made) */ +/* 12 02 2015 Naveen Kumar P Initial version */ +/* 15 09 2020 Priyanka Bose AVX2 Intel Intrinsics Support */ +/*****************************************************************************/ + +void ih264_deblk_chroma_vert_bslt4_avx2(UWORD8 *pu1_src, + WORD32 src_strd, + WORD32 alpha_cb, + WORD32 beta_cb, + WORD32 alpha_cr, + WORD32 beta_cr, + UWORD32 u4_bs, + const UWORD8 *pu1_cliptab_cb, + const UWORD8 *pu1_cliptab_cr) +{ + UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/ + UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3; + WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb; + WORD32 beta_cbcr = (beta_cr << 16) + beta_cb; + __m128i linea, lineb, linec, lined, linee, linef, lineg, lineh; + __m256i lineab, linecd, lineef, linegh, lineae, linebf, linecg, linedh; + __m256i temp1, temp2, temp3, temp4; + __m256i t1,t3, t2,t4,pq0_uv_32x8,pq1_uv_32x8,tmp1,tmp2,p0_uv_8x32,q0_uv_8x32; + + __m256i pq0_uv_8x32, pq1_uv_8x32, p1_uv_8x32,pq0_uv_8x32_1,pq0_uv_8x32_2; + __m256i flag_bs, flag1, flag2; + __m256i diff, diff1, alpha_cbcr_32x8, beta_cbcr_32x8, in_macro; + __m256i zero = _mm256_setzero_si256(); + __m256i C0_uv_8x32; + __m256i p0_uv_8x32_1, p0_uv_8x32_2, q0_uv_8x32_1, q0_uv_8x32_2,p0_uv_32x8_1,q0_uv_32x8_1; + + u1_Bs0 = (u4_bs >> 24) & 0xff; + u1_Bs1 = (u4_bs >> 16) & 0xff; + u1_Bs2 = (u4_bs >> 8) & 0xff; + u1_Bs3 = (u4_bs >> 0) & 0xff; + + flag_bs = _mm256_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3,u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, + u1_Bs2, u1_Bs2, u1_Bs2, u1_Bs2, u1_Bs2, u1_Bs2,u1_Bs2, u1_Bs2, + u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1, + u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0,u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0); + flag_bs = _mm256_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s + flag_bs = _mm256_xor_si256(flag_bs, _mm256_set1_epi8(0xFF)); //Invert for required mask + + /* Load and transpose the pixel values */ + lineab = _mm256_loadu2_m128i((__m128i *)(pu1_src_uv - 4 + src_strd), (__m128i *)(pu1_src_uv - 4)); + linecd = _mm256_loadu2_m128i((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), (__m128i *)(pu1_src_uv - 4 + 2 * src_strd)); + lineef = _mm256_loadu2_m128i((__m128i *)(pu1_src_uv - 4 + 5 * src_strd), (__m128i *)(pu1_src_uv - 4 + 4 * src_strd)); + linegh = _mm256_loadu2_m128i((__m128i *)(pu1_src_uv - 4 + 7 * src_strd), (__m128i *)(pu1_src_uv - 4 + 6 * src_strd)); + + temp1 = _mm256_unpacklo_epi64(lineab, zero); //a0 -- a7 000.. b0..b7 000 + temp2 = _mm256_unpacklo_epi64(linecd, zero); + temp3 = _mm256_unpacklo_epi64(lineef, zero); //e0 -- e7 000.. f0..f7 000 + temp4 = _mm256_unpacklo_epi64(linegh, zero); + + temp1 = _mm256_unpacklo_epi16(temp1, temp2); //a0 a1 c0 c1 -- a6 a7 c6 c7 b0 b1 d0 d1.. b6 b7 d6 d7 + temp2 = _mm256_unpacklo_epi16(temp3, temp4); //e0 e1 g0 g1 f0 f1 h0 h1 + + t2 = _mm256_permute2f128_si256(temp1, temp2, 0x20); + t3 = _mm256_permute2f128_si256(temp1, temp2, 0x31); + + tmp1 = _mm256_unpacklo_epi16(t2, t3); //a0 a1 b0 b1 c0 c1 d0 d1 -a2 a3 b2 b3 .... e0 e1 f0 f1 g0 g1 h0 h1 -e2 e3.. + tmp2 = _mm256_unpackhi_epi16(t2, t3); //a4 a5 b4 b5 -a6 a7 b6 b7 + + + temp1 = _mm256_unpacklo_epi8(tmp1,zero); // a0 0 a1 0 b0 0 b1 0 c0 0 c1 0 d0 0 d1 0 - e0 0 e1 0 .. => p1 + temp2 = _mm256_unpackhi_epi8(tmp1,zero); // a2 0 a3 0 => p0 + temp3 = _mm256_unpacklo_epi8(tmp2,zero); //a4 0 a5 0 => q0 + temp4 = _mm256_unpackhi_epi8(tmp2,zero); //a6 0 a7 0 => q1 + + pq1_uv_32x8 = _mm256_packus_epi16(temp1,temp4); // 0213 + pq0_uv_32x8 = _mm256_packus_epi16(temp2,temp3); //0213 + + diff = _mm256_subs_epi16(temp2, temp3); //Condn 1 (p0 -q0) - set (3), set(3) + diff = _mm256_abs_epi16(diff); + alpha_cbcr_32x8 = _mm256_set1_epi32(alpha_cbcr); + flag1 = _mm256_cmpgt_epi16(alpha_cbcr_32x8, diff); + + diff = _mm256_subs_epi16(temp4, temp3); //Condtn 2 (q1 -q0) + diff = _mm256_abs_epi16(diff); + beta_cbcr_32x8 = _mm256_set1_epi32(beta_cbcr); + flag1 = _mm256_and_si256(flag1, _mm256_cmpgt_epi16(beta_cbcr_32x8, diff)); + + + diff = _mm256_subs_epi16(temp1, temp2); //Condtn 3 (p1 -p0) + diff = _mm256_abs_epi16(diff); + flag1 = _mm256_and_si256(flag1, _mm256_cmpgt_epi16(beta_cbcr_32x8, diff)); + + diff = _mm256_subs_epi16(temp3, temp2); //(q0 -p0) + diff = _mm256_slli_epi16(diff, 2); + + diff1 = _mm256_subs_epi16(temp1, temp4); //(p1 -q1) + diff = _mm256_add_epi16(diff, diff1); + + diff = _mm256_add_epi16(diff, _mm256_set1_epi16(4)); + in_macro = _mm256_srai_epi16(diff, 3); + + + C0_uv_8x32 = _mm256_set_epi16(pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1], + pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1], + pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0], + pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0], + pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3], + pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3], + pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2], + pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2]); + + C0_uv_8x32 = _mm256_add_epi16(C0_uv_8x32, _mm256_set1_epi16(1)); + + in_macro = _mm256_min_epi16(C0_uv_8x32, in_macro); //CLIP3 + C0_uv_8x32 = _mm256_subs_epi16(zero, C0_uv_8x32); + in_macro = _mm256_max_epi16(C0_uv_8x32, in_macro); + + p0_uv_8x32_1 = _mm256_add_epi16(temp2, in_macro); + q0_uv_8x32_1 = _mm256_sub_epi16(temp3, in_macro); + + + flag1 = _mm256_and_si256(flag1, flag_bs); + flag1 = _mm256_packs_epi16(flag1, flag1); // 0213 + + pq0_uv_8x32 = _mm256_packus_epi16(p0_uv_8x32_1,q0_uv_8x32_1); //0213 + + pq0_uv_8x32_1 = _mm256_and_si256(pq0_uv_32x8, + _mm256_xor_si256(flag1, _mm256_set1_epi8(0xFF))); + pq0_uv_8x32_2 = _mm256_and_si256(pq0_uv_8x32, flag1); + pq0_uv_32x8 = _mm256_add_epi8(pq0_uv_8x32_1, pq0_uv_8x32_2); + + + t1 = _mm256_unpacklo_epi16(pq1_uv_32x8, pq0_uv_32x8); // temp1 temp3 + t2 = _mm256_unpackhi_epi16(pq1_uv_32x8, pq0_uv_32x8); // temp2 temp4 + + t4 = _mm256_shufflelo_epi16(t2, _MM_SHUFFLE(2, 3, 0, 1)); // pshuflw + t4 = _mm256_shufflehi_epi16(t4, _MM_SHUFFLE(2, 3, 0, 1)); + + lineae = _mm256_unpacklo_epi32(t1, t4); // temp1 temp3 + linecg = _mm256_unpackhi_epi32(t1, t4); // temp2 temp4 + + linea = _mm256_castsi256_si128(lineae); + lineb = _mm256_castsi256_si128(_mm256_srli_si256(lineae, 8)); + lineae = _mm256_permute2f128_si256(lineae, lineae, 0x1); + linee = _mm256_castsi256_si128(lineae); + linef = _mm256_castsi256_si128(_mm256_srli_si256(lineae, 8)); + + + linec = _mm256_castsi256_si128(linecg); + lined = _mm256_castsi256_si128(_mm256_srli_si256(linecg, 8)); + linecg = _mm256_permute2f128_si256(linecg, linecg, 0x1); + lineg = _mm256_castsi256_si128(linecg); + lineh = _mm256_castsi256_si128(_mm256_srli_si256(linecg, 8)); + + _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea); + _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb); + _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec); + _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined); + _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd), linee); + _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd), linef); + _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd), lineg); + _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd), lineh); + +} + +/*****************************************************************************/ +/* */ +/* Function Name : ih264_deblk_chroma_horz_bslt4_avx2() */ +/* */ +/* Description : This function performs filtering of a chroma block */ +/* horizontal edge when the boundary strength is less than */ +/* 4 in high profile. */ +/* */ +/* Inputs : pu1_src - pointer to the src sample q0 of U */ +/* src_strd - source stride */ +/* alpha_cb - alpha value for the boundary in U */ +/* beta_cb - beta value for the boundary in U */ +/* alpha_cr - alpha value for the boundary in V */ +/* beta_cr - beta value for the boundary in V */ +/* u4_bs - packed Boundary strength array */ +/* pu1_cliptab_cb - tc0_table for U */ +/* pu1_cliptab_cr - tc0_table for V */ +/* */ +/* Globals : None */ +/* */ +/* Processing : This operation is described in Sec. 8.7.2.3 under the */ +/* title "Filtering process for edges for bS less than 4" */ +/* in ITU T Rec H.264 with alpha and beta values different */ +/* in U and V. */ +/* */ +/* Outputs : None */ +/* */ +/* Returns : None */ +/* */ +/* Issues : None */ +/* */ +/* Revision History: */ +/* */ +/* DD MM YYYY Author(s) Changes (Describe the changes made) */ +/* 12 02 2015 Naveen Kumar P Initial version */ +/* 12 10 2020 Priyanka Bose AVX2 Intel Intrinsics Support */ +/*****************************************************************************/ +void ih264_deblk_chroma_horz_bslt4_avx2 (UWORD8 *pu1_src, + WORD32 src_strd, + WORD32 alpha_cb, + WORD32 beta_cb, + WORD32 alpha_cr, + WORD32 beta_cr, + UWORD32 u4_bs, + const UWORD8 *pu1_cliptab_cb, + const UWORD8 *pu1_cliptab_cr) +{ + UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/ + WORD16 i16_posP1, i16_posP0, i16_posQ1; + UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3; + + UWORD8 *pu1_HorzPixelUV; /*! < Pointer to the first pixel of the boundary */ + WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb; + WORD32 beta_cbcr = (beta_cr << 16) + beta_cb; + __m256i p0q0_uv_32x8,p1q1_uv_32x8; + __m256i temp1,temp2,temp3,temp4; + __m256i flag_bs, flag1, flag2; + __m256i diff, diff1, alpha_cbcr_32x8, beta_cbcr_32x8, in_macro; + __m256i zero = _mm256_setzero_si256(); + __m256i C0_uv_8x32; + __m256i p0q0_uv_8x32_1, p0q0_uv_8x32_2,res1,res2,p0_uv_8x32_1,q0_uv_8x32_1; + + pu1_HorzPixelUV = pu1_src_uv - (src_strd << 1); + + i16_posQ1 = src_strd; + i16_posP0 = src_strd; + i16_posP1 = 0; + + u1_Bs0 = (u4_bs >> 24) & 0xff; + u1_Bs1 = (u4_bs >> 16) & 0xff; + u1_Bs2 = (u4_bs >> 8) & 0xff; + u1_Bs3 = (u4_bs >> 0) & 0xff; + + flag_bs = _mm256_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, + u1_Bs2, u1_Bs2, u1_Bs2, u1_Bs2, + u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, + u1_Bs2, u1_Bs2,u1_Bs2, u1_Bs2, + u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1, + u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0, + u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1, + u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0); + flag_bs = _mm256_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s + flag_bs = _mm256_xor_si256(flag_bs, _mm256_set1_epi8(0xFF)); //Invert for required mask + + p0q0_uv_32x8 = _mm256_loadu2_m128i((__m128i *)(pu1_src_uv), (__m128i *)(pu1_HorzPixelUV + i16_posP0)); + p1q1_uv_32x8 = _mm256_loadu2_m128i((__m128i *)(pu1_src_uv + i16_posQ1), (__m128i *)(pu1_HorzPixelUV + i16_posP1)); + + res1 = _mm256_permute4x64_epi64(p0q0_uv_32x8,0xD8); + res2 = _mm256_permute4x64_epi64(p1q1_uv_32x8,0xD8); + + temp3 = _mm256_unpacklo_epi8(res1, zero); //p0 l 0 h 0 + temp4 = _mm256_unpackhi_epi8(res1, zero); //q0 + temp1 = _mm256_unpacklo_epi8(res2, zero); //p1 + temp2 = _mm256_unpackhi_epi8(res2, zero); //q1 + + diff = _mm256_subs_epi16(temp3, temp4); //Condn 1 //p0 l h - q0 l h + diff = _mm256_abs_epi16(diff); + alpha_cbcr_32x8 = _mm256_set1_epi32(alpha_cbcr); + flag1 = _mm256_cmpgt_epi16(alpha_cbcr_32x8, diff); + + diff = _mm256_subs_epi16(temp2, temp4); //Condtn 2 + diff = _mm256_abs_epi16(diff); + beta_cbcr_32x8 = _mm256_set1_epi32(beta_cbcr); + flag1 = _mm256_and_si256(flag1, _mm256_cmpgt_epi16(beta_cbcr_32x8, diff)); + + diff = _mm256_subs_epi16(temp1, temp3); //Condtn 3 + diff = _mm256_abs_epi16(diff); + flag1 = _mm256_and_si256(flag1, _mm256_cmpgt_epi16(beta_cbcr_32x8, diff)); + + diff = _mm256_subs_epi16(temp4, temp3); + diff = _mm256_slli_epi16(diff, 2); + diff1 = _mm256_subs_epi16(temp1, temp2); + diff = _mm256_add_epi16(diff, diff1); + diff = _mm256_add_epi16(diff, _mm256_set1_epi16(4)); + in_macro = _mm256_srai_epi16(diff, 3); + + C0_uv_8x32 = _mm256_set_epi16( + pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3], + pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3], + pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2], + pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2], + pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1], + pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1], + pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0], + pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]); + + C0_uv_8x32 = _mm256_add_epi16(C0_uv_8x32, _mm256_set1_epi16(1)); + + in_macro = _mm256_min_epi16(C0_uv_8x32, in_macro); //CLIP3 + C0_uv_8x32 = _mm256_subs_epi16(zero, C0_uv_8x32); + in_macro = _mm256_max_epi16(C0_uv_8x32, in_macro); + + p0_uv_8x32_1 = _mm256_add_epi16(temp3, in_macro); + q0_uv_8x32_1 = _mm256_sub_epi16(temp4, in_macro); + + p0q0_uv_8x32_2 = _mm256_packus_epi16(p0_uv_8x32_1,q0_uv_8x32_1); + flag1 = _mm256_packs_epi16(flag1, flag1); + flag1 = _mm256_and_si256(flag1, flag_bs); //Final flag (BS condition + other 3 conditions) + + p0q0_uv_8x32_1 = _mm256_and_si256(res1, + _mm256_xor_si256(flag1, _mm256_set1_epi8(0xFF))); + p0q0_uv_8x32_2 = _mm256_and_si256(p0q0_uv_8x32_2, flag1); + p0q0_uv_8x32_1 = _mm256_add_epi8(p0q0_uv_8x32_1, p0q0_uv_8x32_2); + p0q0_uv_8x32_1 = _mm256_permute4x64_epi64(p0q0_uv_8x32_1,0xD8); + + _mm256_storeu2_m128i((__m128i *)(pu1_src_uv),(__m128i *)(pu1_HorzPixelUV + i16_posP0), p0q0_uv_8x32_1); + +} diff --git a/common/x86/ih264_deblk_luma_avx2.c b/common/x86/ih264_deblk_luma_avx2.c new file mode 100644 index 0000000..d585dea --- /dev/null +++ b/common/x86/ih264_deblk_luma_avx2.c @@ -0,0 +1,275 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/*****************************************************************************/ +/* */ +/* File Name : ih264_deblk_luma_avx2.c */ +/* */ +/* Description : Contains function definitions for deblocking */ +/* */ +/* List of Functions : ih264_deblk_luma_horz_bslt4_avx2() */ +/* ih264_deblk_luma_vert_bslt4_avx2() */ +/* */ +/* Issues / Problems : None */ +/* */ +/* Revision History : */ +/* */ +/* DD MM YYYY Author(s) Changes (Describe the changes made) */ +/* 12 02 2015 Naveen Kumar P Added luma deblocking ssse3 */ +/* intrinsics */ +/* 15 09 2020 Priyanka Bose AVX2 Intel Intrinsics Support */ +/*****************************************************************************/ + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* System include files */ +#include +#ifdef __ANDROID__ +#include "log/log.h" +#include +#endif + +/* User include files */ +#include "ih264_typedefs.h" +#include "ih264_platform_macros.h" +#include "ih264_deblk_edge_filters.h" +#include "ih264_macros.h" + + +/*****************************************************************************/ +/* */ +/* Function Name : ih264_deblk_luma_horz_bslt4_avx2() */ +/* */ +/* Description : This function performs filtering of a luma block */ +/* horizontal edge when boundary strength is less than 4. */ +/* */ +/* Inputs : pu1_src - pointer to the src sample q0 */ +/* src_strd - source stride */ +/* alpha - alpha value for the boundary */ +/* beta - beta value for the boundary */ +/* u4_bs - packed Boundary strength array */ +/* pu1_cliptab - tc0_table */ +/* */ +/* Globals : None */ +/* */ +/* Processing : This operation is described in Sec. 8.7.2.3 under the */ +/* title "Filtering process for edges for bS less than 4" */ +/* in ITU T Rec H.264. */ +/* */ +/* Outputs : None */ +/* */ +/* Returns : None */ +/* */ +/* Issues : None */ +/* */ +/* Revision History: */ +/* */ +/* DD MM YYYY Author(s) Changes (Describe the changes made) */ +/* 12 02 2015 Naveen Kumar P Initial version */ +/* 15 09 2020 Priyanka Bose AVX2 Intel Intrinsics Support */ +/*****************************************************************************/ +void ih264_deblk_luma_horz_bslt4_avx2(UWORD8 *pu1_src, + WORD32 src_strd, + WORD32 alpha, + WORD32 beta, + UWORD32 u4_bs, + const UWORD8 *pu1_cliptab) +{ + + WORD16 i16_posP2, i16_posP1, i16_posP0, i16_posQ1, i16_posQ2; + UWORD8 *pu1_HorzPixel; + __m256i zero = _mm256_setzero_si256(); + __m128i zero_128 = _mm_setzero_si128(); + __m128i Alpha_8x16,bs_flag_16x8b, C0_16x8, C0_8x16, C0_hi_8x16; + __m256i Beta_8x32,in_macro_32x8,in_macro_1,in_macro_2,flag1_32x8,flag2_32x8; + __m256i C_8x32,C0_8x32_res,temp1,temp2,temp3,temp4,res1,res2,q0p1_32x8,p0q1_32x8; + __m128i p0_16x8,q0_16x8,temp1_128,temp2_128,flag1_16x8_128; + __m256i const_val4_8x32,p0q0_32x8,p1q1_32x8,p2q2_32x8,q0p0_32x8; + UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3; + UWORD8 clip0, clip1, clip2, clip3; + + pu1_HorzPixel = pu1_src - (src_strd << 2); + + i16_posQ1 = src_strd; + i16_posQ2 = X2(src_strd); + i16_posP0 = X3(src_strd); + i16_posP1 = X2(src_strd); + i16_posP2 = src_strd; + + p0q0_32x8 = _mm256_loadu2_m128i((__m128i *)(pu1_src), (__m128i *)(pu1_HorzPixel + i16_posP0)); //lower -p0 higher-q0 + p1q1_32x8 = _mm256_loadu2_m128i((__m128i *)(pu1_src + i16_posQ1), (__m128i *)(pu1_HorzPixel + i16_posP1)); //l= p1, h=q1 + p2q2_32x8 = _mm256_loadu2_m128i((__m128i *)(pu1_src + i16_posQ2), (__m128i *)(pu1_HorzPixel + i16_posP2)); + + u1_Bs0 = (u4_bs >> 24) & 0xff; + u1_Bs1 = (u4_bs >> 16) & 0xff; + u1_Bs2 = (u4_bs >> 8) & 0xff; + u1_Bs3 = (u4_bs >> 0) & 0xff; + clip0 = pu1_cliptab[u1_Bs0]; + clip1 = pu1_cliptab[u1_Bs1]; + clip2 = pu1_cliptab[u1_Bs2]; + clip3 = pu1_cliptab[u1_Bs3]; + + Alpha_8x16 = _mm_set1_epi16(alpha); + Beta_8x32 = _mm256_set1_epi16(beta); + + bs_flag_16x8b = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, + u1_Bs2, u1_Bs2, u1_Bs2, u1_Bs2, + u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1, + u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0); + + C0_16x8 = _mm_set_epi8(clip3, clip3, clip3, clip3, clip2, clip2, clip2, + clip2, clip1, clip1, clip1, clip1, clip0, clip0, + clip0, clip0); + + bs_flag_16x8b = _mm_cmpeq_epi8(bs_flag_16x8b, zero_128); + bs_flag_16x8b = _mm_xor_si128(bs_flag_16x8b, _mm_set1_epi8(0xFF)); //Invert for required mask + C0_8x16 = _mm_unpacklo_epi8(C0_16x8, zero_128); + C0_hi_8x16 = _mm_unpackhi_epi8(C0_16x8, zero_128); + C0_8x32_res = _mm256_set_m128i(C0_hi_8x16,C0_8x16); + + //Cond1 (ABS(p0 - q0) < alpha) + p0_16x8 = _mm256_castsi256_si128(p0q0_32x8); + q0p0_32x8 = _mm256_permute2x128_si256(p0q0_32x8, p0q0_32x8, 0x1); + q0_16x8 = _mm256_castsi256_si128(p0q0_32x8); + temp1_128 = _mm_subs_epu8(q0_16x8, p0_16x8); + temp2_128 = _mm_subs_epu8(p0_16x8, q0_16x8); + temp1_128 = _mm_add_epi8(temp1_128, temp2_128); + + temp2_128 = _mm_unpacklo_epi8(temp1_128, zero_128); + temp1_128 = _mm_unpackhi_epi8(temp1_128, zero_128); + + temp2_128 = _mm_cmpgt_epi16(Alpha_8x16, temp2_128); + temp1_128 = _mm_cmpgt_epi16(Alpha_8x16, temp1_128); + flag1_16x8_128 = _mm_packs_epi16(temp2_128, temp1_128); + flag1_16x8_128 = _mm_and_si128(flag1_16x8_128, bs_flag_16x8b); + + flag1_32x8 = _mm256_set_m128i(flag1_16x8_128,flag1_16x8_128); + + //Cond2 (ABS(q1 - q0) < beta) & Cond3 (ABS(p1 - p0) < beta) + temp1 = _mm256_subs_epu8(p0q0_32x8, p1q1_32x8); + temp2 = _mm256_subs_epu8(p1q1_32x8, p0q0_32x8); + temp1 = _mm256_add_epi8(temp1, temp2); + + temp2 = _mm256_unpacklo_epi8(temp1, zero); + temp1 = _mm256_unpackhi_epi8(temp1, zero); + + temp2 = _mm256_cmpgt_epi16(Beta_8x32, temp2); + temp1 = _mm256_cmpgt_epi16(Beta_8x32, temp1); + + flag2_32x8 = _mm256_packs_epi16(temp2, temp1); + + //!((ABS(p0 - q0) < alpha) || (ABS(q1 - q0) < beta) || (ABS(p1 - p0) < beta)) + flag1_32x8 = _mm256_and_si256(flag1_32x8, flag2_32x8); + + //(ABS(p2 - p0) < beta) & (ABS(q2 - q0) < beta) + temp1 = _mm256_subs_epu8(p0q0_32x8, p2q2_32x8); + temp2 = _mm256_subs_epu8(p2q2_32x8, p0q0_32x8); + temp1 = _mm256_add_epi8(temp1, temp2); + + temp2 = _mm256_unpacklo_epi8(temp1, zero); + temp1 = _mm256_unpackhi_epi8(temp1, zero); + temp2 = _mm256_cmpgt_epi16(Beta_8x32, temp2); + temp1 = _mm256_cmpgt_epi16(Beta_8x32, temp1); + + flag2_32x8 = _mm256_packs_epi16(temp2, temp1); + flag2_32x8 = _mm256_and_si256(flag1_32x8, flag2_32x8); + + temp2 = _mm256_subs_epi16(zero, temp2); + temp1 = _mm256_subs_epi16(zero, temp1); + + temp3 = _mm256_permute2x128_si256(temp2,temp1,0x20); // low adding + temp4 = _mm256_permute2x128_si256(temp2,temp1,0x31); //high adding + temp2 = _mm256_add_epi16(temp3,temp4); + C_8x32 = _mm256_add_epi16(C0_8x32_res, temp2); // + const_val4_8x32 = _mm256_set1_epi16(4); + + res1 = _mm256_permute4x64_epi64(q0p0_32x8, 0xD8); + res2 = _mm256_permute4x64_epi64(p1q1_32x8, 0xD8); + + temp3 = _mm256_subs_epi16(_mm256_unpacklo_epi8(res1, zero), + _mm256_unpackhi_epi8(res1, zero)); + temp4 = _mm256_subs_epi16(_mm256_unpacklo_epi8(res2, zero), + _mm256_unpackhi_epi8(res2, zero)); + + temp1 = _mm256_slli_epi16(temp3, 2); + temp1 = _mm256_add_epi16(temp1, temp4); + temp1 = _mm256_add_epi16(temp1, const_val4_8x32); + in_macro_32x8 = _mm256_srai_epi16(temp1, 3); + + in_macro_32x8 = _mm256_min_epi16(C_8x32, in_macro_32x8); //CLIP3 + C_8x32 = _mm256_subs_epi16(zero, C_8x32); + in_macro_32x8 = _mm256_max_epi16(C_8x32, in_macro_32x8); //CLIP3 + + temp3 = _mm256_unpacklo_epi8(res1, zero); //q0 + temp4 = _mm256_unpackhi_epi8(res1, zero); //p0 + + temp1 = _mm256_add_epi16(temp4, in_macro_32x8); + temp2 = _mm256_sub_epi16(temp3, in_macro_32x8); + + temp1 = _mm256_packus_epi16(temp2, temp1); // Suffle needed + + temp1 = _mm256_and_si256(temp1, flag1_32x8); //q0 p0 + + temp2 = _mm256_and_si256(res1, + _mm256_xor_si256(flag1_32x8, _mm256_set1_epi16(0xFFFF))); + + temp1 = _mm256_add_epi8(temp1, temp2); + temp1 = _mm256_permute4x64_epi64(temp1, 0xD8); + _mm256_storeu2_m128i((__m128i *)(pu1_HorzPixel + i16_posP0),(__m128i *)(pu1_src),temp1); + + //if(Ap < Beta) if(Aq < Beta) + temp1 = _mm256_avg_epu16(_mm256_unpacklo_epi8(res1, zero), + _mm256_unpackhi_epi8(res1, zero)); + + temp2 = _mm256_slli_epi16(_mm256_unpacklo_epi8(p1q1_32x8, zero), 1); + temp3 = _mm256_subs_epi16(_mm256_unpacklo_epi8(p2q2_32x8, zero), temp2); + + temp2 = _mm256_slli_epi16(_mm256_unpackhi_epi8(p1q1_32x8, zero), 1); + temp2 = _mm256_subs_epi16(_mm256_unpackhi_epi8(p2q2_32x8, zero), temp2); + + temp4 = _mm256_permute2x128_si256(temp3, temp2, 0x20); //p0 q0 + temp3 = _mm256_permute2x128_si256(temp3, temp2, 0x31); + temp4 = _mm256_add_epi16(temp1, temp4); //p + in_macro_1 = _mm256_srai_epi16(temp4, 1); + temp3 = _mm256_add_epi16(temp1, temp3); //q + in_macro_2 = _mm256_srai_epi16(temp3, 1); + + in_macro_1 = _mm256_min_epi16(C0_8x32_res, in_macro_1); //CLIP3 + C0_8x32_res = _mm256_subs_epi16(zero, C0_8x32_res); + in_macro_1 = _mm256_max_epi16(C0_8x32_res, in_macro_1); //CLIP3 + + in_macro_2 = _mm256_max_epi16(C0_8x32_res, in_macro_2); //CLIP3 + C0_8x32_res = _mm256_subs_epi16(zero, C0_8x32_res); + in_macro_2 = _mm256_min_epi16(C0_8x32_res, in_macro_2); //CLIP3 + + temp1 = _mm256_unpacklo_epi8(res2, zero); + temp2 = _mm256_unpackhi_epi8(res2, zero); + + temp1 = _mm256_add_epi16(temp1, in_macro_1); + temp2 = _mm256_add_epi16(temp2, in_macro_2); + temp1 = _mm256_packus_epi16(temp1, temp2); // pl ph ql qh + temp1 = _mm256_and_si256(temp1, flag2_32x8); + temp2 = _mm256_and_si256(res2,_mm256_xor_si256(flag2_32x8, _mm256_set1_epi16(0xFFFF))); + temp1 = _mm256_add_epi8(temp1, temp2); + temp1 = _mm256_permute4x64_epi64(temp1, 0xD8); + _mm256_storeu2_m128i((__m128i *)(pu1_src + i16_posQ1),(__m128i *)(pu1_HorzPixel + i16_posP1),temp1); + +} diff --git a/common/x86/ih264_ihadamard_scaling_avx2.c b/common/x86/ih264_ihadamard_scaling_avx2.c new file mode 100644 index 0000000..40f2dfa --- /dev/null +++ b/common/x86/ih264_ihadamard_scaling_avx2.c @@ -0,0 +1,184 @@ +/****************************************************************************** ++ * ++ * Copyright (C) 2015 The Android Open Source Project ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at: ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ * ++ ***************************************************************************** ++ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore ++*/ +/** ++ ******************************************************************************* ++ * @file ++ * ih264_ihadamard_scaling_avx2.c ++ * ++ * @brief ++ * Contains definition of functions for h264 inverse hadamard 4x4 transform and scaling ++ * ++ * @author ++ * Priyanka ++ * ++ * @par List of Functions: ++ * - ih264_ihadamard_scaling_4x4_avx2() ++ * ++ * @remarks ++ * ++ ******************************************************************************* ++ */ +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* User include files */ +#include "ih264_typedefs.h" +#include "ih264_defs.h" +#include "ih264_trans_macros.h" +#include "ih264_macros.h" +#include "ih264_trans_data.h" +#include "ih264_size_defs.h" +#include "ih264_structs.h" +#include "ih264_trans_quant_itrans_iquant.h" +#include + +/* ++ ******************************************************************************** ++ * ++ * @brief This function performs a 4x4 inverse hadamard transform on the 4x4 DC coefficients ++ * of a 16x16 intra prediction macroblock, and then performs scaling. ++ * prediction buffer ++ * ++ * @par Description: ++ * The DC coefficients pass through a 2-stage inverse hadamard transform. ++ * This inverse transformed content is scaled to based on Qp value. ++ * ++ * @param[in] pi2_src ++ * input 4x4 block of DC coefficients ++ * ++ * @param[out] pi2_out ++ * output 4x4 block ++ * ++ * @param[in] pu2_iscal_mat ++ * pointer to scaling list ++ * ++ * @param[in] pu2_weigh_mat ++ * pointer to weight matrix ++ * ++ * @param[in] u4_qp_div_6 ++ * Floor (qp/6) ++ * ++ * @param[in] pi4_tmp ++ * temporary buffer of size 1*16 ++ * ++ * @returns none ++ * ++ * @remarks none ++ * ++ ******************************************************************************* +*/ + +#include +#include + +#include + +#ifdef __ANDROID__ +#include "log/log.h" +#include +#endif + + +void ih264_ihadamard_scaling_4x4_avx2(WORD16* pi2_src, + WORD16* pi2_out, + const UWORD16 *pu2_iscal_mat, + const UWORD16 *pu2_weigh_mat, + UWORD32 u4_qp_div_6, + WORD32* pi4_tmp) +{ + __m256i src,r0_r1,r2_r3,r3_r2,r1_r3,r0_r2; + __m256i src_r0_r1, src_r2_r3; + __m256i temp0, temp1,tmp0, tmp1, tmp2, tmp3; + __m256i add_rshift = _mm256_set1_epi32((u4_qp_div_6 < 6) ? (1 << (5 - u4_qp_div_6)) : 0); + __m256i mult_val = _mm256_set1_epi32(pu2_iscal_mat[0] * pu2_weigh_mat[0]); + __m256i zero = _mm256_setzero_si256(); + + __m128i t0 ,t1; + UNUSED (pi4_tmp); + + src_r0_r1 = _mm256_loadu_si256((__m256i *) (pi2_src)); //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row + + temp0 = _mm256_unpacklo_epi64(src_r0_r1, zero); + temp1 = _mm256_unpackhi_epi64(src_r0_r1, zero); // b0 b1 b2.. d0 d1... + temp0 = _mm256_unpacklo_epi16(temp0, temp1); + tmp0 = _mm256_permute2x128_si256(temp0,zero,0x20); //tmp0 tmp3 + tmp1 = _mm256_permute2x128_si256(temp0,zero,0x31); //tmp1 tmp2 + + temp0 = _mm256_unpacklo_epi32(tmp0, tmp1); //a0 c0 a1 c1 a2 c2 a3 c3 a0 c0 a1 c1 b0 d0 b1 c1 + temp1 = _mm256_unpackhi_epi32(tmp0, tmp1); + + temp1 = _mm256_shuffle_epi32(temp1,0b01001110); + tmp0 = _mm256_add_epi16(temp0, temp1); + tmp1 = _mm256_sub_epi16(temp0, temp1); + + temp0 = _mm256_unpacklo_epi64(tmp0, tmp1); + temp1 = _mm256_unpackhi_epi64(tmp0, tmp1); + tmp0 = _mm256_add_epi16(temp0, temp1); + tmp1 = _mm256_sub_epi16(temp0, temp1); + + temp0 = _mm256_unpacklo_epi32(tmp0, tmp1); //a0 c0 a1 c1 a2 c2 a3 c3 a0 c0 a1 c1 b0 d0 b1 c1 + temp1 = _mm256_unpackhi_epi32(tmp0, tmp1); + + temp0 = _mm256_unpacklo_epi64(tmp0, tmp1); //a0 c0 a1 c1 a2 c2 a3 c3 a0 c0 a1 c1 b0 d0 b1 c1 + temp1 = _mm256_unpackhi_epi64(tmp0, tmp1); + + tmp0 = _mm256_unpacklo_epi16(temp0, temp1); //a0 c0 a1 c1 a2 c2 a3 c3 a0 c0 a1 c1 b0 d0 b1 c1 + tmp1 = _mm256_unpackhi_epi16(temp0, temp1); + + temp0 = _mm256_unpacklo_epi32(tmp0, tmp1); //a0 c0 a1 c1 a2 c2 a3 c3 a0 c0 a1 c1 b0 d0 b1 c1 + temp1 = _mm256_unpackhi_epi32(tmp0, tmp1); + + temp1 = _mm256_shuffle_epi32(temp1, _MM_SHUFFLE(1, 0, 3, 2)); + tmp0 = _mm256_add_epi16(temp0, temp1); + tmp1 = _mm256_sub_epi16(temp0, temp1); + temp0 = _mm256_unpacklo_epi64(tmp0, tmp1); + temp1 = _mm256_unpackhi_epi64(tmp0, tmp1); + tmp0 = _mm256_add_epi16(temp0, temp1); + tmp1 = _mm256_sub_epi16(temp0, temp1); + + temp0 = _mm256_unpacklo_epi64(tmp0, tmp1); + temp1 = _mm256_unpackhi_epi64(tmp0, tmp1); + + + r0_r1 =_mm256_cvtepi16_epi32(_mm256_castsi256_si128(temp0)); + r2_r3 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(temp1)); + + src_r0_r1 = _mm256_mullo_epi32(r0_r1, mult_val); + src_r2_r3 = _mm256_mullo_epi32(r2_r3, mult_val); + + //Scaling + if(u4_qp_div_6 >= 6) + { + src_r0_r1 = _mm256_slli_epi32(src_r0_r1, u4_qp_div_6 - 6); + src_r2_r3 = _mm256_slli_epi32(src_r2_r3, u4_qp_div_6 - 6); + } + else + { + temp0 = _mm256_add_epi32(src_r0_r1, add_rshift); + temp1 = _mm256_add_epi32(src_r2_r3, add_rshift); + src_r0_r1 = _mm256_srai_epi32(temp0, 6 - u4_qp_div_6); + src_r2_r3 = _mm256_srai_epi32(temp1, 6 - u4_qp_div_6); + } + + src = _mm256_packs_epi32(src_r0_r1, src_r2_r3); + _mm256_storeu_si256((__m256i *) (&pi2_out[0]), src); +} + diff --git a/common/x86/ih264_inter_pred_filters_avx2.c b/common/x86/ih264_inter_pred_filters_avx2.c new file mode 100644 index 0000000..dc0ad91 --- /dev/null +++ b/common/x86/ih264_inter_pred_filters_avx2.c @@ -0,0 +1,959 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/*****************************************************************************/ +/*****************************************************************************/ +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +#ifdef __ANDROID__ +#include "log/log.h" +#include +#endif + +#include +#include "ih264_typedefs.h" +#include "ih264_macros.h" +#include "ih264_platform_macros.h" +#include "ih264_inter_pred_filters.h" + +/*****************************************************************************/ +/* Constant Data variables */ +/*****************************************************************************/ + +/* coefficients for 6 tap filtering*/ +//const WORD32 ih264_g_six_tap[3] ={1,-5,20}; +/*****************************************************************************/ +/* Function definitions . */ +/*****************************************************************************/ +/*****************************************************************************/ +/* */ +/* Function Name : ih264_inter_pred_luma_copy_avx2 */ +/* */ +/* Description : This function copies the contents of ht x wd block from */ +/* source to destination. (ht,wd) can be (4,4), (8,4), */ +/* (4,8), (8,8), (16,8), (8,16) or (16,16). */ +/* */ +/* Inputs : puc_src - pointer to source */ +/* puc_dst - pointer to destination */ +/* src_strd - stride for source */ +/* dst_strd - stride for destination */ +/* ht - height of the block */ +/* wd - width of the block */ +/* */ +/* Issues : None */ +/* */ +/* Revision History: */ +/* */ +/* DD MM YYYY Author(s) Changes */ +/* 13 02 2015 Kaushik Initial Version */ +/* Senthoor */ +/* 15 09 2020 Priyanka Bose AVX2 Intel Intrinsics Support */ +/*****************************************************************************/ +void ih264_inter_pred_luma_copy_avx2(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ht, + WORD32 wd, + UWORD8* pu1_tmp, + WORD32 dydx) +{ + WORD32 src_strd2, src_strd3, src_strd4, dst_strd2, dst_strd3, dst_strd4; + UNUSED(pu1_tmp); + UNUSED(dydx); + + src_strd2 = src_strd << 1; + dst_strd2 = dst_strd << 1; + src_strd4 = src_strd << 2; + dst_strd4 = dst_strd << 2; + src_strd3 = src_strd2 + src_strd; + dst_strd3 = dst_strd2 + dst_strd; + if(wd == 4) + { + do + { + *((WORD32 *)(pu1_dst)) = *((WORD32 *)(pu1_src)); + *((WORD32 *)(pu1_dst + dst_strd)) = *((WORD32 *)(pu1_src + src_strd)); + *((WORD32 *)(pu1_dst + dst_strd2)) = *((WORD32 *)(pu1_src + src_strd2)); + *((WORD32 *)(pu1_dst + dst_strd3)) = *((WORD32 *)(pu1_src + src_strd3)); + + ht -= 4; + pu1_src += src_strd4; + pu1_dst += dst_strd4; + } + while(ht > 0); + } + else if(wd == 8) + { + __m128i y_0_16x8b, y_1_16x8b, y_2_16x8b, y_3_16x8b; + do + { + + y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src); + y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd)); + y_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd2)); + y_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd3)); + + _mm_storel_epi64((__m128i *)pu1_dst, y_0_16x8b); + _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y_1_16x8b); + _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd2), y_2_16x8b); + _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd3), y_3_16x8b); + + ht -= 4; + pu1_src += src_strd4; + pu1_dst += dst_strd4; + } + while(ht > 0); + } + else // wd == 16 + { + __m256i y_0_16x8b, y_1_16x8b, y_2_16x8b, y_3_16x8b; + WORD32 src_strd5, src_strd6, src_strd7, src_strd8; + WORD32 dst_strd5, dst_strd6, dst_strd7, dst_strd8; + + __m256i y_4_16x8b, y_5_16x8b, y_6_16x8b, y_7_16x8b,y_0_1,y_2_3,y_4_5,y_6_7; + + + src_strd5 = src_strd2 + src_strd3; + dst_strd5 = dst_strd2 + dst_strd3; + src_strd6 = src_strd3 << 1; + dst_strd6 = dst_strd3 << 1; + src_strd7 = src_strd3 + src_strd4; + dst_strd7 = dst_strd3 + dst_strd4; + src_strd8 = src_strd << 3; + dst_strd8 = dst_strd << 3; + + do + { + + y_0_1 = _mm256_loadu2_m128i((__m128i *)(pu1_src + src_strd),(__m128i *)pu1_src); + y_2_3 = _mm256_loadu2_m128i((__m128i *)(pu1_src + src_strd3),(__m128i *)(pu1_src + src_strd2)); + y_4_5 = _mm256_loadu2_m128i((__m128i *)(pu1_src + src_strd5),(__m128i *)(pu1_src + src_strd4)); + y_6_7 = _mm256_loadu2_m128i((__m128i *)(pu1_src + src_strd7),(__m128i *)(pu1_src + src_strd6)); + + _mm256_storeu2_m128i((__m128i *)(pu1_dst + dst_strd),(__m128i *)pu1_dst,y_0_1); + _mm256_storeu2_m128i((__m128i *)(pu1_dst + dst_strd3),(__m128i *)(pu1_dst + dst_strd2),y_2_3); + _mm256_storeu2_m128i((__m128i *)(pu1_dst + dst_strd5),(__m128i *)(pu1_dst + dst_strd4),y_4_5); + _mm256_storeu2_m128i((__m128i *)(pu1_dst + dst_strd7),(__m128i *)(pu1_dst + dst_strd6),y_6_7); + + ht -= 8; + pu1_src += src_strd8; + pu1_dst += dst_strd8; + } + while(ht > 0); + } +} + + + + +/*****************************************************************************/ +/* */ +/* Function Name : ih264_inter_pred_chroma_avx2 */ +/* */ +/* Description : This function implements a four-tap 2D filter as */ +/* mentioned in sec. 8.4.2.2.2 titled "Chroma sample */ +/* "interpolation process". (ht,wd) can be (2,2), (4,2), */ +/* (2,4), (4,4), (8,4), (4,8) or (8,8). */ +/* */ +/* Inputs : puc_src - pointer to source */ +/* puc_dst - pointer to destination */ +/* src_strd - stride for source */ +/* dst_strd - stride for destination */ +/* dx - x position of destination value */ +/* dy - y position of destination value */ +/* ht - height of the block */ +/* wd - width of the block */ +/* */ +/* Issues : None */ +/* */ +/* Revision History: */ +/* */ +/* DD MM YYYY Author(s) Changes */ +/* 13 02 2015 Kaushik Initial Version */ +/* Senthoor */ +/* 15 09 2020 Priyanka Bose AVX2 Intel Intrinsics Support */ +/*****************************************************************************/ +void ih264_inter_pred_chroma_avx2(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 dx, + WORD32 dy, + WORD32 ht, + WORD32 wd) +{ + + WORD32 i, j, A, B, C, D; + i = 8 - dx; + j = 8 - dy; + + A = i * j; + B = dx * j; + C = i * dy; + D = dx * dy; + if(wd == 2) + { + WORD32 tmp1, tmp2, tmp3, tmp4; + + do + { + //U + tmp1 = A * pu1_src[0] + B * pu1_src[2] + C * pu1_src[src_strd] + D * pu1_src[src_strd + 2]; + tmp2 = A * pu1_src[2] + B * pu1_src[4] + C * pu1_src[src_strd + 2] + D * pu1_src[src_strd + 4]; + //V + tmp3 = A * pu1_src[1] + B * pu1_src[3] + C * pu1_src[src_strd + 1] + D * pu1_src[src_strd + 3]; + tmp4 = A * pu1_src[3] + B * pu1_src[5] + C * pu1_src[src_strd + 3] + D * pu1_src[src_strd + 5]; + + tmp1 = (tmp1 + 32) >> 6; + tmp2 = (tmp2 + 32) >> 6; + tmp3 = (tmp3 + 32) >> 6; + tmp4 = (tmp4 + 32) >> 6; + + pu1_dst[0] = CLIP_U8(tmp1); + pu1_dst[2] = CLIP_U8(tmp2); + pu1_dst[1] = CLIP_U8(tmp3); + pu1_dst[3] = CLIP_U8(tmp4); + + pu1_src += src_strd; + pu1_dst += dst_strd; + + tmp1 = A * pu1_src[0] + B * pu1_src[2] + C * pu1_src[src_strd] + D * pu1_src[src_strd + 2]; + tmp2 = A * pu1_src[2] + B * pu1_src[4] + C * pu1_src[src_strd + 2] + D * pu1_src[src_strd + 4]; + tmp3 = A * pu1_src[1] + B * pu1_src[3] + C * pu1_src[src_strd + 1] + D * pu1_src[src_strd + 3]; + tmp4 = A * pu1_src[3] + B * pu1_src[5] + C * pu1_src[src_strd + 3] + D * pu1_src[src_strd + 5]; + + tmp1 = (tmp1 + 32) >> 6; + tmp2 = (tmp2 + 32) >> 6; + tmp3 = (tmp3 + 32) >> 6; + tmp4 = (tmp4 + 32) >> 6; + + pu1_dst[0] = CLIP_U8(tmp1); + pu1_dst[2] = CLIP_U8(tmp2); + pu1_dst[1] = CLIP_U8(tmp3); + pu1_dst[3] = CLIP_U8(tmp4); + + ht -= 2; + pu1_src += src_strd; + pu1_dst += dst_strd; + } + while(ht > 0); + + } + else if(wd == 4) + { + WORD32 AB, CD; + + __m256i coeffAB_32x8b, coeffCD_32x8b, round_add32_8x32b; + __m256i const_shuff_32x8b; + + __m256i src_r23_32x8b,src_r12_32x8b,res12_AB_8x32b,res12_CD_8x32b,res1_8x32b; + __m128i res1,src_r1_16x8b_128; + __m128i const_shuff_16x8b_128; + + AB = (B << 8) + A; + CD = (D << 8) + C; + + coeffAB_32x8b = _mm256_set1_epi16(AB); + coeffCD_32x8b = _mm256_set1_epi16(CD); + + round_add32_8x32b = _mm256_set1_epi16(32); + const_shuff_16x8b_128 = _mm_setr_epi32(0x03010200, 0x05030402, 0x07050604, 0x09070806); + const_shuff_32x8b = _mm256_setr_epi32(0x03010200, 0x05030402, 0x07050604, 0x09070806,0x03010200, 0x05030402, 0x07050604, 0x09070806); + + + src_r1_16x8b_128 = _mm_loadu_si128((__m128i *)pu1_src); + src_r1_16x8b_128 = _mm_shuffle_epi8(src_r1_16x8b_128, const_shuff_16x8b_128); + pu1_src += src_strd; + do + { + + src_r23_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src + src_strd),(__m128i *)pu1_src); + src_r23_32x8b = _mm256_shuffle_epi8(src_r23_32x8b, const_shuff_32x8b); + src_r12_32x8b = _mm256_set_m128i(_mm256_castsi256_si128(src_r23_32x8b),src_r1_16x8b_128); + + res12_AB_8x32b = _mm256_maddubs_epi16(src_r12_32x8b, coeffAB_32x8b); + res12_CD_8x32b = _mm256_maddubs_epi16(src_r23_32x8b, coeffCD_32x8b); + + res1_8x32b = _mm256_add_epi16(res12_AB_8x32b, res12_CD_8x32b); + res1_8x32b = _mm256_add_epi16(res1_8x32b, round_add32_8x32b); + + res1_8x32b = _mm256_srai_epi16(res1_8x32b, 6); + + res1_8x32b = _mm256_packus_epi16(res1_8x32b,res1_8x32b); + res1_8x32b = _mm256_permute4x64_epi64(res1_8x32b, 0xD8); + res1 = _mm256_castsi256_si128(res1_8x32b); + _mm_storel_epi64((__m128i *)pu1_dst, res1); + + res1 = _mm_srli_si128(res1, 8); + _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res1); + + src_r1_16x8b_128 = _mm256_castsi256_si128(_mm256_permute2x128_si256(src_r23_32x8b,src_r23_32x8b,0x1));; + + ht -= 2; + pu1_src += src_strd << 1; + pu1_dst += dst_strd << 1; + + } + while(ht > 0); + } + else // wd == 8 + { + + WORD32 AB, CD; + __m256i src_r1lh_32x8b, src_r2lh_32x8b; + + __m256i res_lh_AB_8x32b, res_lh_CD_8x32b,res_1lh_8x32b,res_2lh_8x32b; + __m256i res_lh_8x32b, res_l_8x32b,res_h_8x32b, res_32x8b; + + __m256i coeffAB_32x8b, coeffCD_32x8b, round_add32_8x32b; + __m256i const_shuff_32x8b; + __m256i zero = _mm256_setzero_si256(); + __m128i res_1,res_2; + + AB = (B << 8) + A; + CD = (D << 8) + C; + + coeffAB_32x8b = _mm256_set1_epi16(AB); + coeffCD_32x8b = _mm256_set1_epi16(CD); + + round_add32_8x32b = _mm256_set1_epi16(32); + + const_shuff_32x8b = _mm256_setr_epi32(0x03010200, 0x05030402, 0x07050604, 0x09070806,0x03010200, 0x05030402, 0x07050604, 0x09070806); + + src_r1lh_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src + 8),(__m128i *)pu1_src); + src_r1lh_32x8b = _mm256_shuffle_epi8(src_r1lh_32x8b, const_shuff_32x8b); + pu1_src += src_strd; + + do + { + //row 1 + + src_r2lh_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src + 8),(__m128i *)pu1_src); + src_r2lh_32x8b = _mm256_shuffle_epi8(src_r2lh_32x8b, const_shuff_32x8b); + res_lh_AB_8x32b = _mm256_maddubs_epi16(src_r1lh_32x8b, coeffAB_32x8b); + + res_lh_CD_8x32b = _mm256_maddubs_epi16(src_r2lh_32x8b, coeffCD_32x8b); + res_lh_8x32b = _mm256_add_epi16(res_lh_AB_8x32b, round_add32_8x32b); + res_lh_8x32b = _mm256_add_epi16(res_lh_8x32b, res_lh_CD_8x32b); + res_1lh_8x32b = _mm256_srai_epi16(res_lh_8x32b, 6); + + pu1_src += src_strd; + //row 2 + src_r1lh_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src + 8),(__m128i *)pu1_src); + src_r1lh_32x8b = _mm256_shuffle_epi8(src_r1lh_32x8b, const_shuff_32x8b); + + res_lh_AB_8x32b = _mm256_maddubs_epi16(src_r2lh_32x8b, coeffAB_32x8b); + res_lh_CD_8x32b = _mm256_maddubs_epi16(src_r1lh_32x8b, coeffCD_32x8b); + + res_lh_8x32b = _mm256_add_epi16(res_lh_AB_8x32b, round_add32_8x32b); + res_lh_8x32b = _mm256_add_epi16(res_lh_8x32b, res_lh_CD_8x32b); + + res_2lh_8x32b = _mm256_srai_epi16(res_lh_8x32b, 6); + + res_1lh_8x32b = _mm256_packus_epi16(res_1lh_8x32b, res_2lh_8x32b); + res_1lh_8x32b = _mm256_permute4x64_epi64(res_1lh_8x32b, 0xD8); + _mm256_storeu2_m128i((__m128i *)(pu1_dst + dst_strd),(__m128i *)(pu1_dst),res_1lh_8x32b); + + pu1_src += src_strd; + pu1_dst += dst_strd; + pu1_dst += dst_strd; + + //row 3 + src_r2lh_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src + 8),(__m128i *)pu1_src); + src_r2lh_32x8b = _mm256_shuffle_epi8(src_r2lh_32x8b, const_shuff_32x8b); + + res_lh_AB_8x32b = _mm256_maddubs_epi16(src_r1lh_32x8b, coeffAB_32x8b); + res_lh_CD_8x32b = _mm256_maddubs_epi16(src_r2lh_32x8b, coeffCD_32x8b); + + res_lh_8x32b = _mm256_add_epi16(res_lh_AB_8x32b, round_add32_8x32b); + res_lh_8x32b = _mm256_add_epi16(res_lh_8x32b, res_lh_CD_8x32b); + + res_1lh_8x32b = _mm256_srai_epi16(res_lh_8x32b, 6); + pu1_src += src_strd; + + //row 1 + src_r1lh_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src + 8),(__m128i *)pu1_src); + src_r1lh_32x8b = _mm256_shuffle_epi8(src_r1lh_32x8b, const_shuff_32x8b); + + res_lh_AB_8x32b = _mm256_maddubs_epi16(src_r2lh_32x8b, coeffAB_32x8b); + res_lh_CD_8x32b = _mm256_maddubs_epi16(src_r1lh_32x8b, coeffCD_32x8b); + + res_lh_8x32b = _mm256_add_epi16(res_lh_AB_8x32b, round_add32_8x32b); + res_lh_8x32b = _mm256_add_epi16(res_lh_8x32b, res_lh_CD_8x32b); + + res_2lh_8x32b = _mm256_srai_epi16(res_lh_8x32b, 6); + + res_1lh_8x32b = _mm256_packus_epi16(res_1lh_8x32b, res_2lh_8x32b); + res_1lh_8x32b = _mm256_permute4x64_epi64(res_1lh_8x32b, 0xD8); + _mm256_storeu2_m128i((__m128i *)(pu1_dst + dst_strd),(__m128i *)(pu1_dst),res_1lh_8x32b); + + + ht -= 4; + pu1_src += src_strd; + pu1_dst += dst_strd; + pu1_dst += dst_strd; + } + while(ht > 0); + } +} + +/*****************************************************************************/ +/* */ +/* Function Name : ih264_inter_pred_luma_horz_qpel_vert_qpel_avx2 */ +/* */ +/* Description : This function implements a six-tap filter vertically and */ +/* horizontally on ht x wd block separately and averages */ +/* the two sets of values to calculate values at (1/4,1/4), */ +/* (1/4, 3/4), (3/4, 1/4) or (3/4, 3/4) as mentioned in */ +/* sec. 8.4.2.2.1 titled "Luma sample interpolation */ +/* process". (ht,wd) can be (4,4), (8,4), (4,8), (8,8), */ +/* (16,8), (8,16) or (16,16). */ +/* */ +/* Inputs : puc_src - pointer to source */ +/* puc_dst - pointer to destination */ +/* src_strd - stride for source */ +/* dst_strd - stride for destination */ +/* ht - height of the block */ +/* wd - width of the block */ +/* pu1_tmp - pointer to temporary buffer */ +/* dydx - x and y reference offset for q-pel */ +/* calculations */ +/* */ +/* Issues : None */ +/* */ +/* Revision History: */ +/* */ +/* DD MM YYYY Author(s) Changes */ +/* 13 02 2015 Kaushik Initial Version */ +/* Senthoor */ +/* 15 09 2020 Priyanka Bose AVX2 Intel Intrinsics Support */ +/*****************************************************************************/ +void ih264_inter_pred_luma_horz_qpel_vert_qpel_avx2(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 ht, + WORD32 wd, + UWORD8* pu1_tmp, + WORD32 dydx) +{ + WORD32 ht_temp; + UWORD8 *pu1_pred_vert,*pu1_pred_horiz; + UWORD8 *pu1_tmp1, *pu1_tmp2; + WORD32 x_offset, y_offset; + + __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b; + __m128i const_val16_8x16b; + __m256i coeff0_1_32x8b, coeff2_3_32x8b, coeff4_5_32x8b,coeff_32x8b; + __m256i const_val16_8x32b; + __m256i zero = _mm256_setzero_si256(); + pu1_tmp1 = pu1_tmp; + + dydx &= 0xf; + ht_temp = ht; + x_offset = dydx & 0x3; + y_offset = dydx >> 2; + pu1_tmp2 = pu1_tmp1; + + pu1_pred_vert = pu1_src + (x_offset >> 1) - 2*src_strd; + pu1_pred_horiz = pu1_src + (y_offset >> 1) * src_strd - 2; + //the filter input starts from x[-2] (till x[3]) + + coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 + coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 + coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 + //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20 + const_val16_8x16b = _mm_set1_epi16(16); + coeff_32x8b = _mm256_set_m128i(coeff2_3_16x8b,coeff0_1_16x8b); + coeff0_1_32x8b = _mm256_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 + coeff2_3_32x8b = _mm256_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 + coeff4_5_32x8b = _mm256_set1_epi32(0x01FB01FB); //c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 + //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20 + const_val16_8x32b = _mm256_set1_epi16(16); + + if(wd == 4) + { + //vertical q-pel filter + { + __m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b, src_r4_16x8b; + __m128i src_r5_16x8b, src_r6_16x8b; + __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b; + + __m128i res_r0r1_16x8b, res_t1_8x16b, res_t2_8x16b, res_t3_8x16b; + + //epilogue: Load all the pred rows except sixth and seventh row for the + //first and second row processing. + src_r0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert)); + pu1_pred_vert = pu1_pred_vert + src_strd; + + src_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert)); + pu1_pred_vert = pu1_pred_vert + src_strd; + src_r0_16x8b = _mm_unpacklo_epi32(src_r0_16x8b, src_r1_16x8b); + + src_r2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert)); + pu1_pred_vert = pu1_pred_vert + src_strd; + src_r1_16x8b = _mm_unpacklo_epi32(src_r1_16x8b, src_r2_16x8b); + + src_r3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert)); + pu1_pred_vert = pu1_pred_vert + src_strd; + src_r2_16x8b = _mm_unpacklo_epi32(src_r2_16x8b, src_r3_16x8b); + + src_r4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert)); + pu1_pred_vert = pu1_pred_vert + src_strd; + src_r3_16x8b = _mm_unpacklo_epi32(src_r3_16x8b, src_r4_16x8b); + + //Core Loop: Process all the rows. + do + { + src_r5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert)); + src_r4_16x8b = _mm_unpacklo_epi32(src_r4_16x8b, src_r5_16x8b); + + src_r6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert + src_strd)); + src_r5_16x8b = _mm_unpacklo_epi32(src_r5_16x8b, src_r6_16x8b); + + src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b); + src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b); + src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b); + + res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b); + res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b); + res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b); + + res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b); + res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b); + res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b); + + res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits. + res_r0r1_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b); + + _mm_storel_epi64((__m128i *)pu1_tmp1, res_r0r1_16x8b); + + src_r0_16x8b = src_r2_16x8b; + src_r1_16x8b = src_r3_16x8b; + src_r2_16x8b = src_r4_16x8b; + src_r3_16x8b = src_r5_16x8b; + src_r4_16x8b = src_r6_16x8b; + + ht_temp -= 2; + pu1_pred_vert += src_strd << 1; + pu1_tmp1 += 8; + } + while(ht_temp > 0); + } + + //horizontal q-pel filter + { + __m128i res_r0r1_16x8b_128, src_r0r1_vpel_16x8b,res_16x8b; + __m256i src_r0r1_sht_32x8b, src_r0r1_32x8b,src_r0r1_t1_32x8b; + + __m256i res_r0r1_t1_8x32b, res_r0r1_t2_8x32b, res_r0r1_t3_8x32b; + __m256i res_r0r1_32x8b; + + //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9..... + //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9..... + + do + { + src_r0r1_vpel_16x8b = _mm_loadl_epi64((__m128i *)pu1_tmp2); + src_r0r1_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_pred_horiz + src_strd), (__m128i *)pu1_pred_horiz); + + src_r0r1_sht_32x8b = _mm256_srli_si256(src_r0r1_32x8b, 1); + src_r0r1_32x8b = _mm256_unpacklo_epi8(src_r0r1_32x8b, src_r0r1_sht_32x8b); + + src_r0r1_t1_32x8b = _mm256_unpacklo_epi64(src_r0r1_32x8b, zero); //a0 a1 a1 a2 a2 a3 a3 a4 b0 b1 b1 b2 b2 b3 b3 b4 + res_r0r1_t1_8x32b = _mm256_maddubs_epi16(src_r0r1_t1_32x8b, coeff0_1_32x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1 + //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1 + + src_r0r1_32x8b = _mm256_srli_si256(src_r0r1_32x8b, 4); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0 + //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 0 0 0 0 + + src_r0r1_t1_32x8b = _mm256_unpacklo_epi64(src_r0r1_32x8b, zero); //a2 a3 a3 a4 a4 a5 a5 a6 b2 b3 b3 b4 b4 b5 b5 b6 + res_r0r1_t2_8x32b = _mm256_maddubs_epi16(src_r0r1_t1_32x8b, coeff2_3_32x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3 + //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3 + + src_r0r1_32x8b = _mm256_srli_si256(src_r0r1_32x8b, 4); //a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0 0 0 0 0 + //b4 b5 b5 b6 b6 b7 b7 b8 0 0 0 0 0 0 0 0 + + src_r0r1_t1_32x8b = _mm256_unpacklo_epi64(src_r0r1_32x8b, zero); //a4 a5 a5 a6 a6 a7 a7 a8 b4 b5 b5 b6 b6 b7 b7 b8 + res_r0r1_t3_8x32b = _mm256_maddubs_epi16(src_r0r1_t1_32x8b, coeff4_5_32x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5 + //b4*c4+b5*c5 b5*c4+b6*c5 b4*c6+b7*c5 b7*c4+b8*c5 + + res_r0r1_t1_8x32b = _mm256_add_epi16(res_r0r1_t1_8x32b, res_r0r1_t2_8x32b); + res_r0r1_t3_8x32b = _mm256_add_epi16(const_val16_8x32b, res_r0r1_t3_8x32b); + res_r0r1_t1_8x32b = _mm256_add_epi16(res_r0r1_t1_8x32b, res_r0r1_t3_8x32b); //a0*c0+a1*c1+a2*c2+a3*c3+a4*a4+a5*c5 + 15; + //a1*c0+a2*c1+a2*c2+a3*c3+a5*a4+a6*c5 + 15; + //a2*c0+a3*c1+a4*c2+a5*c3+a6*a4+a7*c5 + 15; + //a3*c0+a4*c1+a5*c2+a6*c3+a6*a4+a8*c5 + 15; + //b0*c0+b1*c1+b2*c2+b3*c3+b4*b4+b5*c5 + 15; + //b1*c0+b2*c1+b2*c2+b3*c3+b5*b4+b6*c5 + 15; + //b2*c0+b3*c1+b4*c2+b5*c3+b6*b4+b7*c5 + 15; + //b3*c0+b4*c1+b5*c2+b6*c3+b6*b4+b8*c5 + 15; + res_r0r1_t1_8x32b = _mm256_srai_epi16(res_r0r1_t1_8x32b, 5); //shifting right by 5 bits. + + res_r0r1_16x8b_128 = _mm256_castsi256_si128(_mm256_packus_epi16(res_r0r1_t1_8x32b, + res_r0r1_t1_8x32b)); + + res_16x8b = _mm_avg_epu8(res_r0r1_16x8b_128,src_r0r1_vpel_16x8b); + + *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(res_16x8b); + res_16x8b = _mm_srli_si128(res_16x8b, 4); + *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(res_16x8b); + + ht -= 2; + pu1_pred_horiz += src_strd << 1; + pu1_tmp2 += 8; + pu1_dst += dst_strd << 1; + } + while(ht > 0); + } + } + else if(wd == 8) + { + //vertical q-pel filter + { + __m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b; + __m128i src_r4_16x8b, src_r5_16x8b, src_r6_16x8b; + __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b; + + __m128i res_16x8b, res_t1_8x16b, res_t2_8x16b, res_t3_8x16b; + + //epilogue: Load all the pred rows except sixth and seventh row for the + //first and second row processing. + src_r0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert)); + pu1_pred_vert = pu1_pred_vert + src_strd; + + src_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert)); + pu1_pred_vert = pu1_pred_vert + src_strd; + src_r0_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b); + + src_r2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert)); + pu1_pred_vert = pu1_pred_vert + src_strd; + src_r1_16x8b = _mm_unpacklo_epi64(src_r1_16x8b, src_r2_16x8b); + + src_r3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert)); + pu1_pred_vert = pu1_pred_vert + src_strd; + src_r2_16x8b = _mm_unpacklo_epi64(src_r2_16x8b, src_r3_16x8b); + + src_r4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert)); + pu1_pred_vert = pu1_pred_vert + src_strd; + src_r3_16x8b = _mm_unpacklo_epi64(src_r3_16x8b, src_r4_16x8b); + + //Core Loop: Process all the rows. + do + { + src_r5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert)); + src_r4_16x8b = _mm_unpacklo_epi64(src_r4_16x8b, src_r5_16x8b); + + src_r6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert + src_strd)); + src_r5_16x8b = _mm_unpacklo_epi64(src_r5_16x8b, src_r6_16x8b); + + src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b); + src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b); + src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b); + + res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b); + res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b); + res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b); + + res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b); + res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b); + res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b); + + res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits. + res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b); + + _mm_storel_epi64((__m128i *)(pu1_tmp1), res_16x8b); + + src_r0r1_16x8b = _mm_unpackhi_epi8(src_r0_16x8b, src_r1_16x8b); + src_r2r3_16x8b = _mm_unpackhi_epi8(src_r2_16x8b, src_r3_16x8b); + src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b); + + res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b); + res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b); + res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b); + + res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b); + res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b); + res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b); + + res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits. + res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b); + + _mm_storel_epi64((__m128i *)(pu1_tmp1 + 8), res_16x8b); + + src_r0_16x8b = src_r2_16x8b; + src_r1_16x8b = src_r3_16x8b; + src_r2_16x8b = src_r4_16x8b; + src_r3_16x8b = src_r5_16x8b; + src_r4_16x8b = src_r6_16x8b; + + ht_temp -= 2; + pu1_pred_vert += src_strd << 1; + pu1_tmp1 += 16; + } + while(ht_temp > 0); + } + + //horizontal q-pel filter + { + + __m256i src_r0r1_32x8b, src_r0r1_sht_32x8b,src_r0r1_t1_32x8b,res_32x8b; + __m128i src_r0r1_vpel_128,res_16x8b_128; + __m256i src_r0r1_vpel_32x8b,res_r0r1_t1_8x32b, res_r0r1_t2_8x32b, res_r0r1_t3_8x32b; + + //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9..... + //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9..... + + do + { + src_r0r1_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_pred_horiz + src_strd), //a2 a3 a4 a5 a6 a7 a8....a15 0 or + (__m128i *)pu1_pred_horiz); //a3 a4 a5 a6 a7 a8 a9....a15 0 + //b2 b3 b4 b5 b6 b7 b8....b15 0 or + //b3 b4 b5 b6 b7 b8 b9....b15 0 + + src_r0r1_vpel_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_tmp2 + 8), + (__m128i *)(pu1_tmp2)); + src_r0r1_vpel_32x8b = _mm256_unpacklo_epi64(src_r0r1_vpel_32x8b,zero); + + src_r0r1_sht_32x8b = _mm256_srli_si256(src_r0r1_32x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0 + //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0 + + src_r0r1_t1_32x8b = _mm256_unpacklo_epi8(src_r0r1_32x8b, src_r0r1_sht_32x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 + + res_r0r1_t1_8x32b = _mm256_maddubs_epi16(src_r0r1_t1_32x8b, coeff0_1_32x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1 + //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1 + + //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1 + + src_r0r1_32x8b = _mm256_srli_si256(src_r0r1_32x8b, 2); //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0 + //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0 + + src_r0r1_sht_32x8b = _mm256_srli_si256(src_r0r1_sht_32x8b, 2); //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0 + + src_r0r1_t1_32x8b = _mm256_unpacklo_epi8(src_r0r1_32x8b, src_r0r1_sht_32x8b); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 + //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 a8 a9 a9 a10 + + res_r0r1_t2_8x32b = _mm256_maddubs_epi16(src_r0r1_t1_32x8b, coeff2_3_32x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3 + //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3 + //b2*c2+b3*c3 b3*c2+b4*c3 b2*c4+b5*c3 b5*c2+b6*c3 + //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3 + + src_r0r1_32x8b = _mm256_srli_si256(src_r0r1_32x8b, 2); //a4 a5 a6 a7 a8 a9....a15 0 0 0 0 + + src_r0r1_sht_32x8b = _mm256_srli_si256(src_r0r1_sht_32x8b, 2); //a5 a6 a7 a8 a9....a15 0 0 0 0 0 + //b5 b6 b7 b8 b9....b15 0 0 0 0 0 + + src_r0r1_t1_32x8b = _mm256_unpacklo_epi8(src_r0r1_32x8b, src_r0r1_sht_32x8b); //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12 + //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12 + + res_r0r1_t3_8x32b = _mm256_maddubs_epi16(src_r0r1_t1_32x8b, coeff4_5_32x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5 + //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5 + //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5 + + res_r0r1_t1_8x32b = _mm256_add_epi16(res_r0r1_t1_8x32b, res_r0r1_t2_8x32b); + res_r0r1_t3_8x32b = _mm256_add_epi16(const_val16_8x32b, res_r0r1_t3_8x32b); + res_r0r1_t1_8x32b = _mm256_add_epi16(res_r0r1_t1_8x32b, res_r0r1_t3_8x32b); + res_r0r1_t1_8x32b = _mm256_srai_epi16(res_r0r1_t1_8x32b, 5); //shifting right by 5 bits. + + res_32x8b = _mm256_packus_epi16(res_r0r1_t1_8x32b, res_r0r1_t1_8x32b); + + res_32x8b = _mm256_avg_epu8(res_32x8b,src_r0r1_vpel_32x8b); + + _mm_storel_epi64((__m128i *)(pu1_dst), _mm256_castsi256_si128(res_32x8b)); + _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), _mm256_castsi256_si128(_mm256_permute2x128_si256(res_32x8b,res_32x8b,0x1))); + + ht -= 2; + pu1_pred_horiz += src_strd << 1; + pu1_dst += dst_strd << 1; + pu1_tmp2 += 16; + } + while(ht > 0); + } + } + else // wd == 16 + { + //vertical q-pel filter + { + __m256i src_r0r2_32x8b, src_r1r3_32x8b, src_r2r4_32x8b,src_32x8b; + __m128i src_r2_16x8b,src_r4_16x8b,src_r5_16x8b,src_r6_16x8b,src_r4r5_16x8b; + + __m256i res_t1_8x32b; + __m128i res_t0_8x16b,res_t3_8x16b,res_t1_8x16b,res_16x8b; + + //epilogue: Load all the pred rows except sixth and seventh row for the + //first and second row processing. + src_r0r2_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_pred_vert + 2 * src_strd ), + (__m128i *)(pu1_pred_vert)); + + src_r2_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert + 2 * src_strd)); + src_r1r3_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_pred_vert + 3 * src_strd ), + (__m128i *)(pu1_pred_vert + src_strd)); + src_r4_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert + 4 * src_strd)); + pu1_pred_vert = pu1_pred_vert + (5 * src_strd ) ; + + + //Core Loop: Process all the rows. + do + { + src_r5_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert)); + src_r6_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert + src_strd)); + src_r2r4_32x8b = _mm256_set_m128i(src_r4_16x8b,src_r2_16x8b); + + src_32x8b = _mm256_unpacklo_epi8(src_r0r2_32x8b, src_r1r3_32x8b); + src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b); + + res_t1_8x32b = _mm256_maddubs_epi16(src_32x8b, coeff_32x8b); + res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b); + + res_t1_8x16b = _mm_add_epi16(_mm256_castsi256_si128(res_t1_8x32b), _mm256_extracti128_si256 (res_t1_8x32b,0x1)); + res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b); + res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b); + res_t0_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits + + + src_32x8b = _mm256_unpackhi_epi8(src_r0r2_32x8b, src_r1r3_32x8b); + src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b); + + res_t1_8x32b = _mm256_maddubs_epi16(src_32x8b, coeff_32x8b); + res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b); + + res_t1_8x16b = _mm_add_epi16(_mm256_castsi256_si128(res_t1_8x32b),_mm256_extracti128_si256 (res_t1_8x32b,0x1)); + res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b); + res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b); + res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits. + + res_16x8b = _mm_packus_epi16(res_t0_8x16b, res_t1_8x16b); + _mm_storeu_si128((__m128i *)(pu1_tmp1), res_16x8b); + + src_32x8b = _mm256_unpacklo_epi8(src_r1r3_32x8b, src_r2r4_32x8b); + src_r4r5_16x8b = _mm_unpacklo_epi8(src_r5_16x8b, src_r6_16x8b); + + res_t1_8x32b = _mm256_maddubs_epi16(src_32x8b, coeff_32x8b); + res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b); + + res_t1_8x16b = _mm_add_epi16(_mm256_castsi256_si128(res_t1_8x32b), _mm256_extracti128_si256 (res_t1_8x32b,0x1)); + res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b); + res_t0_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits + + + src_32x8b = _mm256_unpackhi_epi8(src_r1r3_32x8b, src_r2r4_32x8b); + src_r4r5_16x8b = _mm_unpackhi_epi8(src_r5_16x8b, src_r6_16x8b); + + res_t1_8x32b = _mm256_maddubs_epi16(src_32x8b, coeff_32x8b); + res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b); + + res_t1_8x16b = _mm_add_epi16(_mm256_castsi256_si128(res_t1_8x32b), _mm256_extracti128_si256 (res_t1_8x32b,0x1)); + res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b); + res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b); + res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits + + + res_16x8b = _mm_packus_epi16(res_t0_8x16b, res_t1_8x16b); + _mm_storeu_si128((__m128i *)(pu1_tmp1+16), res_16x8b); + src_r0r2_32x8b = src_r2r4_32x8b; + src_r1r3_32x8b = _mm256_set_m128i(src_r5_16x8b,_mm256_extracti128_si256(src_r1r3_32x8b,0x1)); + src_r2_16x8b = src_r4_16x8b; + src_r4_16x8b = src_r6_16x8b; + + ht_temp -= 2; + pu1_pred_vert += src_strd << 1; + pu1_tmp1 += 32; + } + while(ht_temp > 0); + } + //horizontal q-pel filter + { + + __m256i src_r0r1_32x8b,src_r0r1_sht_32x8b,src_r0r1_t1_32x8b,res_32x8b; + __m128i src_vpel_16x8b,res_16x8b_128; + + __m256i res_r0r1_t1_8x32b, res_r0r1_t2_8x32b, res_r0r1_t3_8x32b; + + //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9..... + //Row0 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9..... + //b0 is same a8. Similarly other bn pixels are same as a(n+8) pixels. + + do + { + src_r0r1_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_pred_horiz + 8), + (__m128i *)(pu1_pred_horiz)); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 + //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 + src_vpel_16x8b = _mm_loadu_si128((__m128i *)(pu1_tmp2)); + + src_r0r1_sht_32x8b = _mm256_srli_si256(src_r0r1_32x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0 + //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0 + + src_r0r1_t1_32x8b = _mm256_unpacklo_epi8(src_r0r1_32x8b, src_r0r1_sht_32x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 + //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 + + res_r0r1_t1_8x32b = _mm256_maddubs_epi16(src_r0r1_t1_32x8b, coeff0_1_32x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1 + //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1 + //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1 + //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1 + + src_r0r1_32x8b = _mm256_srli_si256(src_r0r1_32x8b, 2); //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0 + //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0 + + src_r0r1_sht_32x8b = _mm256_srli_si256(src_r0r1_sht_32x8b, 2); //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0 + //b3 b4 b5 b6 b7 b8 b9....b15 0 0 0 + + src_r0r1_t1_32x8b = _mm256_unpacklo_epi8(src_r0r1_32x8b, src_r0r1_sht_32x8b); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 + //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 a8 a9 a9 a10 + + res_r0r1_t2_8x32b = _mm256_maddubs_epi16(src_r0r1_t1_32x8b, coeff2_3_32x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3 + //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3 + //b2*c2+b3*c3 b3*c2+b4*c3 b2*c4+b5*c3 b5*c2+b6*c3 + //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3 + + src_r0r1_32x8b = _mm256_srli_si256(src_r0r1_32x8b, 2); //a4 a5 a6 a7 a8 a9....a15 0 0 0 0 + //b4 b5 b6 b7 b8 b9....b15 0 0 0 0 + + src_r0r1_sht_32x8b = _mm256_srli_si256(src_r0r1_sht_32x8b, 2); //a5 a6 a7 a8 a9....a15 0 0 0 0 0 + //b5 b6 b7 b8 b9....b15 0 0 0 0 0 + + src_r0r1_t1_32x8b = _mm256_unpacklo_epi8(src_r0r1_32x8b, src_r0r1_sht_32x8b); //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12 + //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12 + + res_r0r1_t3_8x32b = _mm256_maddubs_epi16(src_r0r1_t1_32x8b, coeff4_5_32x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5 + //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5 + //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5 + //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5 + res_r0r1_t1_8x32b = _mm256_add_epi16(res_r0r1_t1_8x32b, res_r0r1_t2_8x32b); + res_r0r1_t3_8x32b = _mm256_add_epi16(const_val16_8x32b, res_r0r1_t3_8x32b); + res_r0r1_t1_8x32b = _mm256_add_epi16(res_r0r1_t1_8x32b, res_r0r1_t3_8x32b); + res_r0r1_t1_8x32b = _mm256_srai_epi16(res_r0r1_t1_8x32b, 5); //shifting right by 5 bits. + + + res_32x8b = _mm256_packus_epi16(res_r0r1_t1_8x32b, res_r0r1_t1_8x32b); + res_16x8b_128 = _mm256_castsi256_si128(_mm256_permute4x64_epi64(res_32x8b, 0xD8)); + res_16x8b_128 = _mm_avg_epu8(res_16x8b_128, src_vpel_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst), res_16x8b_128); + + ht --; + pu1_pred_horiz += src_strd; + pu1_dst += dst_strd; + pu1_tmp2 += 16; + } + while(ht > 0); + } + } +} diff --git a/common/x86/ih264_iquant_itrans_recon_avx2.c b/common/x86/ih264_iquant_itrans_recon_avx2.c new file mode 100644 index 0000000..ad06901 --- /dev/null +++ b/common/x86/ih264_iquant_itrans_recon_avx2.c @@ -0,0 +1,303 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** + ******************************************************************************* + * @file + * ih264_iquant_itrans_recon_avx2.c + * + * @brief + * Contains function definitions for inverse quantization, inverse + * transform and reconstruction + * + * @author + * Priyanka Bose + * + * @par List of Functions: + * - ih264_iquant_itrans_recon_4x4_avx2() + * + * @remarks + * None + * + ******************************************************************************* + */ + + +#include + +/* User include files */ +#include "ih264_typedefs.h" +#include "ih264_defs.h" +#include "ih264_trans_macros.h" +#include "ih264_macros.h" +#include "ih264_platform_macros.h" +#include "ih264_trans_data.h" +#include "ih264_size_defs.h" +#include "ih264_structs.h" +#include "ih264_trans_quant_itrans_iquant.h" +#include + + +/* + ******************************************************************************** + * + * @brief This function reconstructs a 4x4 sub block from quantized resiude and + * prediction buffer + * + * @par Description: + * The quantized residue is first inverse quantized, then inverse transformed. + * This inverse transformed content is added to the prediction buffer to recon- + * struct the end output + * + * @param[in] pi2_src + * quantized 4x4 block + * + * @param[in] pu1_pred + * prediction 4x4 block + * + * @param[out] pu1_out + * reconstructed 4x4 block + * + * @param[in] src_strd + * quantization buffer stride + * + * @param[in] pred_strd, + * Prediction buffer stride + * + * @param[in] out_strd + * recon buffer Stride + * + * @param[in] pu2_scaling_list + * pointer to scaling list + * + * @param[in] pu2_norm_adjust + * pointer to inverse scale matrix + * + * @param[in] u4_qp_div_6 + * Floor (qp/6) + * + * @param[in] pi4_tmp + * temporary buffer of size 1*16 + * + * @returns none + * + * @remarks none + * + ******************************************************************************* + */ +void ih264_iquant_itrans_recon_4x4_avx2(WORD16 *pi2_src, + UWORD8 *pu1_pred, + UWORD8 *pu1_out, + WORD32 pred_strd, + WORD32 out_strd, + const UWORD16 *pu2_iscal_mat, + const UWORD16 *pu2_weigh_mat, + UWORD32 u4_qp_div_6, + WORD16 *pi2_tmp, + WORD32 iq_start_idx, + WORD16 *pi2_dc_ld_addr) + { + + UWORD32 *pu4_out = (UWORD32 *) pu1_out; + __m256i src_r0_r1, src_r2_r3; + __m256i src_r0, src_r1, src_r2, src_r3; + __m256i scalemat_r0_r1, scalemat_r2_r3; + __m128i pred_r0, pred_r1, pred_r2, pred_r3; + __m256i sign_reg, dequant_r0_r1, dequant_r2_r3; + __m256i zero_8x32b = _mm256_setzero_si256(); // all bits reset to zero + __m256i temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; + __m256i resq_r0, resq_r1, resq_r2, resq_r3; + __m256i add_rshift = _mm256_set1_epi32((u4_qp_div_6 < 4) ? (1 << (3 - u4_qp_div_6)) : 0); + __m256i value_32 = _mm256_set1_epi32(32); + __m128i value_32_128 = _mm_set1_epi32(32); + __m128i r0,r1,r2,r3,t0,t1,t2,t3,t4,t5,t6,t7,sign_reg_128, de_r0_r1,de_r2_r3,r0_r1,r2_r3,scale_r0_r1,scale_r2_r3; + __m128i zero_8x16b_128 = _mm_setzero_si128(); + UNUSED (pi2_tmp); + + /*************************************************************/ + /* Dequantization of coefficients. Will be replaced by SIMD */ + /* operations on platform */ + /*************************************************************/ + src_r0_r1 = _mm256_loadu_si256((__m256i *) (pi2_src)); //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row + scalemat_r0_r1 = _mm256_loadu_si256((__m256i *) (pu2_iscal_mat)); //b00 b01 b02 b03 b10 b11 b12 b13 -- the scaling matrix 0th,1st row + dequant_r0_r1 = _mm256_loadu_si256((__m256i *) (pu2_weigh_mat)); //q00 q01 q02 q03 q10 q11 q12 q13 -- all 16 bits + + temp0 = _mm256_mullo_epi16(scalemat_r0_r1, dequant_r0_r1); + temp4 = _mm256_unpacklo_epi16(temp0, zero_8x32b); // b00*q00 0 b01*q01 0 b02*q02 0 b03*q03 0 -- 16 bit long + temp5 = _mm256_unpackhi_epi16(temp0, zero_8x32b); // b10*q10 0 b11*q11 0 b12*q12 0 b13*q13 0 -- 16 bit long + + src_r0 = _mm256_unpacklo_epi16(src_r0_r1, zero_8x32b); // a00 0 a01 0 a02 0 a03 0 -- 16 bit long + src_r1 = _mm256_unpackhi_epi16(src_r0_r1, zero_8x32b); // a10 0 a11 0 a12 0 a13 0 -- 16 bit long + + temp0 = _mm256_madd_epi16(src_r0, temp4); //a00*b00*q00 a10*b10*q10 a20*b20*q20 a30*b30 q30 -- 32 bits long + temp1 = _mm256_madd_epi16(src_r1, temp5); + + if (u4_qp_div_6 >= 4) { + resq_r0 = _mm256_slli_epi32(temp0, u4_qp_div_6 - 4); + resq_r1 = _mm256_slli_epi32(temp1, u4_qp_div_6 - 4); + } else { + temp4 = _mm256_add_epi32(temp0, add_rshift); + temp5 = _mm256_add_epi32(temp1, add_rshift); + resq_r0 = _mm256_srai_epi32(temp0, 4 - u4_qp_div_6); + resq_r1 = _mm256_srai_epi32(temp1, 4 - u4_qp_div_6); + } + + if (iq_start_idx == 1) + resq_r0 = _mm256_insert_epi32(resq_r0,(WORD32)pi2_dc_ld_addr[0],0); + /* Perform Inverse transform */ + + /* Perform Inverse transform */ + /*-------------------------------------------------------------*/ + /* IDCT [ Horizontal transformation ] */ + /*-------------------------------------------------------------*/ + // Matrix transpose + /* + * a0 a1 a2 a3 + * b0 b1 b2 b3 + * c0 c1 c2 c3 + * d0 d1 d2 d3 + */ + + temp0 = _mm256_unpacklo_epi32(resq_r0, resq_r1); //a0 c0 a1 c1 b0 d0 b1 d1 + temp1 = _mm256_unpackhi_epi32(resq_r0, resq_r1); //a2 c2 + + resq_r0 = _mm256_permute2f128_si256(temp0, temp1, 0x20); + resq_r1 = _mm256_permute2f128_si256(temp0, temp1, 0x31); + + temp0 = _mm256_unpacklo_epi64(resq_r0, resq_r1); // w0 w2 + temp1 = _mm256_unpackhi_epi64(resq_r0, resq_r1); // w1 w3 + + resq_r0 = _mm256_permute2f128_si256(temp0, temp1, 0x20); + resq_r1 = _mm256_permute2f128_si256(temp0, temp1, 0x31); + + r0 = _mm256_extracti128_si256(resq_r0, 0x0); + r1 = _mm256_extracti128_si256(resq_r0, 0x1); + r2 = _mm256_extracti128_si256(resq_r1, 0x0); + r3 = _mm256_extracti128_si256(resq_r1, 0x1); + + //Transform starts -- horizontal transform + /*------------------------------------------------------------------*/ + /* z0 = w0 + w2 */ + t0 = _mm_add_epi32(r0, r2); + /* z1 = w0 - w2 */ + t1 = _mm_sub_epi32(r0, r2); + /* z2 = (w1 >> 1) - w3 */ + t2 = _mm_srai_epi32(r1, 1); //(w1>>1) + t2 = _mm_sub_epi32(t2, r3); //(w1>>1) - w3 + /* z3 = w1 + (w3 >> 1) */ + t3 = _mm_srai_epi32(r3, 1); //(w3>>1) + w1 + t3 = _mm_add_epi32(t3, r1); + /*----------------------------------------------------------*/ + /* x0 = z0 + z3 */ + r0 = _mm_add_epi32(t0, t3); + /* x1 = z1 + z2 */ + r1 = _mm_add_epi32(t1, t2); + /* x2 = z1 - z2 */ + r2 = _mm_sub_epi32(t1, t2); + /* x3 = z0 - z3 */ + r3 = _mm_sub_epi32(t0, t3); + + t1 = _mm_unpacklo_epi32(r0, r1); //a0 a1 b0 b1 + t3 = _mm_unpacklo_epi32(r2, r3); //a2 a3 b2 b3 + t2 = _mm_unpackhi_epi32(r0, r1); //c0 c1 d0 d1 + t4 = _mm_unpackhi_epi32(r2, r3); //c2 c3 d2 d3 + r0 = _mm_unpacklo_epi64(t1, t3); //a0 a1 a2 a3 + r1 = _mm_unpackhi_epi64(t1, t3); //b0 b1 b2 b3 + r2 = _mm_unpacklo_epi64(t2, t4); //c0 c1 c2 c3 + r3 = _mm_unpackhi_epi64(t2, t4); //d0 d1 d2 d3 + + //Transform ends -- horizontal transform + + //Load pred buffer + pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0])); //p00 p01 p02 p03 0 0 0 0 0 0 0 0 -- all 8 bits + pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[pred_strd])); //p10 p11 p12 p13 0 0 0 0 0 0 0 0 -- all 8 bits + pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * pred_strd])); //p20 p21 p22 p23 0 0 0 0 0 0 0 0 -- all 8 bits + pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * pred_strd])); //p30 p31 p32 p33 0 0 0 0 0 0 0 0 -- all 8 bits + + pred_r0 = _mm_cvtepu8_epi32(pred_r0); //p00 p01 p02 p03 -- all 32 bits + pred_r1 = _mm_cvtepu8_epi32(pred_r1); //p10 p11 p12 p13 -- all 32 bits ///Need to look + pred_r2 = _mm_cvtepu8_epi32(pred_r2); //p20 p21 p22 p23 -- all 32 bits + pred_r3 = _mm_cvtepu8_epi32(pred_r3); //p30 p31 p32 p33 -- all 32 bits + + /*--------------------------------------------------------------*/ + /* IDCT [ Vertical transformation] and Xij = (xij + 32)>>6 */ + /* */ + /* Add the prediction and store it back to same buffer */ + /*--------------------------------------------------------------*/ + + + t0 = _mm_add_epi32(r0, r2); + /* z1j = y0j - y2j */ + t1 = _mm_sub_epi32(r0, r2); + /* z2j = (y1j>>1) - y3j */ + t2 = _mm_srai_epi32(r1, 1); //(y1j>>1) + t2 = _mm_sub_epi32(t2, r3); + /* z3j = y1j + (y3j>>1) */ + t3 = _mm_srai_epi32(r3, 1); //(y3j>>1) + t3 = _mm_add_epi32(r1, t3); + + + t4 = _mm_add_epi32(t0, t3); + t4 = _mm_add_epi32(t4, value_32_128); + t4 = _mm_srai_epi32(t4, 6); + t4 = _mm_add_epi32(t4, pred_r0); + + t5 = _mm_add_epi32(t1, t2); + t5 = _mm_add_epi32(t5, value_32_128); + t5 = _mm_srai_epi32(t5, 6); + t5 = _mm_add_epi32(t5, pred_r1); + + t6 = _mm_sub_epi32(t1, t2); + t6 = _mm_add_epi32(t6, value_32_128); + t6 = _mm_srai_epi32(t6, 6); + t6 = _mm_add_epi32(t6, pred_r2); + + t7 = _mm_sub_epi32(t0, t3); + t7 = _mm_add_epi32(t7, value_32_128); + t7 = _mm_srai_epi32(t7, 6); + t7 = _mm_add_epi32(t7, pred_r3); + + + // 32-bit to 16-bit conversion + t0 = _mm_packs_epi32(t4, t5); + t1 = _mm_packs_epi32(t6, t7); + + + //Clipping the results to 8 bits + sign_reg_128 = _mm_cmpgt_epi16(t0, zero_8x16b_128); // sign check + t0 = _mm_and_si128(t0, sign_reg_128); + sign_reg_128 = _mm_cmpgt_epi16(t1, zero_8x16b_128); + t1 = _mm_and_si128(t1, sign_reg_128); + + r0 = _mm_packus_epi16(t0, t1); + r1 = _mm_srli_si128(r0, 4); + r2 = _mm_srli_si128(r1, 4); + r3 = _mm_srli_si128(r2, 4); + + *pu4_out = _mm_cvtsi128_si32(r0); + pu1_out += out_strd; + pu4_out = (UWORD32 *) (pu1_out); + *(pu4_out) = _mm_cvtsi128_si32(r1); + pu1_out += out_strd; + pu4_out = (UWORD32 *) (pu1_out); + *(pu4_out) = _mm_cvtsi128_si32(r2); + pu1_out += out_strd; + pu4_out = (UWORD32 *) (pu1_out); + *(pu4_out) = _mm_cvtsi128_si32(r3); +} diff --git a/common/x86/ih264_weighted_pred_avx2.c b/common/x86/ih264_weighted_pred_avx2.c new file mode 100644 index 0000000..79ba2b1 --- /dev/null +++ b/common/x86/ih264_weighted_pred_avx2.c @@ -0,0 +1,501 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/*****************************************************************************/ +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +#include +#include "ih264_typedefs.h" +#include "ih264_macros.h" +#include "ih264_platform_macros.h" +#include "ih264_weighted_pred.h" +#include +#include + +#include + + +/*****************************************************************************/ +/* */ +/* Function Name : ih264_weighted_bi_pred_luma_avx2 */ +/* */ +/* Description : This function performs the weighted biprediction as */ +/* described in sec 8.4.2.3.2 titled "Weighted sample */ +/* prediction process" for luma. The function gets two */ +/* ht x wd blocks, weights them, adds them, rounds off the */ +/* sum, offsets it, saturates it to unsigned 8-bit and */ +/* stores it in the destination block. (ht,wd) can be */ +/* (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16). */ +/* */ +/* Inputs : pu1_src1 - Pointer to source 1 */ +/* pu1_src2 - Pointer to source 2 */ +/* pu1_dst - Pointer to destination */ +/* src_strd1 - stride for source 1 */ +/* src_strd2 - stride for source 2 */ +/* dst_strd2 - stride for destination */ +/* log_wd - number of bits to be rounded off */ +/* wt1 - weight value for source 1 */ +/* wt2 - weight value for source 2 */ +/* ofst1 - offset value for source 1 */ +/* ofst2 - offset value for source 2 */ +/* ht - height of the block */ +/* wd - width of the block */ +/* */ +/* Issues : None */ +/* */ +/* Revision History: */ +/* */ +/* DD MM YYYY Author(s) Changes */ +/* 04 02 2015 Kaushik Initial Version */ +/* Senthoor */ +/* 15 09 2020 Priyanka Bose AVX2 Intel Intrinsics Support */ +/*****************************************************************************/ +void ih264_weighted_bi_pred_luma_avx2(UWORD8 *pu1_src1, + UWORD8 *pu1_src2, + UWORD8 *pu1_dst, + WORD32 src_strd1, + WORD32 src_strd2, + WORD32 dst_strd, + WORD32 log_wd, + WORD32 wt1, + WORD32 wt2, + WORD32 ofst1, + WORD32 ofst2, + WORD32 ht, + WORD32 wd) +{ + + __m256i wt1_8x32b, wt2_8x32b; + __m256i ofst_8x32b, round_8x32b; + __m256i zero; + zero = _mm256_set1_epi8(0); + + WORD32 ofst; + WORD32 round_val, shft; + + wt1 = (WORD16)(wt1 & 0xffff); + wt2 = (WORD16)(wt2 & 0xffff); + round_val = 1 << log_wd; + shft = log_wd + 1; + ofst1 = (WORD8)(ofst1 & 0xff); + ofst2 = (WORD8)(ofst2 & 0xff); + ofst = (ofst1 + ofst2 + 1) >> 1; + + wt1_8x32b = _mm256_set1_epi16(wt1); + wt2_8x32b = _mm256_set1_epi16(wt2); + round_8x32b = _mm256_set1_epi16(round_val); + ofst_8x32b = _mm256_set1_epi16(ofst); + + + if(wd == 4) + { + __m128i y1_2_16x8b, y1_3_16x8b; + __m128i y2_2_16x8b, y2_3_16x8b; + + __m256i y1_02_32x8b,y1_13_32x8b,y2_02_32x8b,y2_13_32x8b,y1_0_32x8b,y2_0_32x8b,y1_0_8x32b,y2_1_8x32b,y2_0_8x32b; + __m128i y1_0_16x8b_128,y2_0_16x8b_128,y1_1_16x8b_128,y1_2_16x8b_128,y1_3_16x8b_128; + + do + { + y1_02_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src1 + (src_strd1 << 1)), (__m128i *)(pu1_src1)); + y1_13_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src1 + src_strd1 * 3), (__m128i *)(pu1_src1 + src_strd1)); + + y2_02_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src2 + (src_strd2 << 1)), (__m128i *)(pu1_src2)); + y2_13_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src2 + src_strd2 * 3), (__m128i *)(pu1_src2 + src_strd2)); + + y1_02_32x8b = _mm256_unpacklo_epi64(y1_02_32x8b, zero); + y1_13_32x8b = _mm256_unpacklo_epi64(y1_13_32x8b, zero); + y2_02_32x8b = _mm256_unpacklo_epi64(y2_02_32x8b, zero); + y2_13_32x8b = _mm256_unpacklo_epi64(y2_13_32x8b, zero); + + y1_0_32x8b = _mm256_unpacklo_epi32(y1_02_32x8b, y1_13_32x8b); + y2_0_32x8b = _mm256_unpacklo_epi32(y2_02_32x8b, y2_13_32x8b); + y1_0_16x8b_128 = _mm256_castsi256_si128(_mm256_permute4x64_epi64(y1_0_32x8b, 0xD8)); + y2_0_16x8b_128 = _mm256_castsi256_si128(_mm256_permute4x64_epi64(y2_0_32x8b, 0xD8)); + + y1_0_8x32b = _mm256_cvtepu8_epi16(y1_0_16x8b_128); // 8 to 16 + y2_0_8x32b = _mm256_cvtepu8_epi16(y2_0_16x8b_128); + + y1_0_8x32b = _mm256_mullo_epi16(y1_0_8x32b, wt1_8x32b); + y2_0_8x32b = _mm256_mullo_epi16(y2_0_8x32b, wt2_8x32b); + + y1_0_8x32b = _mm256_adds_epi16(y1_0_8x32b, y2_0_8x32b); + + y1_0_8x32b = _mm256_srai_epi16(y1_0_8x32b, shft); + + y1_0_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_0_8x32b); + + y1_0_16x8b_128 = _mm256_castsi256_si128(_mm256_packus_epi16(y1_0_8x32b, y1_0_8x32b)); + y1_2_16x8b_128 = _mm_srli_si128(y1_0_16x8b_128, 4); + y1_1_16x8b_128 = _mm_srli_si128(y1_0_16x8b_128, 8); + y1_3_16x8b_128 = _mm_srli_si128(y1_0_16x8b_128, 12); + + *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y1_0_16x8b_128); + *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y1_1_16x8b_128); + *((WORD32 *)(pu1_dst + (dst_strd << 1))) = _mm_cvtsi128_si32(y1_2_16x8b_128); + *((WORD32 *)(pu1_dst + dst_strd * 3)) = _mm_cvtsi128_si32(y1_3_16x8b_128); + + ht -= 4; + pu1_src1 += src_strd1 << 2; + pu1_src2 += src_strd2 << 2; + pu1_dst += dst_strd << 2; + } + while(ht > 0); + } + else if(wd == 8) + { + __m128i y1_0_16x8b_128,y2_0_16x8b_128,y1_2_16x8b_128,y1_1_16x8b_128,y1_3_16x8b_128; + __m256i y1_02_32x8b,y1_13_32x8b,y2_02_32x8b,y2_13_32x8b,y1_0_32x8b,y2_0_32x8b,y1_0_8x32b; + __m256i y1_1_8x32b,y2_0_8x32b,y2_1_8x32b; + + do + { + + y1_02_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src1 + (src_strd1 << 1)), (__m128i *)(pu1_src1)); + y1_13_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src1 + src_strd1 * 3), (__m128i *)(pu1_src1 + src_strd1)); + + y2_02_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src2 + (src_strd2 << 1)), (__m128i *)(pu1_src2)); + y2_13_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src2 + src_strd2 * 3), (__m128i *)(pu1_src2 + src_strd2)); + + y1_02_32x8b = _mm256_unpacklo_epi64(y1_02_32x8b, zero); + y1_13_32x8b = _mm256_unpacklo_epi64(y1_13_32x8b, zero); + y2_02_32x8b = _mm256_unpacklo_epi64(y2_02_32x8b, zero); + y2_13_32x8b = _mm256_unpacklo_epi64(y2_13_32x8b, zero); + + y1_0_32x8b = _mm256_unpacklo_epi64(y1_02_32x8b, y1_13_32x8b); + y2_0_32x8b = _mm256_unpacklo_epi64(y2_02_32x8b, y2_13_32x8b); + + y1_0_8x32b = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(y1_0_32x8b)); + y1_0_16x8b_128 = _mm256_castsi256_si128(_mm256_permute2x128_si256(y1_0_32x8b,y1_0_32x8b,0x1)); + y1_1_8x32b = _mm256_cvtepu8_epi16(y1_0_16x8b_128); + + y2_0_8x32b = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(y2_0_32x8b)); + y2_0_16x8b_128 = _mm256_castsi256_si128(_mm256_permute2x128_si256(y2_0_32x8b,y2_0_32x8b,0x1)); + y2_1_8x32b = _mm256_cvtepu8_epi16(y2_0_16x8b_128); + + + y1_0_8x32b = _mm256_mullo_epi16(y1_0_8x32b, wt1_8x32b); + y2_0_8x32b = _mm256_mullo_epi16(y2_0_8x32b, wt2_8x32b); + y1_1_8x32b = _mm256_mullo_epi16(y1_1_8x32b, wt1_8x32b); + y2_1_8x32b = _mm256_mullo_epi16(y2_1_8x32b, wt2_8x32b); + + y1_0_8x32b = _mm256_adds_epi16(y1_0_8x32b, y2_0_8x32b); + y1_1_8x32b = _mm256_adds_epi16(y1_1_8x32b, y2_1_8x32b); + + y1_0_8x32b = _mm256_srai_epi16(y1_0_8x32b, shft); + y1_1_8x32b = _mm256_srai_epi16(y1_1_8x32b, shft); + + y1_0_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_0_8x32b); + y1_1_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_1_8x32b); + + y1_0_32x8b = _mm256_packus_epi16(y1_0_8x32b, y1_1_8x32b); + y1_0_16x8b_128 = _mm256_castsi256_si128(y1_0_32x8b); + y1_2_16x8b_128 = _mm256_castsi256_si128(_mm256_srli_si256(y1_0_32x8b, 8)); + + y1_0_32x8b = _mm256_permute2x128_si256(y1_0_32x8b,y1_0_32x8b,1); + y1_1_16x8b_128 = _mm256_castsi256_si128(y1_0_32x8b); + y1_3_16x8b_128 = _mm256_castsi256_si128(_mm256_srli_si256(y1_0_32x8b, 8)); + + _mm_storel_epi64((__m128i *)pu1_dst, y1_0_16x8b_128); + _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b_128); + _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd << 1)), y1_2_16x8b_128); + _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd * 3), y1_3_16x8b_128); + + ht -= 4; + pu1_src1 += src_strd1 << 2; + pu1_src2 += src_strd2 << 2; + pu1_dst += dst_strd << 2; + + } + while(ht > 0); + } + else // wd == 16 + { + __m256i y1_0L_8x32b, y1_0H_8x32b, y1_1L_8x32b, y1_1H_8x32b; + __m256i y2_0L_8x32b, y2_0H_8x32b, y2_1L_8x32b, y2_1H_8x32b; + + __m256i zero_32x8b,y1_0_32x8b,y2_0_32x8b; + zero_32x8b = _mm256_set1_epi8(0); + + do + { + + y1_0_32x8b = _mm256_loadu_si256((__m256i *)pu1_src1); + y2_0_32x8b = _mm256_loadu_si256((__m256i *)pu1_src2); + + y1_0L_8x32b = _mm256_unpacklo_epi8(y1_0_32x8b, zero_32x8b); + y1_0H_8x32b = _mm256_unpackhi_epi8(y1_0_32x8b, zero_32x8b); + + y2_0L_8x32b = _mm256_unpacklo_epi8(y2_0_32x8b,zero_32x8b); + y2_0H_8x32b = _mm256_unpackhi_epi8(y2_0_32x8b, zero_32x8b); + + y1_0L_8x32b = _mm256_mullo_epi16(y1_0L_8x32b, wt1_8x32b); + y1_0H_8x32b = _mm256_mullo_epi16(y1_0H_8x32b, wt1_8x32b); + + y2_0L_8x32b = _mm256_mullo_epi16(y2_0L_8x32b, wt2_8x32b); + y2_0H_8x32b = _mm256_mullo_epi16(y2_0H_8x32b, wt2_8x32b); + + y1_0L_8x32b = _mm256_adds_epi16(y1_0L_8x32b, y2_0L_8x32b); + y1_0H_8x32b = _mm256_adds_epi16(y1_0H_8x32b, y2_0H_8x32b); + + y1_0L_8x32b = _mm256_adds_epi16(round_8x32b, y1_0L_8x32b); + y1_0H_8x32b = _mm256_adds_epi16(round_8x32b, y1_0H_8x32b); + + y1_0L_8x32b = _mm256_srai_epi16(y1_0L_8x32b, shft); + y1_0H_8x32b = _mm256_srai_epi16(y1_0H_8x32b, shft); + + y1_0L_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_0L_8x32b); + y1_0H_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_0H_8x32b); + + y1_0_32x8b = _mm256_packus_epi16(y1_0L_8x32b, y1_0H_8x32b); + + _mm256_storeu_si256((__m256i *)pu1_dst, y1_0_32x8b); + + ht -= 2; + pu1_src1 += src_strd1 << 1; + pu1_src2 += src_strd2 << 1; + pu1_dst += dst_strd << 1; + } + while(ht > 0); + } +} + + +/*****************************************************************************/ +/* */ +/* Function Name : ih264_weighted_bi_pred_chroma_avx2 */ +/* */ +/* Description : This function performs the weighted biprediction as */ +/* described in sec 8.4.2.3.2 titled "Weighted sample */ +/* prediction process" for chroma. The function gets two */ +/* ht x wd blocks, weights them, adds them, rounds off the */ +/* sum, offsets it, saturates it to unsigned 8-bit and */ +/* stores it in the destination block. (ht,wd) can be */ +/* (2,2), (4,2), (2,4), (4,4), (8,4), (4,8) or (8,8). */ +/* */ +/* Inputs : pu1_src1 - Pointer to source 1 */ +/* pu1_src2 - Pointer to source 2 */ +/* pu1_dst - Pointer to destination */ +/* src_strd1 - stride for source 1 */ +/* src_strd2 - stride for source 2 */ +/* dst_strd2 - stride for destination */ +/* log_wd - number of bits to be rounded off */ +/* wt1 - weight values for u and v in source 1 */ +/* wt2 - weight values for u and v in source 2 */ +/* ofst1 - offset value for u and v in source 1 */ +/* ofst2 - offset value for u and v in source 2 */ +/* ht - height of the block */ +/* wd - width of the block */ +/* */ +/* Issues : None */ +/* */ +/* Revision History: */ +/* */ +/* DD MM YYYY Author(s) Changes */ +/* 04 02 2015 Kaushik Initial Version */ +/* Senthoor */ +/* 15 09 2020 Priyanka Bose AVX2 Intel Intrinsics Support */ +/*****************************************************************************/ +void ih264_weighted_bi_pred_chroma_avx2(UWORD8 *pu1_src1, + UWORD8 *pu1_src2, + UWORD8 *pu1_dst, + WORD32 src_strd1, + WORD32 src_strd2, + WORD32 dst_strd, + WORD32 log_wd, + WORD32 wt1, + WORD32 wt2, + WORD32 ofst1, + WORD32 ofst2, + WORD32 ht, + WORD32 wd) +{ + + __m128i y1_0_16x8b, y1_1_16x8b; + __m128i y2_0_16x8b, y2_1_16x8b; + + __m128i wt1_8x16b, wt2_8x16b; + __m128i ofst_8x16b, round_8x16b; + + WORD32 ofst1_u, ofst2_u, ofst_u; + WORD32 ofst1_v, ofst2_v, ofst_v; + WORD32 round_val, shft, ofst_val,ofst_val_256; + + round_val = 1 << log_wd; + shft = log_wd + 1; + + ofst1_u = (WORD8)(ofst1 & 0xff); + ofst1_v = (WORD8)(ofst1 >> 8); + ofst2_u = (WORD8)(ofst2 & 0xff); + ofst2_v = (WORD8)(ofst2 >> 8); + + wt1_8x16b = _mm_set1_epi32(wt1); + wt2_8x16b = _mm_set1_epi32(wt2); + + ofst_u = (ofst1_u + ofst2_u + 1) >> 1; + ofst_v = (ofst1_v + ofst2_v + 1) >> 1; + ofst_val = (ofst_u & 0xffff) | (ofst_v << 16); + ofst_val_256 = (ofst_u & 0xffff) | (ofst_v << 16); + + round_8x16b = _mm_set1_epi16(round_val); + ofst_8x16b = _mm_set1_epi32(ofst_val); + + if(wd == 2) + { + __m128i y1_0_8x16b, y2_0_8x16b; + + do + { + y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1); //Loading 64 bits from diff location + y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1)); + + y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2); + y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2)); + + y1_0_16x8b = _mm_unpacklo_epi32(y1_0_16x8b, y1_1_16x8b); + y2_0_16x8b = _mm_unpacklo_epi32(y2_0_16x8b, y2_1_16x8b); + + y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b); + y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b); + + y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b); + y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b); + + y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b); + y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b); + + y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft); + y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b); + + y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_0_8x16b); + y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 4); + + *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y1_0_16x8b); + *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y1_1_16x8b); + + ht -= 2; + pu1_src1 += src_strd1 << 1; + pu1_src2 += src_strd2 << 1; + pu1_dst += dst_strd << 1; + } + while(ht > 0); + } + else if(wd == 4) + { + __m128i y1_0_8x16b, y1_1_8x16b; + __m128i y2_0_8x16b, y2_1_8x16b; + + do + { + y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1); //Loading 64 bits from diff location + y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1)); + + y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2); + y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2)); + + y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b); + y1_1_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b); + + y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b); + y2_1_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b); + + y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b); + y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b); + y1_1_8x16b = _mm_mullo_epi16(y1_1_8x16b, wt1_8x16b); + y2_1_8x16b = _mm_mullo_epi16(y2_1_8x16b, wt2_8x16b); + + y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b); + y1_1_8x16b = _mm_adds_epi16(y1_1_8x16b, y2_1_8x16b); + + y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b); + y1_1_8x16b = _mm_adds_epi16(round_8x16b, y1_1_8x16b); + + y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft); + y1_1_8x16b = _mm_srai_epi16(y1_1_8x16b, shft); + + y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b); + y1_1_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1_8x16b); + + y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_1_8x16b); + y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 8); + + _mm_storel_epi64((__m128i *)pu1_dst, y1_0_16x8b); + _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b); + + ht -= 2; + pu1_src1 += src_strd1 << 1; + pu1_src2 += src_strd2 << 1; + pu1_dst += dst_strd << 1; + } + while(ht > 0); + } + else // wd == 8 + { + __m256i y1_0L_8x32b, y1_0H_8x32b, y1_1L_8x32b, y1_1H_8x32b; + __m256i y2_0L_8x32b, y2_0H_8x32b, y2_1L_8x32b, y2_1H_8x32b; + __m256i y1_0_32x8b,y2_0_32x8b,ofst_8x32b,round_8x32b; + __m256i wt1_8x32b, wt2_8x32b; + __m256i zero_32x8b; + + wt1_8x32b = _mm256_set1_epi16(wt1); + wt2_8x32b = _mm256_set1_epi16(wt2); + round_8x32b = _mm256_set1_epi16(round_val); + ofst_8x32b = _mm256_set1_epi32(ofst_val_256); + zero_32x8b = _mm256_set1_epi8(0); + + do + { + y1_0_32x8b = _mm256_loadu_si256((__m256i *)pu1_src1); + y2_0_32x8b = _mm256_loadu_si256((__m256i *)pu1_src2); + y1_0L_8x32b = _mm256_unpacklo_epi8(y1_0_32x8b, zero_32x8b); + y1_0H_8x32b = _mm256_unpackhi_epi8(y1_0_32x8b, zero_32x8b); + y2_0L_8x32b = _mm256_unpacklo_epi8(y2_0_32x8b, zero_32x8b); + y2_0H_8x32b = _mm256_unpackhi_epi8(y2_0_32x8b, zero_32x8b); + y1_0L_8x32b = _mm256_mullo_epi16(y1_0L_8x32b, wt1_8x32b); + y1_0H_8x32b = _mm256_mullo_epi16(y1_0H_8x32b, wt1_8x32b); + + y2_0L_8x32b = _mm256_mullo_epi16(y2_0L_8x32b, wt2_8x32b); + y2_0H_8x32b = _mm256_mullo_epi16(y2_0H_8x32b, wt2_8x32b); + + y1_0L_8x32b = _mm256_adds_epi16(y1_0L_8x32b, y2_0L_8x32b); + y1_0H_8x32b = _mm256_adds_epi16(y1_0H_8x32b, y2_0H_8x32b); + + y1_0L_8x32b = _mm256_adds_epi16(round_8x32b, y1_0L_8x32b); + y1_0H_8x32b = _mm256_adds_epi16(round_8x32b, y1_0H_8x32b); + + y1_0L_8x32b = _mm256_srai_epi16(y1_0L_8x32b, shft); + y1_0H_8x32b = _mm256_srai_epi16(y1_0H_8x32b, shft); + + y1_0L_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_0L_8x32b); + y1_0H_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_0H_8x32b); + + + y1_0_32x8b = _mm256_packus_epi16(y1_0L_8x32b, y1_0H_8x32b); + _mm256_storeu_si256((__m256i *)pu1_dst, y1_0_32x8b); + + ht -= 2; + pu1_src1 += src_strd1 << 1; + pu1_src2 += src_strd2 << 1; + pu1_dst += dst_strd << 1; + } + while(ht > 0); + } +} diff --git a/decoder/ih264d_function_selector.h b/decoder/ih264d_function_selector.h index 22e2efe..ad40d24 100644 --- a/decoder/ih264d_function_selector.h +++ b/decoder/ih264d_function_selector.h @@ -67,5 +67,6 @@ void ih264d_init_function_ptr_sse42(dec_struct_t *ps_codec); void ih264d_init_function_ptr_a9q(dec_struct_t *ps_codec); void ih264d_init_function_ptr_av8(dec_struct_t *ps_codec); +void ih264d_init_function_ptr_avx2(dec_struct_t *ps_codec); #endif /* _IH264D_FUNCTION_SELECTOR_H_ */ diff --git a/decoder/libavcdec.cmake b/decoder/libavcdec.cmake index 1b72dd5..48a7325 100644 --- a/decoder/libavcdec.cmake +++ b/decoder/libavcdec.cmake @@ -46,7 +46,8 @@ else() list( APPEND LIBAVCDEC_SRCS "${AVC_ROOT}/decoder/x86/ih264d_function_selector.c" "${AVC_ROOT}/decoder/x86/ih264d_function_selector_sse42.c" - "${AVC_ROOT}/decoder/x86/ih264d_function_selector_ssse3.c") + "${AVC_ROOT}/decoder/x86/ih264d_function_selector_ssse3.c" + "${AVC_ROOT}/decoder/x86/ih264d_function_selector_avx2.c") endif() add_library(libavcdec STATIC ${LIBAVC_COMMON_SRCS} ${LIBAVC_COMMON_ASMS} diff --git a/decoder/mvc/libmvcdec.cmake b/decoder/mvc/libmvcdec.cmake index 4eb7643..6c4639b 100644 --- a/decoder/mvc/libmvcdec.cmake +++ b/decoder/mvc/libmvcdec.cmake @@ -54,7 +54,8 @@ else() list( APPEND LIBMVCDEC_ASMS "${AVC_ROOT}/decoder/x86/ih264d_function_selector.c" "${AVC_ROOT}/decoder/x86/ih264d_function_selector_sse42.c" - "${AVC_ROOT}/decoder/x86/ih264d_function_selector_ssse3.c") + "${AVC_ROOT}/decoder/x86/ih264d_function_selector_ssse3.c" + "${AVC_ROOT}/decoder/x86/ih264d_function_selector_avx2.c") endif() add_library(libmvcdec STATIC ${LIBAVC_COMMON_SRCS} ${LIBAVC_COMMON_ASMS} diff --git a/decoder/svc/libsvcdec.cmake b/decoder/svc/libsvcdec.cmake index d088426..6644353 100644 --- a/decoder/svc/libsvcdec.cmake +++ b/decoder/svc/libsvcdec.cmake @@ -95,7 +95,8 @@ else() "${AVC_ROOT}/decoder/x86/svc/isvcd_iquant_itrans_residual_sse42.c" "${AVC_ROOT}/decoder/x86/svc/isvcd_iquant_itrans_sse42.c" "${AVC_ROOT}/decoder/x86/svc/isvcd_pred_residual_recon_sse42.c" - "${AVC_ROOT}/decoder/x86/svc/isvcd_residual_resamp_sse42.c") + "${AVC_ROOT}/decoder/x86/svc/isvcd_residual_resamp_sse42.c" + "${AVC_ROOT}/decoder/x86/ih264d_function_selector_avx2.c") endif() add_library(libsvcdec STATIC ${LIBAVC_COMMON_SRCS} ${LIBAVC_COMMON_ASMS} diff --git a/decoder/x86/ih264d_function_selector.c b/decoder/x86/ih264d_function_selector.c index 9fc5c39..8c99382 100644 --- a/decoder/x86/ih264d_function_selector.c +++ b/decoder/x86/ih264d_function_selector.c @@ -52,6 +52,7 @@ #include "ih264_error.h" #include "ih264_trans_quant_itrans_iquant.h" #include "ih264_inter_pred_filters.h" +#include "ih264_deblk_edge_filters.h" #include "ih264d_structs.h" #include "ih264d_function_selector.h" @@ -68,6 +69,11 @@ void ih264d_init_function_ptr(dec_struct_t *ps_codec) case ARCH_X86_SSSE3: ih264d_init_function_ptr_ssse3(ps_codec); break; + case ARCH_X86_AVX2: + ih264d_init_function_ptr_ssse3(ps_codec); + ih264d_init_function_ptr_sse42(ps_codec); + ih264d_init_function_ptr_avx2(ps_codec); + break; case ARCH_X86_SSE42: default: ih264d_init_function_ptr_ssse3(ps_codec); @@ -83,7 +89,7 @@ void ih264d_init_arch(dec_struct_t *ps_codec) #elif DEFAULT_ARCH == D_ARCH_X86_SSSE3 ps_codec->e_processor_arch = ARCH_X86_SSSE3; #elif DEFAULT_ARCH == D_ARCH_X86_AVX2 - ps_codec->e_processor_arch = D_ARCH_X86_AVX2; + ps_codec->e_processor_arch = ARCH_X86_AVX2; #else ps_codec->e_processor_arch = ARCH_X86_GENERIC; #endif diff --git a/decoder/x86/ih264d_function_selector_avx2.c b/decoder/x86/ih264d_function_selector_avx2.c new file mode 100644 index 0000000..3a75350 --- /dev/null +++ b/decoder/x86/ih264d_function_selector_avx2.c @@ -0,0 +1,102 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* ih264e_function_selector_generic.c +* +* @brief +* Contains functions to initialize function pointers of codec context +* +* @author +* Ittiam +* +* @par List of Functions: +* - ih264e_init_function_ptr_generic +* +* @remarks +* None +* +******************************************************************************* +*/ + + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* System Include files */ +#include +#include +#include +#include + +/* User Include files */ +#include "ih264_typedefs.h" +#include "iv.h" +#include "ivd.h" +#include "ih264_defs.h" +#include "ih264_size_defs.h" +#include "ih264_error.h" +#include "ih264_trans_quant_itrans_iquant.h" +#include "ih264_inter_pred_filters.h" +#include "ih264_trans_quant_itrans_iquant.h" +#include "ih264_weighted_pred.h" +#include "ih264d_structs.h" + +#include "ih264_deblk_edge_filters.h" +/** +******************************************************************************* +* +* @brief Initialize the intra/inter/transform/deblk function pointers of +* codec context +* +* @par Description: the current routine initializes the function pointers of +* codec context basing on the architecture in use +* +* @param[in] ps_codec +* Codec context pointer +* +* @returns none +* +* @remarks none +* +******************************************************************************* +*/ +void ih264d_init_function_ptr_avx2(dec_struct_t *ps_codec) +{ + + UNUSED(ps_codec); + ps_codec->pf_deblk_chroma_vert_bslt4 = ih264_deblk_chroma_vert_bslt4_avx2; + ps_codec->pf_deblk_chroma_horz_bslt4 = ih264_deblk_chroma_horz_bslt4_avx2; + ps_codec->pf_deblk_luma_horz_bslt4 = ih264_deblk_luma_horz_bslt4_avx2; + //ps_codec->pf_deblk_luma_vert_bslt4 = ih264_deblk_luma_vert_bslt4_avx2; + ps_codec->apf_inter_pred_luma[5] = ih264_inter_pred_luma_horz_qpel_vert_qpel_avx2; + ps_codec->apf_inter_pred_luma[7] = ih264_inter_pred_luma_horz_qpel_vert_qpel_avx2; + ps_codec->apf_inter_pred_luma[13] = ih264_inter_pred_luma_horz_qpel_vert_qpel_avx2; + ps_codec->apf_inter_pred_luma[15] = ih264_inter_pred_luma_horz_qpel_vert_qpel_avx2; + ps_codec->pf_ihadamard_scaling_4x4 = ih264_ihadamard_scaling_4x4_avx2; + ps_codec->pf_inter_pred_chroma = ih264_inter_pred_chroma_avx2; + ps_codec->apf_inter_pred_luma[0] = ih264_inter_pred_luma_copy_avx2; + ps_codec->pf_iquant_itrans_recon_luma_4x4 = ih264_iquant_itrans_recon_4x4_avx2; + ps_codec->pf_weighted_bi_pred_luma = ih264_weighted_bi_pred_luma_avx2; + ps_codec->pf_weighted_bi_pred_chroma = ih264_weighted_bi_pred_chroma_avx2; + return; +}