avx2 intrinsics support for libavc (sw decoder)

This patch adds handwritten AVX2 intrinsics to optimize the libavc software decoder by reducing
CPU-cycle overhead in the libcodec2_soft_avcdec module.

Playing back 1024-resolution video in the Gallery app with the HW decoder disabled: cpu-cycles
overhead (%) reduced by ~15%.
Loading video thumbnails in the Gallery/Photos app is faster (more than 30 videos were pushed to
the device for this use case): cpu-cycles overhead (%) reduced by ~10%. This patch only affects
s/w video decoding.

Signed-off-by: Priyanka Bose <priyanka.bose@intel.corp-partner.google.com>
Priyanka Bose 2025-11-10 13:36:27 +05:30 committed by Harish Mahendrakar
parent 9331596eef
commit ca7461442c
19 changed files with 2817 additions and 8 deletions
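For context, the decoder picks its SIMD kernels at run time rather than at build time; this patch adds decoder/x86/ih264d_function_selector_avx2.c for that purpose, though its contents are not reproduced below. The following is only an illustrative sketch of that dispatch pattern, not the patch's actual selector code: the pointer name, select_deblk_kernels, and the use of __builtin_cpu_supports are assumptions, while the kernel names come from the header hunk below and the parameter list mirrors ih264_deblk_chroma_vert_bslt4_avx2 as defined later in this commit.

#include <stdint.h>

/* Parameter list mirrors the AVX2 chroma deblock kernel below
 * (UWORD8 = uint8_t, WORD32 = int32_t, UWORD32 = uint32_t). */
typedef void deblk_chroma_bslt4_ft(uint8_t *pu1_src, int32_t src_strd,
                                   int32_t alpha_cb, int32_t beta_cb,
                                   int32_t alpha_cr, int32_t beta_cr,
                                   uint32_t u4_bs,
                                   const uint8_t *pu1_cliptab_cb,
                                   const uint8_t *pu1_cliptab_cr);

deblk_chroma_bslt4_ft ih264_deblk_chroma_vert_bslt4_ssse3; /* existing kernel */
deblk_chroma_bslt4_ft ih264_deblk_chroma_vert_bslt4_avx2;  /* added by this patch */

/* Hypothetical dispatch: default to the SSSE3 kernel, upgrade when the CPU
 * reports AVX2. The real decoder keeps such pointers in its codec context
 * and fills them in ih264d_function_selector_*.c. */
static deblk_chroma_bslt4_ft *pf_deblk_chroma_vert_bslt4;

static void select_deblk_kernels(void)
{
    pf_deblk_chroma_vert_bslt4 = ih264_deblk_chroma_vert_bslt4_ssse3;
#if defined(__GNUC__) || defined(__clang__)
    if (__builtin_cpu_supports("avx2"))
        pf_deblk_chroma_vert_bslt4 = ih264_deblk_chroma_vert_bslt4_avx2;
#endif
}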

@@ -236,6 +236,56 @@ cc_defaults {
},
}
cc_library_static {
name: "libavc_avx2",
defaults: ["libavc_dec_defaults"],
visibility: ["//visibility:private"],
export_include_dirs: [
"common",
"decoder",
],
arch: {
x86: {
cflags: [
"-mavx2",
"-mfma",
],
srcs: [
"common/x86/ih264_ihadamard_scaling_avx2.c",
"common/x86/ih264_deblk_chroma_avx2.c",
"common/x86/ih264_deblk_luma_avx2.c",
"common/x86/ih264_iquant_itrans_recon_avx2.c",
"common/x86/ih264_weighted_pred_avx2.c",
"common/x86/ih264_inter_pred_filters_avx2.c",
"decoder/x86/ih264d_function_selector_avx2.c",
],
},
x86_64: {
cflags: [
"-mavx2",
"-mfma",
],
srcs: [
"common/x86/ih264_ihadamard_scaling_avx2.c",
"common/x86/ih264_deblk_chroma_avx2.c",
"common/x86/ih264_deblk_luma_avx2.c",
"common/x86/ih264_iquant_itrans_recon_avx2.c",
"common/x86/ih264_weighted_pred_avx2.c",
"common/x86/ih264_inter_pred_filters_avx2.c",
"decoder/x86/ih264d_function_selector_avx2.c",
],
},
},
sanitize: {
blocklist: "libavc_blocklist.txt",
},
apex_available: [
"//apex_available:platform", // used by libstagefright_soft_avcdec
"com.android.media.swcodec",
],
min_sdk_version: "29",
}
cc_library_static {
name: "libavcdec",
defaults: ["libavc_dec_defaults"],
@@ -380,6 +430,10 @@ cc_library_static {
"decoder/x86/ih264d_function_selector_sse42.c",
"decoder/x86/ih264d_function_selector_ssse3.c",
],
whole_static_libs: [
"libavc_avx2",
],
},
x86_64: {
@@ -400,6 +454,9 @@ cc_library_static {
"decoder/x86/ih264d_function_selector_sse42.c",
"decoder/x86/ih264d_function_selector_ssse3.c",
],
whole_static_libs: [
"libavc_avx2",
],
},
},

@@ -8,7 +8,7 @@ function(libavc_add_compile_options)
elseif("${SYSTEM_PROCESSOR}" STREQUAL "aarch32")
add_compile_options(-march=armv7-a -mfpu=neon)
else()
add_compile_options(-msse4.2 -mno-avx)
add_compile_options(-msse4.2 -mavx2 -mfma)
endif()
add_compile_options(-Wdeclaration-after-statement)
@@ -45,8 +45,8 @@ function(libavc_add_definitions)
elseif("${SYSTEM_PROCESSOR}" STREQUAL "aarch32")
add_definitions(-DARMV7 -DDEFAULT_ARCH=D_ARCH_ARM_A9Q)
else()
add_definitions(-DX86 -DX86_LINUX=1 -DDISABLE_AVX2
-DDEFAULT_ARCH=D_ARCH_X86_SSE42)
add_definitions(-DX86 -DX86_LINUX=1
-DDEFAULT_ARCH=D_ARCH_X86_SSE42)
endif()
endfunction()

@@ -108,7 +108,13 @@ else()
"${AVC_ROOT}/common/x86/ih264_mem_fns_ssse3.c"
"${AVC_ROOT}/common/x86/ih264_padding_ssse3.c"
"${AVC_ROOT}/common/x86/ih264_resi_trans_quant_sse42.c"
"${AVC_ROOT}/common/x86/ih264_weighted_pred_sse42.c")
"${AVC_ROOT}/common/x86/ih264_weighted_pred_sse42.c"
"${AVC_ROOT}/common/x86/ih264_ihadamard_scaling_avx2.c"
"${AVC_ROOT}/common/x86/ih264_deblk_chroma_avx2.c"
"${AVC_ROOT}/common/x86/ih264_deblk_luma_avx2.c"
"${AVC_ROOT}/common/x86/ih264_iquant_itrans_recon_avx2.c"
"${AVC_ROOT}/common/x86/ih264_weighted_pred_avx2.c"
"${AVC_ROOT}/common/x86/ih264_inter_pred_filters_avx2.c")
include_directories(${AVC_ROOT}/common/x86)
endif()

@@ -159,4 +159,12 @@ ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_horz_bslt4_ssse3;
ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_vert_bslt4_mbaff_ssse3;
ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_horz_bslt4_mbaff_ssse3;
/* AVX2 */
ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_vert_bslt4_avx2;
ih264_deblk_chroma_edge_bslt4_ft ih264_deblk_chroma_horz_bslt4_avx2;
ih264_deblk_edge_bslt4_ft ih264_deblk_luma_horz_bslt4_avx2;
ih264_deblk_edge_bslt4_ft ih264_deblk_luma_vert_bslt4_avx2;
#endif /* _IH264_DEBLK_EDGE_FILTERS_H_ */

@@ -138,4 +138,11 @@ ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_qpel_vert_hpel_ssse3;
ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_hpel_vert_qpel_ssse3;
ih264_inter_pred_chroma_ft ih264_inter_pred_chroma_ssse3;
/* AVX2 Intrinsic Declarations */
ih264_inter_pred_chroma_ft ih264_inter_pred_chroma_avx2;
ih264_inter_pred_luma_ft ih264_inter_pred_luma_copy_avx2;
ih264_inter_pred_luma_ft ih264_inter_pred_luma_horz_qpel_vert_qpel_avx2;
#endif /* _IH264_INTER_PRED_FILTERS_H_ */

@@ -231,4 +231,10 @@ ih264_ihadamard_scaling_ft ih264_ihadamard_scaling_4x4_sse42;
ih264_hadamard_quant_ft ih264_hadamard_quant_4x4_sse42;
ih264_hadamard_quant_ft ih264_hadamard_quant_2x2_uv_sse42;
/*AVX2 Declarations*/
ih264_ihadamard_scaling_ft ih264_ihadamard_scaling_4x4_avx2;
ih264_ihadamard_scaling_ft ih264_ihadamard_scaling_2x2_uv_avx2;
ih264_iquant_itrans_recon_ft ih264_iquant_itrans_recon_4x4_avx2;
#endif /* _IH264_TRANS_QUANT_ITRANS_IQUANT_H_ */

@@ -106,5 +106,10 @@ ih264_weighted_pred_ft ih264_weighted_pred_chroma_sse42;
ih264_weighted_bi_pred_ft ih264_weighted_bi_pred_luma_sse42;
ih264_weighted_bi_pred_ft ih264_weighted_bi_pred_chroma_sse42;
/* AVX2 */
ih264_weighted_bi_pred_ft ih264_weighted_bi_pred_luma_avx2;
ih264_weighted_bi_pred_ft ih264_weighted_bi_pred_chroma_avx2;
#endif /* _IH264_WEIGHTED_PRED_H_ */

@@ -0,0 +1,386 @@
/******************************************************************************
*
* Copyright (C) 2015 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*****************************************************************************
* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/*****************************************************************************/
/*****************************************************************************/
/* File Includes */
/*****************************************************************************/
/* System include files */
#include <stdio.h>
#ifdef __ANDROID__
#include "log/log.h"
#include <cutils/log.h>
#endif
/* User include files */
#include "ih264_typedefs.h"
#include "ih264_platform_macros.h"
#include "ih264_deblk_edge_filters.h"
#include "ih264_macros.h"
#include <stdint.h>
#include <string.h>
#include <immintrin.h>
/*****************************************************************************/
/* */
/* Function Name : ih264_deblk_chroma_vert_bslt4_avx2() */
/* */
/* Description : This function performs filtering of a chroma block */
/* vertical edge when the boundary strength is less than 4 */
/* in high profile. */
/* */
/* Inputs : pu1_src - pointer to the src sample q0 of U */
/* src_strd - source stride */
/* alpha_cb - alpha value for the boundary in U */
/* beta_cb - beta value for the boundary in U */
/* alpha_cr - alpha value for the boundary in V */
/* beta_cr - beta value for the boundary in V */
/* u4_bs - packed Boundary strength array */
/* pu1_cliptab_cb - tc0_table for U */
/* pu1_cliptab_cr - tc0_table for V */
/* */
/* Globals : None */
/* */
/* Processing : This operation is described in Sec. 8.7.2.3 under the */
/* title "Filtering process for edges for bS less than 4" */
/* in ITU T Rec H.264 with alpha and beta values different */
/* in U and V. */
/* */
/* Outputs : None */
/* */
/* Returns : None */
/* */
/* Issues : None */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
/* 12 02 2015 Naveen Kumar P Initial version */
/* 15 09 2020 Priyanka Bose AVX2 Intel Intrinsics Support */
/*****************************************************************************/
void ih264_deblk_chroma_vert_bslt4_avx2(UWORD8 *pu1_src,
WORD32 src_strd,
WORD32 alpha_cb,
WORD32 beta_cb,
WORD32 alpha_cr,
WORD32 beta_cr,
UWORD32 u4_bs,
const UWORD8 *pu1_cliptab_cb,
const UWORD8 *pu1_cliptab_cr)
{
UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
__m128i linea, lineb, linec, lined, linee, linef, lineg, lineh;
__m256i lineab, linecd, lineef, linegh, lineae, linebf, linecg, linedh;
__m256i temp1, temp2, temp3, temp4;
__m256i t1,t3, t2,t4,pq0_uv_32x8,pq1_uv_32x8,tmp1,tmp2,p0_uv_8x32,q0_uv_8x32;
__m256i pq0_uv_8x32, pq1_uv_8x32, p1_uv_8x32,pq0_uv_8x32_1,pq0_uv_8x32_2;
__m256i flag_bs, flag1, flag2;
__m256i diff, diff1, alpha_cbcr_32x8, beta_cbcr_32x8, in_macro;
__m256i zero = _mm256_setzero_si256();
__m256i C0_uv_8x32;
__m256i p0_uv_8x32_1, p0_uv_8x32_2, q0_uv_8x32_1, q0_uv_8x32_2,p0_uv_32x8_1,q0_uv_32x8_1;
u1_Bs0 = (u4_bs >> 24) & 0xff;
u1_Bs1 = (u4_bs >> 16) & 0xff;
u1_Bs2 = (u4_bs >> 8) & 0xff;
u1_Bs3 = (u4_bs >> 0) & 0xff;
flag_bs = _mm256_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3,u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3,
u1_Bs2, u1_Bs2, u1_Bs2, u1_Bs2, u1_Bs2, u1_Bs2,u1_Bs2, u1_Bs2,
u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0,u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
flag_bs = _mm256_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s
flag_bs = _mm256_xor_si256(flag_bs, _mm256_set1_epi8(0xFF)); //Invert for required mask
/* Load and transpose the pixel values */
lineab = _mm256_loadu2_m128i((__m128i *)(pu1_src_uv - 4 + src_strd), (__m128i *)(pu1_src_uv - 4));
linecd = _mm256_loadu2_m128i((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), (__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
lineef = _mm256_loadu2_m128i((__m128i *)(pu1_src_uv - 4 + 5 * src_strd), (__m128i *)(pu1_src_uv - 4 + 4 * src_strd));
linegh = _mm256_loadu2_m128i((__m128i *)(pu1_src_uv - 4 + 7 * src_strd), (__m128i *)(pu1_src_uv - 4 + 6 * src_strd));
temp1 = _mm256_unpacklo_epi64(lineab, zero); //a0 -- a7 000.. b0..b7 000
temp2 = _mm256_unpacklo_epi64(linecd, zero);
temp3 = _mm256_unpacklo_epi64(lineef, zero); //e0 -- e7 000.. f0..f7 000
temp4 = _mm256_unpacklo_epi64(linegh, zero);
temp1 = _mm256_unpacklo_epi16(temp1, temp2); //a0 a1 c0 c1 -- a6 a7 c6 c7 b0 b1 d0 d1.. b6 b7 d6 d7
temp2 = _mm256_unpacklo_epi16(temp3, temp4); //e0 e1 g0 g1 f0 f1 h0 h1
t2 = _mm256_permute2f128_si256(temp1, temp2, 0x20);
t3 = _mm256_permute2f128_si256(temp1, temp2, 0x31);
tmp1 = _mm256_unpacklo_epi16(t2, t3); //a0 a1 b0 b1 c0 c1 d0 d1 -a2 a3 b2 b3 .... e0 e1 f0 f1 g0 g1 h0 h1 -e2 e3..
tmp2 = _mm256_unpackhi_epi16(t2, t3); //a4 a5 b4 b5 -a6 a7 b6 b7
temp1 = _mm256_unpacklo_epi8(tmp1,zero); // a0 0 a1 0 b0 0 b1 0 c0 0 c1 0 d0 0 d1 0 - e0 0 e1 0 .. => p1
temp2 = _mm256_unpackhi_epi8(tmp1,zero); // a2 0 a3 0 => p0
temp3 = _mm256_unpacklo_epi8(tmp2,zero); //a4 0 a5 0 => q0
temp4 = _mm256_unpackhi_epi8(tmp2,zero); //a6 0 a7 0 => q1
pq1_uv_32x8 = _mm256_packus_epi16(temp1,temp4); // 0213
pq0_uv_32x8 = _mm256_packus_epi16(temp2,temp3); //0213
diff = _mm256_subs_epi16(temp2, temp3); //Condn 1 (p0 -q0) - set (3), set(3)
diff = _mm256_abs_epi16(diff);
alpha_cbcr_32x8 = _mm256_set1_epi32(alpha_cbcr);
flag1 = _mm256_cmpgt_epi16(alpha_cbcr_32x8, diff);
diff = _mm256_subs_epi16(temp4, temp3); //Condtn 2 (q1 -q0)
diff = _mm256_abs_epi16(diff);
beta_cbcr_32x8 = _mm256_set1_epi32(beta_cbcr);
flag1 = _mm256_and_si256(flag1, _mm256_cmpgt_epi16(beta_cbcr_32x8, diff));
diff = _mm256_subs_epi16(temp1, temp2); //Condtn 3 (p1 -p0)
diff = _mm256_abs_epi16(diff);
flag1 = _mm256_and_si256(flag1, _mm256_cmpgt_epi16(beta_cbcr_32x8, diff));
diff = _mm256_subs_epi16(temp3, temp2); //(q0 -p0)
diff = _mm256_slli_epi16(diff, 2);
diff1 = _mm256_subs_epi16(temp1, temp4); //(p1 -q1)
diff = _mm256_add_epi16(diff, diff1);
diff = _mm256_add_epi16(diff, _mm256_set1_epi16(4));
in_macro = _mm256_srai_epi16(diff, 3);
C0_uv_8x32 = _mm256_set_epi16(pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0],
pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0],
pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2]);
C0_uv_8x32 = _mm256_add_epi16(C0_uv_8x32, _mm256_set1_epi16(1));
in_macro = _mm256_min_epi16(C0_uv_8x32, in_macro); //CLIP3
C0_uv_8x32 = _mm256_subs_epi16(zero, C0_uv_8x32);
in_macro = _mm256_max_epi16(C0_uv_8x32, in_macro);
p0_uv_8x32_1 = _mm256_add_epi16(temp2, in_macro);
q0_uv_8x32_1 = _mm256_sub_epi16(temp3, in_macro);
flag1 = _mm256_and_si256(flag1, flag_bs);
flag1 = _mm256_packs_epi16(flag1, flag1); // 0213
pq0_uv_8x32 = _mm256_packus_epi16(p0_uv_8x32_1,q0_uv_8x32_1); //0213
pq0_uv_8x32_1 = _mm256_and_si256(pq0_uv_32x8,
_mm256_xor_si256(flag1, _mm256_set1_epi8(0xFF)));
pq0_uv_8x32_2 = _mm256_and_si256(pq0_uv_8x32, flag1);
pq0_uv_32x8 = _mm256_add_epi8(pq0_uv_8x32_1, pq0_uv_8x32_2);
t1 = _mm256_unpacklo_epi16(pq1_uv_32x8, pq0_uv_32x8); // temp1 temp3
t2 = _mm256_unpackhi_epi16(pq1_uv_32x8, pq0_uv_32x8); // temp2 temp4
t4 = _mm256_shufflelo_epi16(t2, _MM_SHUFFLE(2, 3, 0, 1)); // pshuflw
t4 = _mm256_shufflehi_epi16(t4, _MM_SHUFFLE(2, 3, 0, 1));
lineae = _mm256_unpacklo_epi32(t1, t4); // temp1 temp3
linecg = _mm256_unpackhi_epi32(t1, t4); // temp2 temp4
linea = _mm256_castsi256_si128(lineae);
lineb = _mm256_castsi256_si128(_mm256_srli_si256(lineae, 8));
lineae = _mm256_permute2f128_si256(lineae, lineae, 0x1);
linee = _mm256_castsi256_si128(lineae);
linef = _mm256_castsi256_si128(_mm256_srli_si256(lineae, 8));
linec = _mm256_castsi256_si128(linecg);
lined = _mm256_castsi256_si128(_mm256_srli_si256(linecg, 8));
linecg = _mm256_permute2f128_si256(linecg, linecg, 0x1);
lineg = _mm256_castsi256_si128(linecg);
lineh = _mm256_castsi256_si128(_mm256_srli_si256(linecg, 8));
_mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
_mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
_mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
_mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);
_mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd), linee);
_mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd), linef);
_mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd), lineg);
_mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd), lineh);
}
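For readers following the shuffles above, this is the per-sample arithmetic being vectorized: the H.264 bS < 4 chroma filter (Sec. 8.7.2.3), applied only where the packed boundary strength is non-zero (the flag_bs mask) and the alpha/beta conditions hold. A minimal scalar sketch, not part of the patch, with the clip value tc formed as tc0 + 1 exactly as C0_uv_8x32 is built above:

#include <stdint.h>
#include <stdlib.h>

/* One chroma sample pair across the edge: p1 p0 | q0 q1.
 * tc0 = pu1_cliptab_cx[bS]; alpha/beta are the plane's thresholds. */
static inline void chroma_bslt4_sample(uint8_t *p0, uint8_t *q0, int p1, int q1,
                                       int alpha, int beta, int tc0)
{
    if (abs(*p0 - *q0) >= alpha || abs(q1 - *q0) >= beta || abs(p1 - *p0) >= beta)
        return;                                      /* edge left untouched */
    int tc    = tc0 + 1;                             /* chroma uses tc0 + 1 */
    int delta = (((*q0 - *p0) << 2) + (p1 - q1) + 4) >> 3;
    if (delta >  tc) delta =  tc;                    /* CLIP3(-tc, tc, delta) */
    if (delta < -tc) delta = -tc;
    int np0 = *p0 + delta, nq0 = *q0 - delta;
    *p0 = (uint8_t)(np0 < 0 ? 0 : np0 > 255 ? 255 : np0);  /* packus above */
    *q0 = (uint8_t)(nq0 < 0 ? 0 : nq0 > 255 ? 255 : nq0);
}

The AVX2 routine evaluates this for eight U/V sample pairs per edge at once, with the U and V thresholds packed side by side in alpha_cbcr/beta_cbcr.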
/*****************************************************************************/
/* */
/* Function Name : ih264_deblk_chroma_horz_bslt4_avx2() */
/* */
/* Description : This function performs filtering of a chroma block */
/* horizontal edge when the boundary strength is less than */
/* 4 in high profile. */
/* */
/* Inputs : pu1_src - pointer to the src sample q0 of U */
/* src_strd - source stride */
/* alpha_cb - alpha value for the boundary in U */
/* beta_cb - beta value for the boundary in U */
/* alpha_cr - alpha value for the boundary in V */
/* beta_cr - beta value for the boundary in V */
/* u4_bs - packed Boundary strength array */
/* pu1_cliptab_cb - tc0_table for U */
/* pu1_cliptab_cr - tc0_table for V */
/* */
/* Globals : None */
/* */
/* Processing : This operation is described in Sec. 8.7.2.3 under the */
/* title "Filtering process for edges for bS less than 4" */
/* in ITU T Rec H.264 with alpha and beta values different */
/* in U and V. */
/* */
/* Outputs : None */
/* */
/* Returns : None */
/* */
/* Issues : None */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
/* 12 02 2015 Naveen Kumar P Initial version */
/* 12 10 2020 Priyanka Bose AVX2 Intel Intrinsics Support */
/*****************************************************************************/
void ih264_deblk_chroma_horz_bslt4_avx2 (UWORD8 *pu1_src,
WORD32 src_strd,
WORD32 alpha_cb,
WORD32 beta_cb,
WORD32 alpha_cr,
WORD32 beta_cr,
UWORD32 u4_bs,
const UWORD8 *pu1_cliptab_cb,
const UWORD8 *pu1_cliptab_cr)
{
UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
WORD16 i16_posP1, i16_posP0, i16_posQ1;
UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
UWORD8 *pu1_HorzPixelUV; /*! < Pointer to the first pixel of the boundary */
WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
__m256i p0q0_uv_32x8,p1q1_uv_32x8;
__m256i temp1,temp2,temp3,temp4;
__m256i flag_bs, flag1, flag2;
__m256i diff, diff1, alpha_cbcr_32x8, beta_cbcr_32x8, in_macro;
__m256i zero = _mm256_setzero_si256();
__m256i C0_uv_8x32;
__m256i p0q0_uv_8x32_1, p0q0_uv_8x32_2,res1,res2,p0_uv_8x32_1,q0_uv_8x32_1;
pu1_HorzPixelUV = pu1_src_uv - (src_strd << 1);
i16_posQ1 = src_strd;
i16_posP0 = src_strd;
i16_posP1 = 0;
u1_Bs0 = (u4_bs >> 24) & 0xff;
u1_Bs1 = (u4_bs >> 16) & 0xff;
u1_Bs2 = (u4_bs >> 8) & 0xff;
u1_Bs3 = (u4_bs >> 0) & 0xff;
flag_bs = _mm256_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3,
u1_Bs2, u1_Bs2, u1_Bs2, u1_Bs2,
u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3,
u1_Bs2, u1_Bs2,u1_Bs2, u1_Bs2,
u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0,
u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
flag_bs = _mm256_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s
flag_bs = _mm256_xor_si256(flag_bs, _mm256_set1_epi8(0xFF)); //Invert for required mask
p0q0_uv_32x8 = _mm256_loadu2_m128i((__m128i *)(pu1_src_uv), (__m128i *)(pu1_HorzPixelUV + i16_posP0));
p1q1_uv_32x8 = _mm256_loadu2_m128i((__m128i *)(pu1_src_uv + i16_posQ1), (__m128i *)(pu1_HorzPixelUV + i16_posP1));
res1 = _mm256_permute4x64_epi64(p0q0_uv_32x8,0xD8);
res2 = _mm256_permute4x64_epi64(p1q1_uv_32x8,0xD8);
temp3 = _mm256_unpacklo_epi8(res1, zero); //p0 l 0 h 0
temp4 = _mm256_unpackhi_epi8(res1, zero); //q0
temp1 = _mm256_unpacklo_epi8(res2, zero); //p1
temp2 = _mm256_unpackhi_epi8(res2, zero); //q1
diff = _mm256_subs_epi16(temp3, temp4); //Condn 1 //p0 l h - q0 l h
diff = _mm256_abs_epi16(diff);
alpha_cbcr_32x8 = _mm256_set1_epi32(alpha_cbcr);
flag1 = _mm256_cmpgt_epi16(alpha_cbcr_32x8, diff);
diff = _mm256_subs_epi16(temp2, temp4); //Condtn 2
diff = _mm256_abs_epi16(diff);
beta_cbcr_32x8 = _mm256_set1_epi32(beta_cbcr);
flag1 = _mm256_and_si256(flag1, _mm256_cmpgt_epi16(beta_cbcr_32x8, diff));
diff = _mm256_subs_epi16(temp1, temp3); //Condtn 3
diff = _mm256_abs_epi16(diff);
flag1 = _mm256_and_si256(flag1, _mm256_cmpgt_epi16(beta_cbcr_32x8, diff));
diff = _mm256_subs_epi16(temp4, temp3);
diff = _mm256_slli_epi16(diff, 2);
diff1 = _mm256_subs_epi16(temp1, temp2);
diff = _mm256_add_epi16(diff, diff1);
diff = _mm256_add_epi16(diff, _mm256_set1_epi16(4));
in_macro = _mm256_srai_epi16(diff, 3);
C0_uv_8x32 = _mm256_set_epi16(
pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0],
pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]);
C0_uv_8x32 = _mm256_add_epi16(C0_uv_8x32, _mm256_set1_epi16(1));
in_macro = _mm256_min_epi16(C0_uv_8x32, in_macro); //CLIP3
C0_uv_8x32 = _mm256_subs_epi16(zero, C0_uv_8x32);
in_macro = _mm256_max_epi16(C0_uv_8x32, in_macro);
p0_uv_8x32_1 = _mm256_add_epi16(temp3, in_macro);
q0_uv_8x32_1 = _mm256_sub_epi16(temp4, in_macro);
p0q0_uv_8x32_2 = _mm256_packus_epi16(p0_uv_8x32_1,q0_uv_8x32_1);
flag1 = _mm256_packs_epi16(flag1, flag1);
flag1 = _mm256_and_si256(flag1, flag_bs); //Final flag (BS condition + other 3 conditions)
p0q0_uv_8x32_1 = _mm256_and_si256(res1,
_mm256_xor_si256(flag1, _mm256_set1_epi8(0xFF)));
p0q0_uv_8x32_2 = _mm256_and_si256(p0q0_uv_8x32_2, flag1);
p0q0_uv_8x32_1 = _mm256_add_epi8(p0q0_uv_8x32_1, p0q0_uv_8x32_2);
p0q0_uv_8x32_1 = _mm256_permute4x64_epi64(p0q0_uv_8x32_1,0xD8);
_mm256_storeu2_m128i((__m128i *)(pu1_src_uv),(__m128i *)(pu1_HorzPixelUV + i16_posP0), p0q0_uv_8x32_1);
}

@@ -0,0 +1,275 @@
/******************************************************************************
*
* Copyright (C) 2015 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*****************************************************************************
* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/*****************************************************************************/
/* */
/* File Name : ih264_deblk_luma_avx2.c */
/* */
/* Description : Contains function definitions for deblocking */
/* */
/* List of Functions : ih264_deblk_luma_horz_bslt4_avx2() */
/* ih264_deblk_luma_vert_bslt4_avx2() */
/* */
/* Issues / Problems : None */
/* */
/* Revision History : */
/* */
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
/* 12 02 2015 Naveen Kumar P Added luma deblocking ssse3 */
/* intrinsics */
/* 15 09 2020 Priyanka Bose AVX2 Intel Intrinsics Support */
/*****************************************************************************/
/*****************************************************************************/
/* File Includes */
/*****************************************************************************/
/* System include files */
#include <stdio.h>
#ifdef __ANDROID__
#include "log/log.h"
#include <cutils/log.h>
#endif
/* User include files */
#include "ih264_typedefs.h"
#include "ih264_platform_macros.h"
#include "ih264_deblk_edge_filters.h"
#include "ih264_macros.h"
/*****************************************************************************/
/* */
/* Function Name : ih264_deblk_luma_horz_bslt4_avx2() */
/* */
/* Description : This function performs filtering of a luma block */
/* horizontal edge when boundary strength is less than 4. */
/* */
/* Inputs : pu1_src - pointer to the src sample q0 */
/* src_strd - source stride */
/* alpha - alpha value for the boundary */
/* beta - beta value for the boundary */
/* u4_bs - packed Boundary strength array */
/* pu1_cliptab - tc0_table */
/* */
/* Globals : None */
/* */
/* Processing : This operation is described in Sec. 8.7.2.3 under the */
/* title "Filtering process for edges for bS less than 4" */
/* in ITU T Rec H.264. */
/* */
/* Outputs : None */
/* */
/* Returns : None */
/* */
/* Issues : None */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
/* 12 02 2015 Naveen Kumar P Initial version */
/* 15 09 2020 Priyanka Bose AVX2 Intel Intrinsics Support */
/*****************************************************************************/
void ih264_deblk_luma_horz_bslt4_avx2(UWORD8 *pu1_src,
WORD32 src_strd,
WORD32 alpha,
WORD32 beta,
UWORD32 u4_bs,
const UWORD8 *pu1_cliptab)
{
WORD16 i16_posP2, i16_posP1, i16_posP0, i16_posQ1, i16_posQ2;
UWORD8 *pu1_HorzPixel;
__m256i zero = _mm256_setzero_si256();
__m128i zero_128 = _mm_setzero_si128();
__m128i Alpha_8x16,bs_flag_16x8b, C0_16x8, C0_8x16, C0_hi_8x16;
__m256i Beta_8x32,in_macro_32x8,in_macro_1,in_macro_2,flag1_32x8,flag2_32x8;
__m256i C_8x32,C0_8x32_res,temp1,temp2,temp3,temp4,res1,res2,q0p1_32x8,p0q1_32x8;
__m128i p0_16x8,q0_16x8,temp1_128,temp2_128,flag1_16x8_128;
__m256i const_val4_8x32,p0q0_32x8,p1q1_32x8,p2q2_32x8,q0p0_32x8;
UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
UWORD8 clip0, clip1, clip2, clip3;
pu1_HorzPixel = pu1_src - (src_strd << 2);
i16_posQ1 = src_strd;
i16_posQ2 = X2(src_strd);
i16_posP0 = X3(src_strd);
i16_posP1 = X2(src_strd);
i16_posP2 = src_strd;
p0q0_32x8 = _mm256_loadu2_m128i((__m128i *)(pu1_src), (__m128i *)(pu1_HorzPixel + i16_posP0)); //lower -p0 higher-q0
p1q1_32x8 = _mm256_loadu2_m128i((__m128i *)(pu1_src + i16_posQ1), (__m128i *)(pu1_HorzPixel + i16_posP1)); //l= p1, h=q1
p2q2_32x8 = _mm256_loadu2_m128i((__m128i *)(pu1_src + i16_posQ2), (__m128i *)(pu1_HorzPixel + i16_posP2));
u1_Bs0 = (u4_bs >> 24) & 0xff;
u1_Bs1 = (u4_bs >> 16) & 0xff;
u1_Bs2 = (u4_bs >> 8) & 0xff;
u1_Bs3 = (u4_bs >> 0) & 0xff;
clip0 = pu1_cliptab[u1_Bs0];
clip1 = pu1_cliptab[u1_Bs1];
clip2 = pu1_cliptab[u1_Bs2];
clip3 = pu1_cliptab[u1_Bs3];
Alpha_8x16 = _mm_set1_epi16(alpha);
Beta_8x32 = _mm256_set1_epi16(beta);
bs_flag_16x8b = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3,
u1_Bs2, u1_Bs2, u1_Bs2, u1_Bs2,
u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
C0_16x8 = _mm_set_epi8(clip3, clip3, clip3, clip3, clip2, clip2, clip2,
clip2, clip1, clip1, clip1, clip1, clip0, clip0,
clip0, clip0);
bs_flag_16x8b = _mm_cmpeq_epi8(bs_flag_16x8b, zero_128);
bs_flag_16x8b = _mm_xor_si128(bs_flag_16x8b, _mm_set1_epi8(0xFF)); //Invert for required mask
C0_8x16 = _mm_unpacklo_epi8(C0_16x8, zero_128);
C0_hi_8x16 = _mm_unpackhi_epi8(C0_16x8, zero_128);
C0_8x32_res = _mm256_set_m128i(C0_hi_8x16,C0_8x16);
//Cond1 (ABS(p0 - q0) < alpha)
p0_16x8 = _mm256_castsi256_si128(p0q0_32x8);
q0p0_32x8 = _mm256_permute2x128_si256(p0q0_32x8, p0q0_32x8, 0x1);
q0_16x8 = _mm256_castsi256_si128(p0q0_32x8);
temp1_128 = _mm_subs_epu8(q0_16x8, p0_16x8);
temp2_128 = _mm_subs_epu8(p0_16x8, q0_16x8);
temp1_128 = _mm_add_epi8(temp1_128, temp2_128);
temp2_128 = _mm_unpacklo_epi8(temp1_128, zero_128);
temp1_128 = _mm_unpackhi_epi8(temp1_128, zero_128);
temp2_128 = _mm_cmpgt_epi16(Alpha_8x16, temp2_128);
temp1_128 = _mm_cmpgt_epi16(Alpha_8x16, temp1_128);
flag1_16x8_128 = _mm_packs_epi16(temp2_128, temp1_128);
flag1_16x8_128 = _mm_and_si128(flag1_16x8_128, bs_flag_16x8b);
flag1_32x8 = _mm256_set_m128i(flag1_16x8_128,flag1_16x8_128);
//Cond2 (ABS(q1 - q0) < beta) & Cond3 (ABS(p1 - p0) < beta)
temp1 = _mm256_subs_epu8(p0q0_32x8, p1q1_32x8);
temp2 = _mm256_subs_epu8(p1q1_32x8, p0q0_32x8);
temp1 = _mm256_add_epi8(temp1, temp2);
temp2 = _mm256_unpacklo_epi8(temp1, zero);
temp1 = _mm256_unpackhi_epi8(temp1, zero);
temp2 = _mm256_cmpgt_epi16(Beta_8x32, temp2);
temp1 = _mm256_cmpgt_epi16(Beta_8x32, temp1);
flag2_32x8 = _mm256_packs_epi16(temp2, temp1);
//!((ABS(p0 - q0) < alpha) || (ABS(q1 - q0) < beta) || (ABS(p1 - p0) < beta))
flag1_32x8 = _mm256_and_si256(flag1_32x8, flag2_32x8);
//(ABS(p2 - p0) < beta) & (ABS(q2 - q0) < beta)
temp1 = _mm256_subs_epu8(p0q0_32x8, p2q2_32x8);
temp2 = _mm256_subs_epu8(p2q2_32x8, p0q0_32x8);
temp1 = _mm256_add_epi8(temp1, temp2);
temp2 = _mm256_unpacklo_epi8(temp1, zero);
temp1 = _mm256_unpackhi_epi8(temp1, zero);
temp2 = _mm256_cmpgt_epi16(Beta_8x32, temp2);
temp1 = _mm256_cmpgt_epi16(Beta_8x32, temp1);
flag2_32x8 = _mm256_packs_epi16(temp2, temp1);
flag2_32x8 = _mm256_and_si256(flag1_32x8, flag2_32x8);
temp2 = _mm256_subs_epi16(zero, temp2);
temp1 = _mm256_subs_epi16(zero, temp1);
temp3 = _mm256_permute2x128_si256(temp2,temp1,0x20); // low adding
temp4 = _mm256_permute2x128_si256(temp2,temp1,0x31); //high adding
temp2 = _mm256_add_epi16(temp3,temp4);
C_8x32 = _mm256_add_epi16(C0_8x32_res, temp2); //
const_val4_8x32 = _mm256_set1_epi16(4);
res1 = _mm256_permute4x64_epi64(q0p0_32x8, 0xD8);
res2 = _mm256_permute4x64_epi64(p1q1_32x8, 0xD8);
temp3 = _mm256_subs_epi16(_mm256_unpacklo_epi8(res1, zero),
_mm256_unpackhi_epi8(res1, zero));
temp4 = _mm256_subs_epi16(_mm256_unpacklo_epi8(res2, zero),
_mm256_unpackhi_epi8(res2, zero));
temp1 = _mm256_slli_epi16(temp3, 2);
temp1 = _mm256_add_epi16(temp1, temp4);
temp1 = _mm256_add_epi16(temp1, const_val4_8x32);
in_macro_32x8 = _mm256_srai_epi16(temp1, 3);
in_macro_32x8 = _mm256_min_epi16(C_8x32, in_macro_32x8); //CLIP3
C_8x32 = _mm256_subs_epi16(zero, C_8x32);
in_macro_32x8 = _mm256_max_epi16(C_8x32, in_macro_32x8); //CLIP3
temp3 = _mm256_unpacklo_epi8(res1, zero); //q0
temp4 = _mm256_unpackhi_epi8(res1, zero); //p0
temp1 = _mm256_add_epi16(temp4, in_macro_32x8);
temp2 = _mm256_sub_epi16(temp3, in_macro_32x8);
temp1 = _mm256_packus_epi16(temp2, temp1); // Suffle needed
temp1 = _mm256_and_si256(temp1, flag1_32x8); //q0 p0
temp2 = _mm256_and_si256(res1,
_mm256_xor_si256(flag1_32x8, _mm256_set1_epi16(0xFFFF)));
temp1 = _mm256_add_epi8(temp1, temp2);
temp1 = _mm256_permute4x64_epi64(temp1, 0xD8);
_mm256_storeu2_m128i((__m128i *)(pu1_HorzPixel + i16_posP0),(__m128i *)(pu1_src),temp1);
//if(Ap < Beta) if(Aq < Beta)
temp1 = _mm256_avg_epu16(_mm256_unpacklo_epi8(res1, zero),
_mm256_unpackhi_epi8(res1, zero));
temp2 = _mm256_slli_epi16(_mm256_unpacklo_epi8(p1q1_32x8, zero), 1);
temp3 = _mm256_subs_epi16(_mm256_unpacklo_epi8(p2q2_32x8, zero), temp2);
temp2 = _mm256_slli_epi16(_mm256_unpackhi_epi8(p1q1_32x8, zero), 1);
temp2 = _mm256_subs_epi16(_mm256_unpackhi_epi8(p2q2_32x8, zero), temp2);
temp4 = _mm256_permute2x128_si256(temp3, temp2, 0x20); //p0 q0
temp3 = _mm256_permute2x128_si256(temp3, temp2, 0x31);
temp4 = _mm256_add_epi16(temp1, temp4); //p
in_macro_1 = _mm256_srai_epi16(temp4, 1);
temp3 = _mm256_add_epi16(temp1, temp3); //q
in_macro_2 = _mm256_srai_epi16(temp3, 1);
in_macro_1 = _mm256_min_epi16(C0_8x32_res, in_macro_1); //CLIP3
C0_8x32_res = _mm256_subs_epi16(zero, C0_8x32_res);
in_macro_1 = _mm256_max_epi16(C0_8x32_res, in_macro_1); //CLIP3
in_macro_2 = _mm256_max_epi16(C0_8x32_res, in_macro_2); //CLIP3
C0_8x32_res = _mm256_subs_epi16(zero, C0_8x32_res);
in_macro_2 = _mm256_min_epi16(C0_8x32_res, in_macro_2); //CLIP3
temp1 = _mm256_unpacklo_epi8(res2, zero);
temp2 = _mm256_unpackhi_epi8(res2, zero);
temp1 = _mm256_add_epi16(temp1, in_macro_1);
temp2 = _mm256_add_epi16(temp2, in_macro_2);
temp1 = _mm256_packus_epi16(temp1, temp2); // pl ph ql qh
temp1 = _mm256_and_si256(temp1, flag2_32x8);
temp2 = _mm256_and_si256(res2,_mm256_xor_si256(flag2_32x8, _mm256_set1_epi16(0xFFFF)));
temp1 = _mm256_add_epi8(temp1, temp2);
temp1 = _mm256_permute4x64_epi64(temp1, 0xD8);
_mm256_storeu2_m128i((__m128i *)(pu1_src + i16_posQ1),(__m128i *)(pu1_HorzPixel + i16_posP1),temp1);
}
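As with the chroma routine earlier, a scalar sketch (not part of the patch) of the bS < 4 luma filter this function vectorizes. The differences from chroma are visible in the intrinsics: tc grows by one for each of Ap < beta and Aq < beta (the flags folded into C_8x32), and p1/q1 get their own clipped correction when their side passes the test:

#include <stdint.h>
#include <stdlib.h>

#define CLIP_255(x) ((x) < 0 ? 0 : (x) > 255 ? 255 : (x))

/* px[0..2] = p0 p1 p2 moving away from the edge, qx[0..2] = q0 q1 q2. */
static inline void luma_bslt4_sample(uint8_t px[3], uint8_t qx[3],
                                     int alpha, int beta, int tc0)
{
    int p2 = px[2], p1 = px[1], p0 = px[0];
    int q0 = qx[0], q1 = qx[1], q2 = qx[2];
    if (abs(p0 - q0) >= alpha || abs(q1 - q0) >= beta || abs(p1 - p0) >= beta)
        return;
    int ap = abs(p2 - p0), aq = abs(q2 - q0);
    int tc = tc0 + (ap < beta) + (aq < beta);        /* C_8x32 above */
    int d  = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3;
    d = d >  tc ?  tc : d < -tc ? -tc : d;
    px[0] = (uint8_t)CLIP_255(p0 + d);
    qx[0] = (uint8_t)CLIP_255(q0 - d);
    if (ap < beta) {                                 /* refine p1 */
        int d1 = (p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1;
        d1 = d1 > tc0 ? tc0 : d1 < -tc0 ? -tc0 : d1;
        px[1] = (uint8_t)(p1 + d1);
    }
    if (aq < beta) {                                 /* refine q1 */
        int d1 = (q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1;
        d1 = d1 > tc0 ? tc0 : d1 < -tc0 ? -tc0 : d1;
        qx[1] = (uint8_t)(q1 + d1);
    }
}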

@@ -0,0 +1,184 @@
/******************************************************************************
*
* Copyright (C) 2015 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*****************************************************************************
* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/**
*******************************************************************************
* @file
* ih264_ihadamard_scaling_avx2.c
*
* @brief
* Contains definition of functions for h264 inverse hadamard 4x4 transform and scaling
*
* @author
* Priyanka
*
* @par List of Functions:
* - ih264_ihadamard_scaling_4x4_avx2()
*
* @remarks
*
*******************************************************************************
*/
/*****************************************************************************/
/* File Includes */
/*****************************************************************************/
/* User include files */
#include "ih264_typedefs.h"
#include "ih264_defs.h"
#include "ih264_trans_macros.h"
#include "ih264_macros.h"
#include "ih264_trans_data.h"
#include "ih264_size_defs.h"
#include "ih264_structs.h"
#include "ih264_trans_quant_itrans_iquant.h"
#include <immintrin.h>
/*
********************************************************************************
*
* @brief This function performs a 4x4 inverse hadamard transform on the 4x4 DC coefficients
* of a 16x16 intra prediction macroblock, and then performs scaling.
* prediction buffer
*
* @par Description:
* The DC coefficients pass through a 2-stage inverse hadamard transform.
* This inverse transformed content is scaled to based on Qp value.
*
* @param[in] pi2_src
* input 4x4 block of DC coefficients
*
* @param[out] pi2_out
* output 4x4 block
*
* @param[in] pu2_iscal_mat
* pointer to scaling list
*
* @param[in] pu2_weigh_mat
* pointer to weight matrix
*
* @param[in] u4_qp_div_6
* Floor (qp/6)
*
* @param[in] pi4_tmp
* temporary buffer of size 1*16
*
* @returns none
*
* @remarks none
*
*******************************************************************************
*/
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#ifdef __ANDROID__
#include "log/log.h"
#include <cutils/log.h>
#endif
void ih264_ihadamard_scaling_4x4_avx2(WORD16* pi2_src,
WORD16* pi2_out,
const UWORD16 *pu2_iscal_mat,
const UWORD16 *pu2_weigh_mat,
UWORD32 u4_qp_div_6,
WORD32* pi4_tmp)
{
__m256i src,r0_r1,r2_r3,r3_r2,r1_r3,r0_r2;
__m256i src_r0_r1, src_r2_r3;
__m256i temp0, temp1,tmp0, tmp1, tmp2, tmp3;
__m256i add_rshift = _mm256_set1_epi32((u4_qp_div_6 < 6) ? (1 << (5 - u4_qp_div_6)) : 0);
__m256i mult_val = _mm256_set1_epi32(pu2_iscal_mat[0] * pu2_weigh_mat[0]);
__m256i zero = _mm256_setzero_si256();
__m128i t0 ,t1;
UNUSED (pi4_tmp);
src_r0_r1 = _mm256_loadu_si256((__m256i *) (pi2_src)); //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row
temp0 = _mm256_unpacklo_epi64(src_r0_r1, zero);
temp1 = _mm256_unpackhi_epi64(src_r0_r1, zero); // b0 b1 b2.. d0 d1...
temp0 = _mm256_unpacklo_epi16(temp0, temp1);
tmp0 = _mm256_permute2x128_si256(temp0,zero,0x20); //tmp0 tmp3
tmp1 = _mm256_permute2x128_si256(temp0,zero,0x31); //tmp1 tmp2
temp0 = _mm256_unpacklo_epi32(tmp0, tmp1); //a0 c0 a1 c1 a2 c2 a3 c3 a0 c0 a1 c1 b0 d0 b1 c1
temp1 = _mm256_unpackhi_epi32(tmp0, tmp1);
temp1 = _mm256_shuffle_epi32(temp1,0b01001110);
tmp0 = _mm256_add_epi16(temp0, temp1);
tmp1 = _mm256_sub_epi16(temp0, temp1);
temp0 = _mm256_unpacklo_epi64(tmp0, tmp1);
temp1 = _mm256_unpackhi_epi64(tmp0, tmp1);
tmp0 = _mm256_add_epi16(temp0, temp1);
tmp1 = _mm256_sub_epi16(temp0, temp1);
temp0 = _mm256_unpacklo_epi32(tmp0, tmp1); //a0 c0 a1 c1 a2 c2 a3 c3 a0 c0 a1 c1 b0 d0 b1 c1
temp1 = _mm256_unpackhi_epi32(tmp0, tmp1);
temp0 = _mm256_unpacklo_epi64(tmp0, tmp1); //a0 c0 a1 c1 a2 c2 a3 c3 a0 c0 a1 c1 b0 d0 b1 c1
temp1 = _mm256_unpackhi_epi64(tmp0, tmp1);
tmp0 = _mm256_unpacklo_epi16(temp0, temp1); //a0 c0 a1 c1 a2 c2 a3 c3 a0 c0 a1 c1 b0 d0 b1 c1
tmp1 = _mm256_unpackhi_epi16(temp0, temp1);
temp0 = _mm256_unpacklo_epi32(tmp0, tmp1); //a0 c0 a1 c1 a2 c2 a3 c3 a0 c0 a1 c1 b0 d0 b1 c1
temp1 = _mm256_unpackhi_epi32(tmp0, tmp1);
temp1 = _mm256_shuffle_epi32(temp1, _MM_SHUFFLE(1, 0, 3, 2));
tmp0 = _mm256_add_epi16(temp0, temp1);
tmp1 = _mm256_sub_epi16(temp0, temp1);
temp0 = _mm256_unpacklo_epi64(tmp0, tmp1);
temp1 = _mm256_unpackhi_epi64(tmp0, tmp1);
tmp0 = _mm256_add_epi16(temp0, temp1);
tmp1 = _mm256_sub_epi16(temp0, temp1);
temp0 = _mm256_unpacklo_epi64(tmp0, tmp1);
temp1 = _mm256_unpackhi_epi64(tmp0, tmp1);
r0_r1 =_mm256_cvtepi16_epi32(_mm256_castsi256_si128(temp0));
r2_r3 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(temp1));
src_r0_r1 = _mm256_mullo_epi32(r0_r1, mult_val);
src_r2_r3 = _mm256_mullo_epi32(r2_r3, mult_val);
//Scaling
if(u4_qp_div_6 >= 6)
{
src_r0_r1 = _mm256_slli_epi32(src_r0_r1, u4_qp_div_6 - 6);
src_r2_r3 = _mm256_slli_epi32(src_r2_r3, u4_qp_div_6 - 6);
}
else
{
temp0 = _mm256_add_epi32(src_r0_r1, add_rshift);
temp1 = _mm256_add_epi32(src_r2_r3, add_rshift);
src_r0_r1 = _mm256_srai_epi32(temp0, 6 - u4_qp_div_6);
src_r2_r3 = _mm256_srai_epi32(temp1, 6 - u4_qp_div_6);
}
src = _mm256_packs_epi32(src_r0_r1, src_r2_r3);
_mm256_storeu_si256((__m256i *) (&pi2_out[0]), src);
}
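The shuffle sequence above is hard to read on its own; what it computes is the standard two-pass inverse Hadamard (rows, then columns, all +/-1 butterflies) followed by the single DC scale factor pu2_iscal_mat[0] * pu2_weigh_mat[0] and the qp-dependent shift visible at the end of the function. A plain scalar sketch, not part of the patch (the vector version additionally saturates to 16 bits via _mm256_packs_epi32):

#include <stdint.h>

static void ihadamard_scaling_4x4_ref(const int16_t *src, int16_t *out,
                                      int32_t scale, uint32_t qp_div_6)
{
    int32_t t[16], u[16];
    for (int i = 0; i < 4; i++)          /* horizontal butterflies */
    {
        int32_t x0 = src[4 * i + 0] + src[4 * i + 2];
        int32_t x1 = src[4 * i + 0] - src[4 * i + 2];
        int32_t x2 = src[4 * i + 1] - src[4 * i + 3];
        int32_t x3 = src[4 * i + 1] + src[4 * i + 3];
        t[4 * i + 0] = x0 + x3;  t[4 * i + 1] = x1 + x2;
        t[4 * i + 2] = x1 - x2;  t[4 * i + 3] = x0 - x3;
    }
    for (int i = 0; i < 4; i++)          /* vertical butterflies */
    {
        int32_t x0 = t[i + 0] + t[i + 8];
        int32_t x1 = t[i + 0] - t[i + 8];
        int32_t x2 = t[i + 4] - t[i + 12];
        int32_t x3 = t[i + 4] + t[i + 12];
        u[i + 0]  = x0 + x3;  u[i + 4]  = x1 + x2;
        u[i + 8]  = x1 - x2;  u[i + 12] = x0 - x3;
    }
    for (int i = 0; i < 16; i++)         /* scaling, as in the tail above */
    {
        int32_t v = u[i] * scale;        /* scale = iscal_mat[0] * weigh_mat[0] */
        out[i] = (int16_t)((qp_div_6 >= 6)
                     ? (v << (qp_div_6 - 6))
                     : ((v + (1 << (5 - qp_div_6))) >> (6 - qp_div_6)));
    }
}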

@@ -0,0 +1,959 @@
/******************************************************************************
*
* Copyright (C) 2015 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*****************************************************************************
* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/*****************************************************************************/
/*****************************************************************************/
/*****************************************************************************/
/* File Includes */
/*****************************************************************************/
#ifdef __ANDROID__
#include "log/log.h"
#include <cutils/log.h>
#endif
#include <immintrin.h>
#include "ih264_typedefs.h"
#include "ih264_macros.h"
#include "ih264_platform_macros.h"
#include "ih264_inter_pred_filters.h"
/*****************************************************************************/
/* Constant Data variables */
/*****************************************************************************/
/* coefficients for 6 tap filtering*/
//const WORD32 ih264_g_six_tap[3] ={1,-5,20};
/*****************************************************************************/
/* Function definitions . */
/*****************************************************************************/
/*****************************************************************************/
/* */
/* Function Name : ih264_inter_pred_luma_copy_avx2 */
/* */
/* Description : This function copies the contents of ht x wd block from */
/* source to destination. (ht,wd) can be (4,4), (8,4), */
/* (4,8), (8,8), (16,8), (8,16) or (16,16). */
/* */
/* Inputs : puc_src - pointer to source */
/* puc_dst - pointer to destination */
/* src_strd - stride for source */
/* dst_strd - stride for destination */
/* ht - height of the block */
/* wd - width of the block */
/* */
/* Issues : None */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes */
/* 13 02 2015 Kaushik Initial Version */
/* Senthoor */
/* 15 09 2020 Priyanka Bose AVX2 Intel Intrinsics Support */
/*****************************************************************************/
void ih264_inter_pred_luma_copy_avx2(UWORD8 *pu1_src,
UWORD8 *pu1_dst,
WORD32 src_strd,
WORD32 dst_strd,
WORD32 ht,
WORD32 wd,
UWORD8* pu1_tmp,
WORD32 dydx)
{
WORD32 src_strd2, src_strd3, src_strd4, dst_strd2, dst_strd3, dst_strd4;
UNUSED(pu1_tmp);
UNUSED(dydx);
src_strd2 = src_strd << 1;
dst_strd2 = dst_strd << 1;
src_strd4 = src_strd << 2;
dst_strd4 = dst_strd << 2;
src_strd3 = src_strd2 + src_strd;
dst_strd3 = dst_strd2 + dst_strd;
if(wd == 4)
{
do
{
*((WORD32 *)(pu1_dst)) = *((WORD32 *)(pu1_src));
*((WORD32 *)(pu1_dst + dst_strd)) = *((WORD32 *)(pu1_src + src_strd));
*((WORD32 *)(pu1_dst + dst_strd2)) = *((WORD32 *)(pu1_src + src_strd2));
*((WORD32 *)(pu1_dst + dst_strd3)) = *((WORD32 *)(pu1_src + src_strd3));
ht -= 4;
pu1_src += src_strd4;
pu1_dst += dst_strd4;
}
while(ht > 0);
}
else if(wd == 8)
{
__m128i y_0_16x8b, y_1_16x8b, y_2_16x8b, y_3_16x8b;
do
{
y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
y_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd2));
y_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd3));
_mm_storel_epi64((__m128i *)pu1_dst, y_0_16x8b);
_mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);
_mm_storel_epi64((__m128i *)(pu1_dst + dst_strd2), y_2_16x8b);
_mm_storel_epi64((__m128i *)(pu1_dst + dst_strd3), y_3_16x8b);
ht -= 4;
pu1_src += src_strd4;
pu1_dst += dst_strd4;
}
while(ht > 0);
}
else // wd == 16
{
__m256i y_0_16x8b, y_1_16x8b, y_2_16x8b, y_3_16x8b;
WORD32 src_strd5, src_strd6, src_strd7, src_strd8;
WORD32 dst_strd5, dst_strd6, dst_strd7, dst_strd8;
__m256i y_4_16x8b, y_5_16x8b, y_6_16x8b, y_7_16x8b,y_0_1,y_2_3,y_4_5,y_6_7;
src_strd5 = src_strd2 + src_strd3;
dst_strd5 = dst_strd2 + dst_strd3;
src_strd6 = src_strd3 << 1;
dst_strd6 = dst_strd3 << 1;
src_strd7 = src_strd3 + src_strd4;
dst_strd7 = dst_strd3 + dst_strd4;
src_strd8 = src_strd << 3;
dst_strd8 = dst_strd << 3;
do
{
y_0_1 = _mm256_loadu2_m128i((__m128i *)(pu1_src + src_strd),(__m128i *)pu1_src);
y_2_3 = _mm256_loadu2_m128i((__m128i *)(pu1_src + src_strd3),(__m128i *)(pu1_src + src_strd2));
y_4_5 = _mm256_loadu2_m128i((__m128i *)(pu1_src + src_strd5),(__m128i *)(pu1_src + src_strd4));
y_6_7 = _mm256_loadu2_m128i((__m128i *)(pu1_src + src_strd7),(__m128i *)(pu1_src + src_strd6));
_mm256_storeu2_m128i((__m128i *)(pu1_dst + dst_strd),(__m128i *)pu1_dst,y_0_1);
_mm256_storeu2_m128i((__m128i *)(pu1_dst + dst_strd3),(__m128i *)(pu1_dst + dst_strd2),y_2_3);
_mm256_storeu2_m128i((__m128i *)(pu1_dst + dst_strd5),(__m128i *)(pu1_dst + dst_strd4),y_4_5);
_mm256_storeu2_m128i((__m128i *)(pu1_dst + dst_strd7),(__m128i *)(pu1_dst + dst_strd6),y_6_7);
ht -= 8;
pu1_src += src_strd8;
pu1_dst += dst_strd8;
}
while(ht > 0);
}
}
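The width-16 branch above shows the pattern this whole file leans on: two independent 16-byte rows are packed into one 256-bit register with _mm256_loadu2_m128i and written back with _mm256_storeu2_m128i, so each load/store pair moves a pair of rows. A stand-alone sketch of just that idiom, not part of the patch (assumes ht is a multiple of 2; the real routine unrolls by 8 rows):

#include <immintrin.h>
#include <stdint.h>

static void copy_16xN_two_rows_at_a_time(const uint8_t *src, uint8_t *dst,
                                         int32_t src_strd, int32_t dst_strd,
                                         int32_t ht)
{
    for (int32_t y = 0; y < ht; y += 2)
    {
        /* low 128 bits = row y, high 128 bits = row y + 1 */
        __m256i rows = _mm256_loadu2_m128i((const __m128i *)(src + src_strd),
                                           (const __m128i *)src);
        _mm256_storeu2_m128i((__m128i *)(dst + dst_strd), (__m128i *)dst, rows);
        src += 2 * src_strd;
        dst += 2 * dst_strd;
    }
}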
/*****************************************************************************/
/* */
/* Function Name : ih264_inter_pred_chroma_avx2 */
/* */
/* Description : This function implements a four-tap 2D filter as */
/* mentioned in sec. 8.4.2.2.2 titled "Chroma sample */
/* "interpolation process". (ht,wd) can be (2,2), (4,2), */
/* (2,4), (4,4), (8,4), (4,8) or (8,8). */
/* */
/* Inputs : puc_src - pointer to source */
/* puc_dst - pointer to destination */
/* src_strd - stride for source */
/* dst_strd - stride for destination */
/* dx - x position of destination value */
/* dy - y position of destination value */
/* ht - height of the block */
/* wd - width of the block */
/* */
/* Issues : None */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes */
/* 13 02 2015 Kaushik Initial Version */
/* Senthoor */
/* 15 09 2020 Priyanka Bose AVX2 Intel Intrinsics Support */
/*****************************************************************************/
void ih264_inter_pred_chroma_avx2(UWORD8 *pu1_src,
UWORD8 *pu1_dst,
WORD32 src_strd,
WORD32 dst_strd,
WORD32 dx,
WORD32 dy,
WORD32 ht,
WORD32 wd)
{
WORD32 i, j, A, B, C, D;
i = 8 - dx;
j = 8 - dy;
A = i * j;
B = dx * j;
C = i * dy;
D = dx * dy;
if(wd == 2)
{
WORD32 tmp1, tmp2, tmp3, tmp4;
do
{
//U
tmp1 = A * pu1_src[0] + B * pu1_src[2] + C * pu1_src[src_strd] + D * pu1_src[src_strd + 2];
tmp2 = A * pu1_src[2] + B * pu1_src[4] + C * pu1_src[src_strd + 2] + D * pu1_src[src_strd + 4];
//V
tmp3 = A * pu1_src[1] + B * pu1_src[3] + C * pu1_src[src_strd + 1] + D * pu1_src[src_strd + 3];
tmp4 = A * pu1_src[3] + B * pu1_src[5] + C * pu1_src[src_strd + 3] + D * pu1_src[src_strd + 5];
tmp1 = (tmp1 + 32) >> 6;
tmp2 = (tmp2 + 32) >> 6;
tmp3 = (tmp3 + 32) >> 6;
tmp4 = (tmp4 + 32) >> 6;
pu1_dst[0] = CLIP_U8(tmp1);
pu1_dst[2] = CLIP_U8(tmp2);
pu1_dst[1] = CLIP_U8(tmp3);
pu1_dst[3] = CLIP_U8(tmp4);
pu1_src += src_strd;
pu1_dst += dst_strd;
tmp1 = A * pu1_src[0] + B * pu1_src[2] + C * pu1_src[src_strd] + D * pu1_src[src_strd + 2];
tmp2 = A * pu1_src[2] + B * pu1_src[4] + C * pu1_src[src_strd + 2] + D * pu1_src[src_strd + 4];
tmp3 = A * pu1_src[1] + B * pu1_src[3] + C * pu1_src[src_strd + 1] + D * pu1_src[src_strd + 3];
tmp4 = A * pu1_src[3] + B * pu1_src[5] + C * pu1_src[src_strd + 3] + D * pu1_src[src_strd + 5];
tmp1 = (tmp1 + 32) >> 6;
tmp2 = (tmp2 + 32) >> 6;
tmp3 = (tmp3 + 32) >> 6;
tmp4 = (tmp4 + 32) >> 6;
pu1_dst[0] = CLIP_U8(tmp1);
pu1_dst[2] = CLIP_U8(tmp2);
pu1_dst[1] = CLIP_U8(tmp3);
pu1_dst[3] = CLIP_U8(tmp4);
ht -= 2;
pu1_src += src_strd;
pu1_dst += dst_strd;
}
while(ht > 0);
}
else if(wd == 4)
{
WORD32 AB, CD;
__m256i coeffAB_32x8b, coeffCD_32x8b, round_add32_8x32b;
__m256i const_shuff_32x8b;
__m256i src_r23_32x8b,src_r12_32x8b,res12_AB_8x32b,res12_CD_8x32b,res1_8x32b;
__m128i res1,src_r1_16x8b_128;
__m128i const_shuff_16x8b_128;
AB = (B << 8) + A;
CD = (D << 8) + C;
coeffAB_32x8b = _mm256_set1_epi16(AB);
coeffCD_32x8b = _mm256_set1_epi16(CD);
round_add32_8x32b = _mm256_set1_epi16(32);
const_shuff_16x8b_128 = _mm_setr_epi32(0x03010200, 0x05030402, 0x07050604, 0x09070806);
const_shuff_32x8b = _mm256_setr_epi32(0x03010200, 0x05030402, 0x07050604, 0x09070806,0x03010200, 0x05030402, 0x07050604, 0x09070806);
src_r1_16x8b_128 = _mm_loadu_si128((__m128i *)pu1_src);
src_r1_16x8b_128 = _mm_shuffle_epi8(src_r1_16x8b_128, const_shuff_16x8b_128);
pu1_src += src_strd;
do
{
src_r23_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src + src_strd),(__m128i *)pu1_src);
src_r23_32x8b = _mm256_shuffle_epi8(src_r23_32x8b, const_shuff_32x8b);
src_r12_32x8b = _mm256_set_m128i(_mm256_castsi256_si128(src_r23_32x8b),src_r1_16x8b_128);
res12_AB_8x32b = _mm256_maddubs_epi16(src_r12_32x8b, coeffAB_32x8b);
res12_CD_8x32b = _mm256_maddubs_epi16(src_r23_32x8b, coeffCD_32x8b);
res1_8x32b = _mm256_add_epi16(res12_AB_8x32b, res12_CD_8x32b);
res1_8x32b = _mm256_add_epi16(res1_8x32b, round_add32_8x32b);
res1_8x32b = _mm256_srai_epi16(res1_8x32b, 6);
res1_8x32b = _mm256_packus_epi16(res1_8x32b,res1_8x32b);
res1_8x32b = _mm256_permute4x64_epi64(res1_8x32b, 0xD8);
res1 = _mm256_castsi256_si128(res1_8x32b);
_mm_storel_epi64((__m128i *)pu1_dst, res1);
res1 = _mm_srli_si128(res1, 8);
_mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res1);
src_r1_16x8b_128 = _mm256_castsi256_si128(_mm256_permute2x128_si256(src_r23_32x8b,src_r23_32x8b,0x1));;
ht -= 2;
pu1_src += src_strd << 1;
pu1_dst += dst_strd << 1;
}
while(ht > 0);
}
else // wd == 8
{
WORD32 AB, CD;
__m256i src_r1lh_32x8b, src_r2lh_32x8b;
__m256i res_lh_AB_8x32b, res_lh_CD_8x32b,res_1lh_8x32b,res_2lh_8x32b;
__m256i res_lh_8x32b, res_l_8x32b,res_h_8x32b, res_32x8b;
__m256i coeffAB_32x8b, coeffCD_32x8b, round_add32_8x32b;
__m256i const_shuff_32x8b;
__m256i zero = _mm256_setzero_si256();
__m128i res_1,res_2;
AB = (B << 8) + A;
CD = (D << 8) + C;
coeffAB_32x8b = _mm256_set1_epi16(AB);
coeffCD_32x8b = _mm256_set1_epi16(CD);
round_add32_8x32b = _mm256_set1_epi16(32);
const_shuff_32x8b = _mm256_setr_epi32(0x03010200, 0x05030402, 0x07050604, 0x09070806,0x03010200, 0x05030402, 0x07050604, 0x09070806);
src_r1lh_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src + 8),(__m128i *)pu1_src);
src_r1lh_32x8b = _mm256_shuffle_epi8(src_r1lh_32x8b, const_shuff_32x8b);
pu1_src += src_strd;
do
{
//row 1
src_r2lh_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src + 8),(__m128i *)pu1_src);
src_r2lh_32x8b = _mm256_shuffle_epi8(src_r2lh_32x8b, const_shuff_32x8b);
res_lh_AB_8x32b = _mm256_maddubs_epi16(src_r1lh_32x8b, coeffAB_32x8b);
res_lh_CD_8x32b = _mm256_maddubs_epi16(src_r2lh_32x8b, coeffCD_32x8b);
res_lh_8x32b = _mm256_add_epi16(res_lh_AB_8x32b, round_add32_8x32b);
res_lh_8x32b = _mm256_add_epi16(res_lh_8x32b, res_lh_CD_8x32b);
res_1lh_8x32b = _mm256_srai_epi16(res_lh_8x32b, 6);
pu1_src += src_strd;
//row 2
src_r1lh_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src + 8),(__m128i *)pu1_src);
src_r1lh_32x8b = _mm256_shuffle_epi8(src_r1lh_32x8b, const_shuff_32x8b);
res_lh_AB_8x32b = _mm256_maddubs_epi16(src_r2lh_32x8b, coeffAB_32x8b);
res_lh_CD_8x32b = _mm256_maddubs_epi16(src_r1lh_32x8b, coeffCD_32x8b);
res_lh_8x32b = _mm256_add_epi16(res_lh_AB_8x32b, round_add32_8x32b);
res_lh_8x32b = _mm256_add_epi16(res_lh_8x32b, res_lh_CD_8x32b);
res_2lh_8x32b = _mm256_srai_epi16(res_lh_8x32b, 6);
res_1lh_8x32b = _mm256_packus_epi16(res_1lh_8x32b, res_2lh_8x32b);
res_1lh_8x32b = _mm256_permute4x64_epi64(res_1lh_8x32b, 0xD8);
_mm256_storeu2_m128i((__m128i *)(pu1_dst + dst_strd),(__m128i *)(pu1_dst),res_1lh_8x32b);
pu1_src += src_strd;
pu1_dst += dst_strd;
pu1_dst += dst_strd;
//row 3
src_r2lh_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src + 8),(__m128i *)pu1_src);
src_r2lh_32x8b = _mm256_shuffle_epi8(src_r2lh_32x8b, const_shuff_32x8b);
res_lh_AB_8x32b = _mm256_maddubs_epi16(src_r1lh_32x8b, coeffAB_32x8b);
res_lh_CD_8x32b = _mm256_maddubs_epi16(src_r2lh_32x8b, coeffCD_32x8b);
res_lh_8x32b = _mm256_add_epi16(res_lh_AB_8x32b, round_add32_8x32b);
res_lh_8x32b = _mm256_add_epi16(res_lh_8x32b, res_lh_CD_8x32b);
res_1lh_8x32b = _mm256_srai_epi16(res_lh_8x32b, 6);
pu1_src += src_strd;
//row 1
src_r1lh_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src + 8),(__m128i *)pu1_src);
src_r1lh_32x8b = _mm256_shuffle_epi8(src_r1lh_32x8b, const_shuff_32x8b);
res_lh_AB_8x32b = _mm256_maddubs_epi16(src_r2lh_32x8b, coeffAB_32x8b);
res_lh_CD_8x32b = _mm256_maddubs_epi16(src_r1lh_32x8b, coeffCD_32x8b);
res_lh_8x32b = _mm256_add_epi16(res_lh_AB_8x32b, round_add32_8x32b);
res_lh_8x32b = _mm256_add_epi16(res_lh_8x32b, res_lh_CD_8x32b);
res_2lh_8x32b = _mm256_srai_epi16(res_lh_8x32b, 6);
res_1lh_8x32b = _mm256_packus_epi16(res_1lh_8x32b, res_2lh_8x32b);
res_1lh_8x32b = _mm256_permute4x64_epi64(res_1lh_8x32b, 0xD8);
_mm256_storeu2_m128i((__m128i *)(pu1_dst + dst_strd),(__m128i *)(pu1_dst),res_1lh_8x32b);
ht -= 4;
pu1_src += src_strd;
pu1_dst += dst_strd;
pu1_dst += dst_strd;
}
while(ht > 0);
}
}
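To make the weighting above concrete: for dx = 3, dy = 2 the constants are A = (8-3)*(8-2) = 30, B = 3*6 = 18, C = 5*2 = 10 and D = 3*2 = 6. They always sum to 64, so (A*a + B*b + C*c + D*d + 32) >> 6 is a unity-gain bilinear blend of the four neighbouring chroma samples, with the +32 giving round-to-nearest. Packing AB = (B << 8) + A and CD = (D << 8) + C lets _mm256_maddubs_epi16 evaluate two taps per multiply once the shuffle masks have interleaved the source bytes.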
/*****************************************************************************/
/* */
/* Function Name : ih264_inter_pred_luma_horz_qpel_vert_qpel_avx2 */
/* */
/* Description : This function implements a six-tap filter vertically and */
/* horizontally on ht x wd block separately and averages */
/* the two sets of values to calculate values at (1/4,1/4), */
/* (1/4, 3/4), (3/4, 1/4) or (3/4, 3/4) as mentioned in */
/* sec. 8.4.2.2.1 titled "Luma sample interpolation */
/* process". (ht,wd) can be (4,4), (8,4), (4,8), (8,8), */
/* (16,8), (8,16) or (16,16). */
/* */
/* Inputs : puc_src - pointer to source */
/* puc_dst - pointer to destination */
/* src_strd - stride for source */
/* dst_strd - stride for destination */
/* ht - height of the block */
/* wd - width of the block */
/* pu1_tmp - pointer to temporary buffer */
/* dydx - x and y reference offset for q-pel */
/* calculations */
/* */
/* Issues : None */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes */
/* 13 02 2015 Kaushik Initial Version */
/* Senthoor */
/* 15 09 2020 Priyanka Bose AVX2 Intel Intrinsics Support */
/*****************************************************************************/
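Per output sample, the routine below computes one six-tap half-pel value vertically (stored to pu1_tmp), one six-tap half-pel value horizontally, and then the rounded average of the two for the quarter-pel position; the averaging is the _mm_avg_epu8 near the end of each width branch. A scalar sketch of those two building blocks, not part of the patch:

#include <stdint.h>

/* Six-tap H.264 half-pel filter, coefficients (1, -5, 20, 20, -5, 1);
 * x points at the full-pel sample, with 2 samples of context before it
 * and 3 after it available. */
static inline uint8_t sixtap_hpel(const uint8_t *x)
{
    int v = x[-2] - 5 * x[-1] + 20 * x[0] + 20 * x[1] - 5 * x[2] + x[3];
    v = (v + 16) >> 5;                               /* round, then normalize */
    return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v); /* packus above */
}

/* Quarter-pel sample: rounded average of the horizontal and vertical
 * half-pel results (what _mm_avg_epu8 computes lane-wise). */
static inline uint8_t qpel_sample(uint8_t horz_hpel, uint8_t vert_hpel)
{
    return (uint8_t)((horz_hpel + vert_hpel + 1) >> 1);
}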
void ih264_inter_pred_luma_horz_qpel_vert_qpel_avx2(UWORD8 *pu1_src,
UWORD8 *pu1_dst,
WORD32 src_strd,
WORD32 dst_strd,
WORD32 ht,
WORD32 wd,
UWORD8* pu1_tmp,
WORD32 dydx)
{
WORD32 ht_temp;
UWORD8 *pu1_pred_vert,*pu1_pred_horiz;
UWORD8 *pu1_tmp1, *pu1_tmp2;
WORD32 x_offset, y_offset;
__m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
__m128i const_val16_8x16b;
__m256i coeff0_1_32x8b, coeff2_3_32x8b, coeff4_5_32x8b,coeff_32x8b;
__m256i const_val16_8x32b;
__m256i zero = _mm256_setzero_si256();
pu1_tmp1 = pu1_tmp;
dydx &= 0xf;
ht_temp = ht;
x_offset = dydx & 0x3;
y_offset = dydx >> 2;
pu1_tmp2 = pu1_tmp1;
pu1_pred_vert = pu1_src + (x_offset >> 1) - 2*src_strd;
pu1_pred_horiz = pu1_src + (y_offset >> 1) * src_strd - 2;
//the filter input starts from x[-2] (till x[3])
coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5
//c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
const_val16_8x16b = _mm_set1_epi16(16);
coeff_32x8b = _mm256_set_m128i(coeff2_3_16x8b,coeff0_1_16x8b);
coeff0_1_32x8b = _mm256_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
coeff2_3_32x8b = _mm256_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
coeff4_5_32x8b = _mm256_set1_epi32(0x01FB01FB); //c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5
//c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
const_val16_8x32b = _mm256_set1_epi16(16);
if(wd == 4)
{
//vertical q-pel filter
{
__m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b, src_r4_16x8b;
__m128i src_r5_16x8b, src_r6_16x8b;
__m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b;
__m128i res_r0r1_16x8b, res_t1_8x16b, res_t2_8x16b, res_t3_8x16b;
//epilogue: Load all the pred rows except sixth and seventh row for the
//first and second row processing.
src_r0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
pu1_pred_vert = pu1_pred_vert + src_strd;
src_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
pu1_pred_vert = pu1_pred_vert + src_strd;
src_r0_16x8b = _mm_unpacklo_epi32(src_r0_16x8b, src_r1_16x8b);
src_r2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
pu1_pred_vert = pu1_pred_vert + src_strd;
src_r1_16x8b = _mm_unpacklo_epi32(src_r1_16x8b, src_r2_16x8b);
src_r3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
pu1_pred_vert = pu1_pred_vert + src_strd;
src_r2_16x8b = _mm_unpacklo_epi32(src_r2_16x8b, src_r3_16x8b);
src_r4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
pu1_pred_vert = pu1_pred_vert + src_strd;
src_r3_16x8b = _mm_unpacklo_epi32(src_r3_16x8b, src_r4_16x8b);
//Core Loop: Process all the rows.
do
{
src_r5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
src_r4_16x8b = _mm_unpacklo_epi32(src_r4_16x8b, src_r5_16x8b);
src_r6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert + src_strd));
src_r5_16x8b = _mm_unpacklo_epi32(src_r5_16x8b, src_r6_16x8b);
src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);
res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);
res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
res_r0r1_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);
_mm_storel_epi64((__m128i *)pu1_tmp1, res_r0r1_16x8b);
src_r0_16x8b = src_r2_16x8b;
src_r1_16x8b = src_r3_16x8b;
src_r2_16x8b = src_r4_16x8b;
src_r3_16x8b = src_r5_16x8b;
src_r4_16x8b = src_r6_16x8b;
ht_temp -= 2;
pu1_pred_vert += src_strd << 1;
pu1_tmp1 += 8;
}
while(ht_temp > 0);
}
//horizontal q-pel filter
{
__m128i res_r0r1_16x8b_128, src_r0r1_vpel_16x8b,res_16x8b;
__m256i src_r0r1_sht_32x8b, src_r0r1_32x8b,src_r0r1_t1_32x8b;
__m256i res_r0r1_t1_8x32b, res_r0r1_t2_8x32b, res_r0r1_t3_8x32b;
__m256i res_r0r1_32x8b;
//Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
//Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
do
{
src_r0r1_vpel_16x8b = _mm_loadl_epi64((__m128i *)pu1_tmp2);
src_r0r1_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_pred_horiz + src_strd), (__m128i *)pu1_pred_horiz);
src_r0r1_sht_32x8b = _mm256_srli_si256(src_r0r1_32x8b, 1);
src_r0r1_32x8b = _mm256_unpacklo_epi8(src_r0r1_32x8b, src_r0r1_sht_32x8b);
src_r0r1_t1_32x8b = _mm256_unpacklo_epi64(src_r0r1_32x8b, zero); //a0 a1 a1 a2 a2 a3 a3 a4 b0 b1 b1 b2 b2 b3 b3 b4
res_r0r1_t1_8x32b = _mm256_maddubs_epi16(src_r0r1_t1_32x8b, coeff0_1_32x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
//b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
src_r0r1_32x8b = _mm256_srli_si256(src_r0r1_32x8b, 4); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0
//b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 0 0 0 0
src_r0r1_t1_32x8b = _mm256_unpacklo_epi64(src_r0r1_32x8b, zero); //a2 a3 a3 a4 a4 a5 a5 a6 b2 b3 b3 b4 b4 b5 b5 b6
res_r0r1_t2_8x32b = _mm256_maddubs_epi16(src_r0r1_t1_32x8b, coeff2_3_32x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
//b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3
src_r0r1_32x8b = _mm256_srli_si256(src_r0r1_32x8b, 4); //a4 a5 a5 a6 a6 a7 a7 a8 0 0 0 0 0 0 0 0
//b4 b5 b5 b6 b6 b7 b7 b8 0 0 0 0 0 0 0 0
src_r0r1_t1_32x8b = _mm256_unpacklo_epi64(src_r0r1_32x8b, zero); //a4 a5 a5 a6 a6 a7 a7 a8 b4 b5 b5 b6 b6 b7 b7 b8
res_r0r1_t3_8x32b = _mm256_maddubs_epi16(src_r0r1_t1_32x8b, coeff4_5_32x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
//b4*c4+b5*c5 b5*c4+b6*c5 b4*c6+b7*c5 b7*c4+b8*c5
res_r0r1_t1_8x32b = _mm256_add_epi16(res_r0r1_t1_8x32b, res_r0r1_t2_8x32b);
res_r0r1_t3_8x32b = _mm256_add_epi16(const_val16_8x32b, res_r0r1_t3_8x32b);
res_r0r1_t1_8x32b = _mm256_add_epi16(res_r0r1_t1_8x32b, res_r0r1_t3_8x32b); //a0*c0+a1*c1+a2*c2+a3*c3+a4*c4+a5*c5 + 16;
//a1*c0+a2*c1+a3*c2+a4*c3+a5*c4+a6*c5 + 16;
//a2*c0+a3*c1+a4*c2+a5*c3+a6*c4+a7*c5 + 16;
//a3*c0+a4*c1+a5*c2+a6*c3+a7*c4+a8*c5 + 16;
//b0*c0+b1*c1+b2*c2+b3*c3+b4*c4+b5*c5 + 16;
//b1*c0+b2*c1+b3*c2+b4*c3+b5*c4+b6*c5 + 16;
//b2*c0+b3*c1+b4*c2+b5*c3+b6*c4+b7*c5 + 16;
//b3*c0+b4*c1+b5*c2+b6*c3+b7*c4+b8*c5 + 16;
res_r0r1_t1_8x32b = _mm256_srai_epi16(res_r0r1_t1_8x32b, 5); //shifting right by 5 bits.
res_r0r1_16x8b_128 = _mm256_castsi256_si128(_mm256_packus_epi16(res_r0r1_t1_8x32b,
res_r0r1_t1_8x32b));
res_16x8b = _mm_avg_epu8(res_r0r1_16x8b_128,src_r0r1_vpel_16x8b);
*((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(res_16x8b);
res_16x8b = _mm_srli_si128(res_16x8b, 4);
*((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(res_16x8b);
ht -= 2;
pu1_pred_horiz += src_strd << 1;
pu1_tmp2 += 8;
pu1_dst += dst_strd << 1;
}
while(ht > 0);
}
}
else if(wd == 8)
{
//vertical q-pel filter
{
__m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b;
__m128i src_r4_16x8b, src_r5_16x8b, src_r6_16x8b;
__m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b;
__m128i res_16x8b, res_t1_8x16b, res_t2_8x16b, res_t3_8x16b;
//epilogue: Load all the pred rows except sixth and seventh row for the
//first and second row processing.
src_r0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
pu1_pred_vert = pu1_pred_vert + src_strd;
src_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
pu1_pred_vert = pu1_pred_vert + src_strd;
src_r0_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);
src_r2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
pu1_pred_vert = pu1_pred_vert + src_strd;
src_r1_16x8b = _mm_unpacklo_epi64(src_r1_16x8b, src_r2_16x8b);
src_r3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
pu1_pred_vert = pu1_pred_vert + src_strd;
src_r2_16x8b = _mm_unpacklo_epi64(src_r2_16x8b, src_r3_16x8b);
src_r4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
pu1_pred_vert = pu1_pred_vert + src_strd;
src_r3_16x8b = _mm_unpacklo_epi64(src_r3_16x8b, src_r4_16x8b);
//Core Loop: Process all the rows.
do
{
src_r5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
src_r4_16x8b = _mm_unpacklo_epi64(src_r4_16x8b, src_r5_16x8b);
src_r6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert + src_strd));
src_r5_16x8b = _mm_unpacklo_epi64(src_r5_16x8b, src_r6_16x8b);
src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);
res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);
res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);
_mm_storel_epi64((__m128i *)(pu1_tmp1), res_16x8b);
src_r0r1_16x8b = _mm_unpackhi_epi8(src_r0_16x8b, src_r1_16x8b);
src_r2r3_16x8b = _mm_unpackhi_epi8(src_r2_16x8b, src_r3_16x8b);
src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b);
res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);
res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);
_mm_storel_epi64((__m128i *)(pu1_tmp1 + 8), res_16x8b);
src_r0_16x8b = src_r2_16x8b;
src_r1_16x8b = src_r3_16x8b;
src_r2_16x8b = src_r4_16x8b;
src_r3_16x8b = src_r5_16x8b;
src_r4_16x8b = src_r6_16x8b;
ht_temp -= 2;
pu1_pred_vert += src_strd << 1;
pu1_tmp1 += 16;
}
while(ht_temp > 0);
}
//horizontal q-pel filter
{
__m256i src_r0r1_32x8b, src_r0r1_sht_32x8b,src_r0r1_t1_32x8b,res_32x8b;
__m128i src_r0r1_vpel_128,res_16x8b_128;
__m256i src_r0r1_vpel_32x8b,res_r0r1_t1_8x32b, res_r0r1_t2_8x32b, res_r0r1_t3_8x32b;
//Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
//Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
do
{
src_r0r1_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_pred_horiz + src_strd), //a2 a3 a4 a5 a6 a7 a8....a15 0 or
(__m128i *)pu1_pred_horiz); //a3 a4 a5 a6 a7 a8 a9....a15 0
//b2 b3 b4 b5 b6 b7 b8....b15 0 or
//b3 b4 b5 b6 b7 b8 b9....b15 0
src_r0r1_vpel_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_tmp2 + 8),
(__m128i *)(pu1_tmp2));
src_r0r1_vpel_32x8b = _mm256_unpacklo_epi64(src_r0r1_vpel_32x8b,zero);
src_r0r1_sht_32x8b = _mm256_srli_si256(src_r0r1_32x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
//b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0
src_r0r1_t1_32x8b = _mm256_unpacklo_epi8(src_r0r1_32x8b, src_r0r1_sht_32x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
res_r0r1_t1_8x32b = _mm256_maddubs_epi16(src_r0r1_t1_32x8b, coeff0_1_32x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
//a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
//b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1
src_r0r1_32x8b = _mm256_srli_si256(src_r0r1_32x8b, 2); //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
//b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0
src_r0r1_sht_32x8b = _mm256_srli_si256(src_r0r1_sht_32x8b, 2); //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0
src_r0r1_t1_32x8b = _mm256_unpacklo_epi8(src_r0r1_32x8b, src_r0r1_sht_32x8b); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
//b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10
res_r0r1_t2_8x32b = _mm256_maddubs_epi16(src_r0r1_t1_32x8b, coeff2_3_32x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
//a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
//b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3
//b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3
src_r0r1_32x8b = _mm256_srli_si256(src_r0r1_32x8b, 2); //a4 a5 a6 a7 a8 a9....a15 0 0 0 0
src_r0r1_sht_32x8b = _mm256_srli_si256(src_r0r1_sht_32x8b, 2); //a5 a6 a7 a8 a9....a15 0 0 0 0 0
//b5 b6 b7 b8 b9....b15 0 0 0 0 0
src_r0r1_t1_32x8b = _mm256_unpacklo_epi8(src_r0r1_32x8b, src_r0r1_sht_32x8b); //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
//b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12
res_r0r1_t3_8x32b = _mm256_maddubs_epi16(src_r0r1_t1_32x8b, coeff4_5_32x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
//a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
//b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
res_r0r1_t1_8x32b = _mm256_add_epi16(res_r0r1_t1_8x32b, res_r0r1_t2_8x32b);
res_r0r1_t3_8x32b = _mm256_add_epi16(const_val16_8x32b, res_r0r1_t3_8x32b);
res_r0r1_t1_8x32b = _mm256_add_epi16(res_r0r1_t1_8x32b, res_r0r1_t3_8x32b);
res_r0r1_t1_8x32b = _mm256_srai_epi16(res_r0r1_t1_8x32b, 5); //shifting right by 5 bits.
res_32x8b = _mm256_packus_epi16(res_r0r1_t1_8x32b, res_r0r1_t1_8x32b);
res_32x8b = _mm256_avg_epu8(res_32x8b,src_r0r1_vpel_32x8b);
_mm_storel_epi64((__m128i *)(pu1_dst), _mm256_castsi256_si128(res_32x8b));
_mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), _mm256_castsi256_si128(_mm256_permute2x128_si256(res_32x8b,res_32x8b,0x1)));
ht -= 2;
pu1_pred_horiz += src_strd << 1;
pu1_dst += dst_strd << 1;
pu1_tmp2 += 16;
}
while(ht > 0);
}
}
else // wd == 16
{
//vertical q-pel filter
{
__m256i src_r0r2_32x8b, src_r1r3_32x8b, src_r2r4_32x8b,src_32x8b;
__m128i src_r2_16x8b,src_r4_16x8b,src_r5_16x8b,src_r6_16x8b,src_r4r5_16x8b;
__m256i res_t1_8x32b;
__m128i res_t0_8x16b,res_t3_8x16b,res_t1_8x16b,res_16x8b;
//epilogue: Load all the pred rows except sixth and seventh row for the
//first and second row processing.
src_r0r2_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_pred_vert + 2 * src_strd ),
(__m128i *)(pu1_pred_vert));
src_r2_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert + 2 * src_strd));
src_r1r3_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_pred_vert + 3 * src_strd ),
(__m128i *)(pu1_pred_vert + src_strd));
src_r4_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert + 4 * src_strd));
pu1_pred_vert = pu1_pred_vert + (5 * src_strd ) ;
//Core Loop: Process all the rows.
do
{
src_r5_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert));
src_r6_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert + src_strd));
src_r2r4_32x8b = _mm256_set_m128i(src_r4_16x8b,src_r2_16x8b);
src_32x8b = _mm256_unpacklo_epi8(src_r0r2_32x8b, src_r1r3_32x8b);
src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);
res_t1_8x32b = _mm256_maddubs_epi16(src_32x8b, coeff_32x8b);
res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
res_t1_8x16b = _mm_add_epi16(_mm256_castsi256_si128(res_t1_8x32b), _mm256_extracti128_si256 (res_t1_8x32b,0x1));
res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
res_t0_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits
src_32x8b = _mm256_unpackhi_epi8(src_r0r2_32x8b, src_r1r3_32x8b);
src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b);
res_t1_8x32b = _mm256_maddubs_epi16(src_32x8b, coeff_32x8b);
res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
res_t1_8x16b = _mm_add_epi16(_mm256_castsi256_si128(res_t1_8x32b),_mm256_extracti128_si256 (res_t1_8x32b,0x1));
res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
res_16x8b = _mm_packus_epi16(res_t0_8x16b, res_t1_8x16b);
_mm_storeu_si128((__m128i *)(pu1_tmp1), res_16x8b);
src_32x8b = _mm256_unpacklo_epi8(src_r1r3_32x8b, src_r2r4_32x8b);
src_r4r5_16x8b = _mm_unpacklo_epi8(src_r5_16x8b, src_r6_16x8b);
res_t1_8x32b = _mm256_maddubs_epi16(src_32x8b, coeff_32x8b);
res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
res_t1_8x16b = _mm_add_epi16(_mm256_castsi256_si128(res_t1_8x32b), _mm256_extracti128_si256 (res_t1_8x32b,0x1));
res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
res_t0_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits
src_32x8b = _mm256_unpackhi_epi8(src_r1r3_32x8b, src_r2r4_32x8b);
src_r4r5_16x8b = _mm_unpackhi_epi8(src_r5_16x8b, src_r6_16x8b);
res_t1_8x32b = _mm256_maddubs_epi16(src_32x8b, coeff_32x8b);
res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
res_t1_8x16b = _mm_add_epi16(_mm256_castsi256_si128(res_t1_8x32b), _mm256_extracti128_si256 (res_t1_8x32b,0x1));
res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits
res_16x8b = _mm_packus_epi16(res_t0_8x16b, res_t1_8x16b);
_mm_storeu_si128((__m128i *)(pu1_tmp1+16), res_16x8b);
src_r0r2_32x8b = src_r2r4_32x8b;
src_r1r3_32x8b = _mm256_set_m128i(src_r5_16x8b,_mm256_extracti128_si256(src_r1r3_32x8b,0x1));
src_r2_16x8b = src_r4_16x8b;
src_r4_16x8b = src_r6_16x8b;
ht_temp -= 2;
pu1_pred_vert += src_strd << 1;
pu1_tmp1 += 32;
}
while(ht_temp > 0);
}
//horizontal q-pel filter
{
__m256i src_r0r1_32x8b,src_r0r1_sht_32x8b,src_r0r1_t1_32x8b,res_32x8b;
__m128i src_vpel_16x8b,res_16x8b_128;
__m256i res_r0r1_t1_8x32b, res_r0r1_t2_8x32b, res_r0r1_t3_8x32b;
//Row0 (first 8 pixels)  : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
//Row0 (second 8 pixels) : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
//b0 is the same pixel as a8; in general bn is the same pixel as a(n+8). The two
//8-pixel halves of the row are handled in the two 128-bit lanes.
do
{
src_r0r1_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_pred_horiz + 8),
(__m128i *)(pu1_pred_horiz)); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
//b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15
src_vpel_16x8b = _mm_loadu_si128((__m128i *)(pu1_tmp2));
src_r0r1_sht_32x8b = _mm256_srli_si256(src_r0r1_32x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
//b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0
src_r0r1_t1_32x8b = _mm256_unpacklo_epi8(src_r0r1_32x8b, src_r0r1_sht_32x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
//b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8
res_r0r1_t1_8x32b = _mm256_maddubs_epi16(src_r0r1_t1_32x8b, coeff0_1_32x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
//a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
//b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
//b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1
src_r0r1_32x8b = _mm256_srli_si256(src_r0r1_32x8b, 2); //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
//b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0
src_r0r1_sht_32x8b = _mm256_srli_si256(src_r0r1_sht_32x8b, 2); //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0
//b3 b4 b5 b6 b7 b8 b9....b15 0 0 0
src_r0r1_t1_32x8b = _mm256_unpacklo_epi8(src_r0r1_32x8b, src_r0r1_sht_32x8b); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
//b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10
res_r0r1_t2_8x32b = _mm256_maddubs_epi16(src_r0r1_t1_32x8b, coeff2_3_32x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
//a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
//b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3
//b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3
src_r0r1_32x8b = _mm256_srli_si256(src_r0r1_32x8b, 2); //a4 a5 a6 a7 a8 a9....a15 0 0 0 0
//b4 b5 b6 b7 b8 b9....b15 0 0 0 0
src_r0r1_sht_32x8b = _mm256_srli_si256(src_r0r1_sht_32x8b, 2); //a5 a6 a7 a8 a9....a15 0 0 0 0 0
//b5 b6 b7 b8 b9....b15 0 0 0 0 0
src_r0r1_t1_32x8b = _mm256_unpacklo_epi8(src_r0r1_32x8b, src_r0r1_sht_32x8b); //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
//b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12
res_r0r1_t3_8x32b = _mm256_maddubs_epi16(src_r0r1_t1_32x8b, coeff4_5_32x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
//a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
//b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
//b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5
res_r0r1_t1_8x32b = _mm256_add_epi16(res_r0r1_t1_8x32b, res_r0r1_t2_8x32b);
res_r0r1_t3_8x32b = _mm256_add_epi16(const_val16_8x32b, res_r0r1_t3_8x32b);
res_r0r1_t1_8x32b = _mm256_add_epi16(res_r0r1_t1_8x32b, res_r0r1_t3_8x32b);
res_r0r1_t1_8x32b = _mm256_srai_epi16(res_r0r1_t1_8x32b, 5); //shifting right by 5 bits.
res_32x8b = _mm256_packus_epi16(res_r0r1_t1_8x32b, res_r0r1_t1_8x32b);
res_16x8b_128 = _mm256_castsi256_si128(_mm256_permute4x64_epi64(res_32x8b, 0xD8));
res_16x8b_128 = _mm_avg_epu8(res_16x8b_128, src_vpel_16x8b);
_mm_storeu_si128((__m128i *)(pu1_dst), res_16x8b_128);
ht --;
pu1_pred_horiz += src_strd;
pu1_dst += dst_strd;
pu1_tmp2 += 16;
}
while(ht > 0);
}
}
}

View file

@@ -0,0 +1,303 @@
/******************************************************************************
*
* Copyright (C) 2015 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*****************************************************************************
* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/**
*******************************************************************************
* @file
* ih264_iquant_itrans_recon_avx2.c
*
* @brief
* Contains function definitions for inverse quantization, inverse
* transform and reconstruction
*
* @author
* Priyanka Bose
*
* @par List of Functions:
* - ih264_iquant_itrans_recon_4x4_avx2()
*
* @remarks
* None
*
*******************************************************************************
*/
#include <stdio.h>
/* User include files */
#include "ih264_typedefs.h"
#include "ih264_defs.h"
#include "ih264_trans_macros.h"
#include "ih264_macros.h"
#include "ih264_platform_macros.h"
#include "ih264_trans_data.h"
#include "ih264_size_defs.h"
#include "ih264_structs.h"
#include "ih264_trans_quant_itrans_iquant.h"
#include <immintrin.h>
/*
********************************************************************************
*
* @brief This function reconstructs a 4x4 sub block from quantized residue and
* prediction buffer
*
* @par Description:
* The quantized residue is first inverse quantized, then inverse transformed.
* This inverse transformed content is added to the prediction buffer to recon-
* struct the end output
*
* @param[in] pi2_src
* quantized 4x4 block
*
* @param[in] pu1_pred
* prediction 4x4 block
*
* @param[out] pu1_out
* reconstructed 4x4 block
*
* @param[in] src_strd
* quantization buffer stride
*
* @param[in] pred_strd,
* Prediction buffer stride
*
* @param[in] out_strd
* recon buffer Stride
*
* @param[in] pu2_scaling_list
* pointer to scaling list
*
* @param[in] pu2_norm_adjust
* pointer to inverse scale matrix
*
* @param[in] u4_qp_div_6
* Floor (qp/6)
*
* @param[in] pi4_tmp
* temporary buffer of size 1*16
*
* @returns none
*
* @remarks none
*
*******************************************************************************
*/
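/* For reference (a rough scalar sketch, not normative text), the per-coefficient
 * math vectorised below is:
 *
 *     w = pi2_src[i] * pu2_iscal_mat[i] * pu2_weigh_mat[i];
 *     w = (u4_qp_div_6 >= 4) ? (w << (u4_qp_div_6 - 4))
 *                            : ((w + (1 << (3 - u4_qp_div_6))) >> (4 - u4_qp_div_6));
 *
 * followed by the 4x4 H.264 inverse transform butterflies; each inverse-transformed
 * value x is then reconstructed as ((x + 32) >> 6) + pu1_pred[i], saturated to
 * the range [0, 255].
 */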
void ih264_iquant_itrans_recon_4x4_avx2(WORD16 *pi2_src,
UWORD8 *pu1_pred,
UWORD8 *pu1_out,
WORD32 pred_strd,
WORD32 out_strd,
const UWORD16 *pu2_iscal_mat,
const UWORD16 *pu2_weigh_mat,
UWORD32 u4_qp_div_6,
WORD16 *pi2_tmp,
WORD32 iq_start_idx,
WORD16 *pi2_dc_ld_addr)
{
UWORD32 *pu4_out = (UWORD32 *) pu1_out;
__m256i src_r0_r1, src_r2_r3;
__m256i src_r0, src_r1, src_r2, src_r3;
__m256i scalemat_r0_r1, scalemat_r2_r3;
__m128i pred_r0, pred_r1, pred_r2, pred_r3;
__m256i sign_reg, dequant_r0_r1, dequant_r2_r3;
__m256i zero_8x32b = _mm256_setzero_si256(); // all bits reset to zero
__m256i temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
__m256i resq_r0, resq_r1, resq_r2, resq_r3;
__m256i add_rshift = _mm256_set1_epi32((u4_qp_div_6 < 4) ? (1 << (3 - u4_qp_div_6)) : 0);
__m256i value_32 = _mm256_set1_epi32(32);
__m128i value_32_128 = _mm_set1_epi32(32);
__m128i r0,r1,r2,r3,t0,t1,t2,t3,t4,t5,t6,t7,sign_reg_128, de_r0_r1,de_r2_r3,r0_r1,r2_r3,scale_r0_r1,scale_r2_r3;
__m128i zero_8x16b_128 = _mm_setzero_si128();
UNUSED (pi2_tmp);
/*************************************************************/
/* Dequantization of coefficients                           */
/*************************************************************/
src_r0_r1 = _mm256_loadu_si256((__m256i *) (pi2_src)); //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row
scalemat_r0_r1 = _mm256_loadu_si256((__m256i *) (pu2_iscal_mat)); //b00 b01 b02 b03 b10 b11 b12 b13 -- the scaling matrix 0th,1st row
dequant_r0_r1 = _mm256_loadu_si256((__m256i *) (pu2_weigh_mat)); //q00 q01 q02 q03 q10 q11 q12 q13 -- all 16 bits
temp0 = _mm256_mullo_epi16(scalemat_r0_r1, dequant_r0_r1);
temp4 = _mm256_unpacklo_epi16(temp0, zero_8x32b); // b00*q00 0 b01*q01 0 b02*q02 0 b03*q03 0 -- 16 bit long
temp5 = _mm256_unpackhi_epi16(temp0, zero_8x32b); // b10*q10 0 b11*q11 0 b12*q12 0 b13*q13 0 -- 16 bit long
src_r0 = _mm256_unpacklo_epi16(src_r0_r1, zero_8x32b); // a00 0 a01 0 a02 0 a03 0 -- 16 bit long
src_r1 = _mm256_unpackhi_epi16(src_r0_r1, zero_8x32b); // a10 0 a11 0 a12 0 a13 0 -- 16 bit long
temp0 = _mm256_madd_epi16(src_r0, temp4); //a00*b00*q00 a01*b01*q01 a02*b02*q02 a03*b03*q03 -- 32 bits long
temp1 = _mm256_madd_epi16(src_r1, temp5);
if (u4_qp_div_6 >= 4) {
resq_r0 = _mm256_slli_epi32(temp0, u4_qp_div_6 - 4);
resq_r1 = _mm256_slli_epi32(temp1, u4_qp_div_6 - 4);
} else {
temp4 = _mm256_add_epi32(temp0, add_rshift);
temp5 = _mm256_add_epi32(temp1, add_rshift);
resq_r0 = _mm256_srai_epi32(temp4, 4 - u4_qp_div_6);
resq_r1 = _mm256_srai_epi32(temp5, 4 - u4_qp_div_6);
}
if (iq_start_idx == 1)
resq_r0 = _mm256_insert_epi32(resq_r0,(WORD32)pi2_dc_ld_addr[0],0);
/* Perform Inverse transform */
/*-------------------------------------------------------------*/
/* IDCT [ Horizontal transformation ] */
/*-------------------------------------------------------------*/
// Matrix transpose
/*
* a0 a1 a2 a3
* b0 b1 b2 b3
* c0 c1 c2 c3
* d0 d1 d2 d3
*/
temp0 = _mm256_unpacklo_epi32(resq_r0, resq_r1); //a0 b0 a1 b1 | c0 d0 c1 d1
temp1 = _mm256_unpackhi_epi32(resq_r0, resq_r1); //a2 b2 a3 b3 | c2 d2 c3 d3
resq_r0 = _mm256_permute2f128_si256(temp0, temp1, 0x20);
resq_r1 = _mm256_permute2f128_si256(temp0, temp1, 0x31);
temp0 = _mm256_unpacklo_epi64(resq_r0, resq_r1); // w0 w2
temp1 = _mm256_unpackhi_epi64(resq_r0, resq_r1); // w1 w3
resq_r0 = _mm256_permute2f128_si256(temp0, temp1, 0x20);
resq_r1 = _mm256_permute2f128_si256(temp0, temp1, 0x31);
r0 = _mm256_extracti128_si256(resq_r0, 0x0);
r1 = _mm256_extracti128_si256(resq_r0, 0x1);
r2 = _mm256_extracti128_si256(resq_r1, 0x0);
r3 = _mm256_extracti128_si256(resq_r1, 0x1);
//Transform starts -- horizontal transform
/*------------------------------------------------------------------*/
/* z0 = w0 + w2 */
t0 = _mm_add_epi32(r0, r2);
/* z1 = w0 - w2 */
t1 = _mm_sub_epi32(r0, r2);
/* z2 = (w1 >> 1) - w3 */
t2 = _mm_srai_epi32(r1, 1); //(w1>>1)
t2 = _mm_sub_epi32(t2, r3); //(w1>>1) - w3
/* z3 = w1 + (w3 >> 1) */
t3 = _mm_srai_epi32(r3, 1); //(w3>>1)
t3 = _mm_add_epi32(t3, r1); //(w3>>1) + w1
/*----------------------------------------------------------*/
/* x0 = z0 + z3 */
r0 = _mm_add_epi32(t0, t3);
/* x1 = z1 + z2 */
r1 = _mm_add_epi32(t1, t2);
/* x2 = z1 - z2 */
r2 = _mm_sub_epi32(t1, t2);
/* x3 = z0 - z3 */
r3 = _mm_sub_epi32(t0, t3);
t1 = _mm_unpacklo_epi32(r0, r1); //a0 a1 b0 b1
t3 = _mm_unpacklo_epi32(r2, r3); //a2 a3 b2 b3
t2 = _mm_unpackhi_epi32(r0, r1); //c0 c1 d0 d1
t4 = _mm_unpackhi_epi32(r2, r3); //c2 c3 d2 d3
r0 = _mm_unpacklo_epi64(t1, t3); //a0 a1 a2 a3
r1 = _mm_unpackhi_epi64(t1, t3); //b0 b1 b2 b3
r2 = _mm_unpacklo_epi64(t2, t4); //c0 c1 c2 c3
r3 = _mm_unpackhi_epi64(t2, t4); //d0 d1 d2 d3
//Transform ends -- horizontal transform
//Load pred buffer
pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0])); //p00 p01 p02 p03 0 0 0 0 0 0 0 0 -- all 8 bits
pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[pred_strd])); //p10 p11 p12 p13 0 0 0 0 0 0 0 0 -- all 8 bits
pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * pred_strd])); //p20 p21 p22 p23 0 0 0 0 0 0 0 0 -- all 8 bits
pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * pred_strd])); //p30 p31 p32 p33 0 0 0 0 0 0 0 0 -- all 8 bits
pred_r0 = _mm_cvtepu8_epi32(pred_r0); //p00 p01 p02 p03 -- all 32 bits
pred_r1 = _mm_cvtepu8_epi32(pred_r1); //p10 p11 p12 p13 -- all 32 bits
pred_r2 = _mm_cvtepu8_epi32(pred_r2); //p20 p21 p22 p23 -- all 32 bits
pred_r3 = _mm_cvtepu8_epi32(pred_r3); //p30 p31 p32 p33 -- all 32 bits
/*--------------------------------------------------------------*/
/* IDCT [ Vertical transformation] and Xij = (xij + 32)>>6 */
/* */
/* Add the prediction and store it back to same buffer */
/*--------------------------------------------------------------*/
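/* z0j = y0j + y2j */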
t0 = _mm_add_epi32(r0, r2);
/* z1j = y0j - y2j */
t1 = _mm_sub_epi32(r0, r2);
/* z2j = (y1j>>1) - y3j */
t2 = _mm_srai_epi32(r1, 1); //(y1j>>1)
t2 = _mm_sub_epi32(t2, r3);
/* z3j = y1j + (y3j>>1) */
t3 = _mm_srai_epi32(r3, 1); //(y3j>>1)
t3 = _mm_add_epi32(r1, t3);
t4 = _mm_add_epi32(t0, t3);
t4 = _mm_add_epi32(t4, value_32_128);
t4 = _mm_srai_epi32(t4, 6);
t4 = _mm_add_epi32(t4, pred_r0);
t5 = _mm_add_epi32(t1, t2);
t5 = _mm_add_epi32(t5, value_32_128);
t5 = _mm_srai_epi32(t5, 6);
t5 = _mm_add_epi32(t5, pred_r1);
t6 = _mm_sub_epi32(t1, t2);
t6 = _mm_add_epi32(t6, value_32_128);
t6 = _mm_srai_epi32(t6, 6);
t6 = _mm_add_epi32(t6, pred_r2);
t7 = _mm_sub_epi32(t0, t3);
t7 = _mm_add_epi32(t7, value_32_128);
t7 = _mm_srai_epi32(t7, 6);
t7 = _mm_add_epi32(t7, pred_r3);
// 32-bit to 16-bit conversion
t0 = _mm_packs_epi32(t4, t5);
t1 = _mm_packs_epi32(t6, t7);
//Clipping the results to 8 bits
sign_reg_128 = _mm_cmpgt_epi16(t0, zero_8x16b_128); // sign check
t0 = _mm_and_si128(t0, sign_reg_128);
sign_reg_128 = _mm_cmpgt_epi16(t1, zero_8x16b_128);
t1 = _mm_and_si128(t1, sign_reg_128);
r0 = _mm_packus_epi16(t0, t1);
r1 = _mm_srli_si128(r0, 4);
r2 = _mm_srli_si128(r1, 4);
r3 = _mm_srli_si128(r2, 4);
*pu4_out = _mm_cvtsi128_si32(r0);
pu1_out += out_strd;
pu4_out = (UWORD32 *) (pu1_out);
*(pu4_out) = _mm_cvtsi128_si32(r1);
pu1_out += out_strd;
pu4_out = (UWORD32 *) (pu1_out);
*(pu4_out) = _mm_cvtsi128_si32(r2);
pu1_out += out_strd;
pu4_out = (UWORD32 *) (pu1_out);
*(pu4_out) = _mm_cvtsi128_si32(r3);
}

View file

@@ -0,0 +1,501 @@
/******************************************************************************
*
* Copyright (C) 2015 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*****************************************************************************
* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/*****************************************************************************/
/*****************************************************************************/
/* File Includes */
/*****************************************************************************/
#include <immintrin.h>
#include "ih264_typedefs.h"
#include "ih264_macros.h"
#include "ih264_platform_macros.h"
#include "ih264_weighted_pred.h"
#include <stdint.h>
#include <string.h>
#include <stdio.h>
/*****************************************************************************/
/* */
/* Function Name : ih264_weighted_bi_pred_luma_avx2 */
/* */
/* Description : This function performs the weighted biprediction as */
/* described in sec 8.4.2.3.2 titled "Weighted sample */
/* prediction process" for luma. The function gets two */
/* ht x wd blocks, weights them, adds them, rounds off the */
/* sum, offsets it, saturates it to unsigned 8-bit and */
/* stores it in the destination block. (ht,wd) can be */
/* (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16). */
/* */
/* Inputs : pu1_src1 - Pointer to source 1 */
/* pu1_src2 - Pointer to source 2 */
/* pu1_dst - Pointer to destination */
/* src_strd1 - stride for source 1 */
/* src_strd2 - stride for source 2 */
/* dst_strd - stride for destination */
/* log_wd - number of bits to be rounded off */
/* wt1 - weight value for source 1 */
/* wt2 - weight value for source 2 */
/* ofst1 - offset value for source 1 */
/* ofst2 - offset value for source 2 */
/* ht - height of the block */
/* wd - width of the block */
/* */
/* Issues : None */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes */
/* 04 02 2015 Kaushik Initial Version */
/* Senthoor */
/* 15 09 2020 Priyanka Bose AVX2 Intel Intrinsics Support */
/*****************************************************************************/
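/* Per-pixel sketch of the computation below (see sec 8.4.2.3.2), for reference:
 *     round = 1 << log_wd;
 *     ofst  = (ofst1 + ofst2 + 1) >> 1;
 *     dst   = ((src1 * wt1 + src2 * wt2 + round) >> (log_wd + 1)) + ofst,
 *             saturated to [0, 255].
 */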
void ih264_weighted_bi_pred_luma_avx2(UWORD8 *pu1_src1,
UWORD8 *pu1_src2,
UWORD8 *pu1_dst,
WORD32 src_strd1,
WORD32 src_strd2,
WORD32 dst_strd,
WORD32 log_wd,
WORD32 wt1,
WORD32 wt2,
WORD32 ofst1,
WORD32 ofst2,
WORD32 ht,
WORD32 wd)
{
__m256i wt1_8x32b, wt2_8x32b;
__m256i ofst_8x32b, round_8x32b;
__m256i zero;
WORD32 ofst;
WORD32 round_val, shft;
zero = _mm256_set1_epi8(0);
wt1 = (WORD16)(wt1 & 0xffff);
wt2 = (WORD16)(wt2 & 0xffff);
round_val = 1 << log_wd;
shft = log_wd + 1;
ofst1 = (WORD8)(ofst1 & 0xff);
ofst2 = (WORD8)(ofst2 & 0xff);
ofst = (ofst1 + ofst2 + 1) >> 1;
wt1_8x32b = _mm256_set1_epi16(wt1);
wt2_8x32b = _mm256_set1_epi16(wt2);
round_8x32b = _mm256_set1_epi16(round_val);
ofst_8x32b = _mm256_set1_epi16(ofst);
if(wd == 4)
{
__m128i y1_2_16x8b, y1_3_16x8b;
__m128i y2_2_16x8b, y2_3_16x8b;
__m256i y1_02_32x8b,y1_13_32x8b,y2_02_32x8b,y2_13_32x8b,y1_0_32x8b,y2_0_32x8b,y1_0_8x32b,y2_1_8x32b,y2_0_8x32b;
__m128i y1_0_16x8b_128,y2_0_16x8b_128,y1_1_16x8b_128,y1_2_16x8b_128,y1_3_16x8b_128;
do
{
y1_02_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src1 + (src_strd1 << 1)), (__m128i *)(pu1_src1));
y1_13_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src1 + src_strd1 * 3), (__m128i *)(pu1_src1 + src_strd1));
y2_02_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src2 + (src_strd2 << 1)), (__m128i *)(pu1_src2));
y2_13_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src2 + src_strd2 * 3), (__m128i *)(pu1_src2 + src_strd2));
y1_02_32x8b = _mm256_unpacklo_epi64(y1_02_32x8b, zero);
y1_13_32x8b = _mm256_unpacklo_epi64(y1_13_32x8b, zero);
y2_02_32x8b = _mm256_unpacklo_epi64(y2_02_32x8b, zero);
y2_13_32x8b = _mm256_unpacklo_epi64(y2_13_32x8b, zero);
y1_0_32x8b = _mm256_unpacklo_epi32(y1_02_32x8b, y1_13_32x8b);
y2_0_32x8b = _mm256_unpacklo_epi32(y2_02_32x8b, y2_13_32x8b);
y1_0_16x8b_128 = _mm256_castsi256_si128(_mm256_permute4x64_epi64(y1_0_32x8b, 0xD8));
y2_0_16x8b_128 = _mm256_castsi256_si128(_mm256_permute4x64_epi64(y2_0_32x8b, 0xD8));
y1_0_8x32b = _mm256_cvtepu8_epi16(y1_0_16x8b_128); // 8 to 16
y2_0_8x32b = _mm256_cvtepu8_epi16(y2_0_16x8b_128);
y1_0_8x32b = _mm256_mullo_epi16(y1_0_8x32b, wt1_8x32b);
y2_0_8x32b = _mm256_mullo_epi16(y2_0_8x32b, wt2_8x32b);
y1_0_8x32b = _mm256_adds_epi16(y1_0_8x32b, y2_0_8x32b);
y1_0_8x32b = _mm256_adds_epi16(round_8x32b, y1_0_8x32b);
y1_0_8x32b = _mm256_srai_epi16(y1_0_8x32b, shft);
y1_0_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_0_8x32b);
y1_0_16x8b_128 = _mm256_castsi256_si128(_mm256_packus_epi16(y1_0_8x32b, y1_0_8x32b));
y1_2_16x8b_128 = _mm_srli_si128(y1_0_16x8b_128, 4);
y1_1_16x8b_128 = _mm_srli_si128(y1_0_16x8b_128, 8);
y1_3_16x8b_128 = _mm_srli_si128(y1_0_16x8b_128, 12);
*((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y1_0_16x8b_128);
*((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y1_1_16x8b_128);
*((WORD32 *)(pu1_dst + (dst_strd << 1))) = _mm_cvtsi128_si32(y1_2_16x8b_128);
*((WORD32 *)(pu1_dst + dst_strd * 3)) = _mm_cvtsi128_si32(y1_3_16x8b_128);
ht -= 4;
pu1_src1 += src_strd1 << 2;
pu1_src2 += src_strd2 << 2;
pu1_dst += dst_strd << 2;
}
while(ht > 0);
}
else if(wd == 8)
{
__m128i y1_0_16x8b_128,y2_0_16x8b_128,y1_2_16x8b_128,y1_1_16x8b_128,y1_3_16x8b_128;
__m256i y1_02_32x8b,y1_13_32x8b,y2_02_32x8b,y2_13_32x8b,y1_0_32x8b,y2_0_32x8b,y1_0_8x32b;
__m256i y1_1_8x32b,y2_0_8x32b,y2_1_8x32b;
do
{
y1_02_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src1 + (src_strd1 << 1)), (__m128i *)(pu1_src1));
y1_13_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src1 + src_strd1 * 3), (__m128i *)(pu1_src1 + src_strd1));
y2_02_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src2 + (src_strd2 << 1)), (__m128i *)(pu1_src2));
y2_13_32x8b = _mm256_loadu2_m128i((__m128i *)(pu1_src2 + src_strd2 * 3), (__m128i *)(pu1_src2 + src_strd2));
y1_02_32x8b = _mm256_unpacklo_epi64(y1_02_32x8b, zero);
y1_13_32x8b = _mm256_unpacklo_epi64(y1_13_32x8b, zero);
y2_02_32x8b = _mm256_unpacklo_epi64(y2_02_32x8b, zero);
y2_13_32x8b = _mm256_unpacklo_epi64(y2_13_32x8b, zero);
y1_0_32x8b = _mm256_unpacklo_epi64(y1_02_32x8b, y1_13_32x8b);
y2_0_32x8b = _mm256_unpacklo_epi64(y2_02_32x8b, y2_13_32x8b);
y1_0_8x32b = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(y1_0_32x8b));
y1_0_16x8b_128 = _mm256_castsi256_si128(_mm256_permute2x128_si256(y1_0_32x8b,y1_0_32x8b,0x1));
y1_1_8x32b = _mm256_cvtepu8_epi16(y1_0_16x8b_128);
y2_0_8x32b = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(y2_0_32x8b));
y2_0_16x8b_128 = _mm256_castsi256_si128(_mm256_permute2x128_si256(y2_0_32x8b,y2_0_32x8b,0x1));
y2_1_8x32b = _mm256_cvtepu8_epi16(y2_0_16x8b_128);
y1_0_8x32b = _mm256_mullo_epi16(y1_0_8x32b, wt1_8x32b);
y2_0_8x32b = _mm256_mullo_epi16(y2_0_8x32b, wt2_8x32b);
y1_1_8x32b = _mm256_mullo_epi16(y1_1_8x32b, wt1_8x32b);
y2_1_8x32b = _mm256_mullo_epi16(y2_1_8x32b, wt2_8x32b);
y1_0_8x32b = _mm256_adds_epi16(y1_0_8x32b, y2_0_8x32b);
y1_1_8x32b = _mm256_adds_epi16(y1_1_8x32b, y2_1_8x32b);
y1_0_8x32b = _mm256_adds_epi16(round_8x32b, y1_0_8x32b);
y1_1_8x32b = _mm256_adds_epi16(round_8x32b, y1_1_8x32b);
y1_0_8x32b = _mm256_srai_epi16(y1_0_8x32b, shft);
y1_1_8x32b = _mm256_srai_epi16(y1_1_8x32b, shft);
y1_0_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_0_8x32b);
y1_1_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_1_8x32b);
y1_0_32x8b = _mm256_packus_epi16(y1_0_8x32b, y1_1_8x32b);
y1_0_16x8b_128 = _mm256_castsi256_si128(y1_0_32x8b);
y1_2_16x8b_128 = _mm256_castsi256_si128(_mm256_srli_si256(y1_0_32x8b, 8));
y1_0_32x8b = _mm256_permute2x128_si256(y1_0_32x8b,y1_0_32x8b,1);
y1_1_16x8b_128 = _mm256_castsi256_si128(y1_0_32x8b);
y1_3_16x8b_128 = _mm256_castsi256_si128(_mm256_srli_si256(y1_0_32x8b, 8));
_mm_storel_epi64((__m128i *)pu1_dst, y1_0_16x8b_128);
_mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b_128);
_mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd << 1)), y1_2_16x8b_128);
_mm_storel_epi64((__m128i *)(pu1_dst + dst_strd * 3), y1_3_16x8b_128);
ht -= 4;
pu1_src1 += src_strd1 << 2;
pu1_src2 += src_strd2 << 2;
pu1_dst += dst_strd << 2;
}
while(ht > 0);
}
else // wd == 16
{
__m256i y1_0L_8x32b, y1_0H_8x32b, y1_1L_8x32b, y1_1H_8x32b;
__m256i y2_0L_8x32b, y2_0H_8x32b, y2_1L_8x32b, y2_1H_8x32b;
__m256i zero_32x8b,y1_0_32x8b,y2_0_32x8b;
zero_32x8b = _mm256_set1_epi8(0);
do
{
y1_0_32x8b = _mm256_loadu_si256((__m256i *)pu1_src1);
y2_0_32x8b = _mm256_loadu_si256((__m256i *)pu1_src2);
y1_0L_8x32b = _mm256_unpacklo_epi8(y1_0_32x8b, zero_32x8b);
y1_0H_8x32b = _mm256_unpackhi_epi8(y1_0_32x8b, zero_32x8b);
y2_0L_8x32b = _mm256_unpacklo_epi8(y2_0_32x8b,zero_32x8b);
y2_0H_8x32b = _mm256_unpackhi_epi8(y2_0_32x8b, zero_32x8b);
y1_0L_8x32b = _mm256_mullo_epi16(y1_0L_8x32b, wt1_8x32b);
y1_0H_8x32b = _mm256_mullo_epi16(y1_0H_8x32b, wt1_8x32b);
y2_0L_8x32b = _mm256_mullo_epi16(y2_0L_8x32b, wt2_8x32b);
y2_0H_8x32b = _mm256_mullo_epi16(y2_0H_8x32b, wt2_8x32b);
y1_0L_8x32b = _mm256_adds_epi16(y1_0L_8x32b, y2_0L_8x32b);
y1_0H_8x32b = _mm256_adds_epi16(y1_0H_8x32b, y2_0H_8x32b);
y1_0L_8x32b = _mm256_adds_epi16(round_8x32b, y1_0L_8x32b);
y1_0H_8x32b = _mm256_adds_epi16(round_8x32b, y1_0H_8x32b);
y1_0L_8x32b = _mm256_srai_epi16(y1_0L_8x32b, shft);
y1_0H_8x32b = _mm256_srai_epi16(y1_0H_8x32b, shft);
y1_0L_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_0L_8x32b);
y1_0H_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_0H_8x32b);
y1_0_32x8b = _mm256_packus_epi16(y1_0L_8x32b, y1_0H_8x32b);
_mm256_storeu_si256((__m256i *)pu1_dst, y1_0_32x8b);
ht -= 2;
pu1_src1 += src_strd1 << 1;
pu1_src2 += src_strd2 << 1;
pu1_dst += dst_strd << 1;
}
while(ht > 0);
}
}
/*****************************************************************************/
/* */
/* Function Name : ih264_weighted_bi_pred_chroma_avx2 */
/* */
/* Description : This function performs the weighted biprediction as */
/* described in sec 8.4.2.3.2 titled "Weighted sample */
/* prediction process" for chroma. The function gets two */
/* ht x wd blocks, weights them, adds them, rounds off the */
/* sum, offsets it, saturates it to unsigned 8-bit and */
/* stores it in the destination block. (ht,wd) can be */
/* (2,2), (4,2), (2,4), (4,4), (8,4), (4,8) or (8,8). */
/* */
/* Inputs : pu1_src1 - Pointer to source 1 */
/* pu1_src2 - Pointer to source 2 */
/* pu1_dst - Pointer to destination */
/* src_strd1 - stride for source 1 */
/* src_strd2 - stride for source 2 */
/* dst_strd - stride for destination */
/* log_wd - number of bits to be rounded off */
/* wt1 - weight values for u and v in source 1 */
/* wt2 - weight values for u and v in source 2 */
/* ofst1 - offset value for u and v in source 1 */
/* ofst2 - offset value for u and v in source 2 */
/* ht - height of the block */
/* wd - width of the block */
/* */
/* Issues : None */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes */
/* 04 02 2015 Kaushik Initial Version */
/* Senthoor */
/* 15 09 2020 Priyanka Bose AVX2 Intel Intrinsics Support */
/*****************************************************************************/
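/* For reference: wt1/wt2 pack the U and V weights as the low/high 16-bit halves
 * of a 32-bit word, and ofst1/ofst2 pack the U and V offsets in their low/high
 * bytes; each chroma component then follows the same formula as the luma case:
 *     dst = ((src1 * w1 + src2 * w2 + (1 << log_wd)) >> (log_wd + 1))
 *           + ((o1 + o2 + 1) >> 1), saturated to [0, 255],
 * with w1/w2/o1/o2 being the per-component weights and offsets.
 */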
void ih264_weighted_bi_pred_chroma_avx2(UWORD8 *pu1_src1,
UWORD8 *pu1_src2,
UWORD8 *pu1_dst,
WORD32 src_strd1,
WORD32 src_strd2,
WORD32 dst_strd,
WORD32 log_wd,
WORD32 wt1,
WORD32 wt2,
WORD32 ofst1,
WORD32 ofst2,
WORD32 ht,
WORD32 wd)
{
__m128i y1_0_16x8b, y1_1_16x8b;
__m128i y2_0_16x8b, y2_1_16x8b;
__m128i wt1_8x16b, wt2_8x16b;
__m128i ofst_8x16b, round_8x16b;
WORD32 ofst1_u, ofst2_u, ofst_u;
WORD32 ofst1_v, ofst2_v, ofst_v;
WORD32 round_val, shft, ofst_val,ofst_val_256;
round_val = 1 << log_wd;
shft = log_wd + 1;
ofst1_u = (WORD8)(ofst1 & 0xff);
ofst1_v = (WORD8)(ofst1 >> 8);
ofst2_u = (WORD8)(ofst2 & 0xff);
ofst2_v = (WORD8)(ofst2 >> 8);
wt1_8x16b = _mm_set1_epi32(wt1);
wt2_8x16b = _mm_set1_epi32(wt2);
ofst_u = (ofst1_u + ofst2_u + 1) >> 1;
ofst_v = (ofst1_v + ofst2_v + 1) >> 1;
ofst_val = (ofst_u & 0xffff) | (ofst_v << 16);
ofst_val_256 = (ofst_u & 0xffff) | (ofst_v << 16);
round_8x16b = _mm_set1_epi16(round_val);
ofst_8x16b = _mm_set1_epi32(ofst_val);
if(wd == 2)
{
__m128i y1_0_8x16b, y2_0_8x16b;
do
{
y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1); //Loading 64 bits from diff location
y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
y1_0_16x8b = _mm_unpacklo_epi32(y1_0_16x8b, y1_1_16x8b);
y2_0_16x8b = _mm_unpacklo_epi32(y2_0_16x8b, y2_1_16x8b);
y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);
y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);
y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);
y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_0_8x16b);
y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 4);
*((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y1_0_16x8b);
*((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y1_1_16x8b);
ht -= 2;
pu1_src1 += src_strd1 << 1;
pu1_src2 += src_strd2 << 1;
pu1_dst += dst_strd << 1;
}
while(ht > 0);
}
else if(wd == 4)
{
__m128i y1_0_8x16b, y1_1_8x16b;
__m128i y2_0_8x16b, y2_1_8x16b;
do
{
y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1); //Loading 64 bits from diff location
y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
y1_1_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);
y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
y2_1_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);
y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);
y1_1_8x16b = _mm_mullo_epi16(y1_1_8x16b, wt1_8x16b);
y2_1_8x16b = _mm_mullo_epi16(y2_1_8x16b, wt2_8x16b);
y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
y1_1_8x16b = _mm_adds_epi16(y1_1_8x16b, y2_1_8x16b);
y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);
y1_1_8x16b = _mm_adds_epi16(round_8x16b, y1_1_8x16b);
y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
y1_1_8x16b = _mm_srai_epi16(y1_1_8x16b, shft);
y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);
y1_1_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1_8x16b);
y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_1_8x16b);
y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 8);
_mm_storel_epi64((__m128i *)pu1_dst, y1_0_16x8b);
_mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);
ht -= 2;
pu1_src1 += src_strd1 << 1;
pu1_src2 += src_strd2 << 1;
pu1_dst += dst_strd << 1;
}
while(ht > 0);
}
else // wd == 8
{
__m256i y1_0L_8x32b, y1_0H_8x32b, y1_1L_8x32b, y1_1H_8x32b;
__m256i y2_0L_8x32b, y2_0H_8x32b, y2_1L_8x32b, y2_1H_8x32b;
__m256i y1_0_32x8b,y2_0_32x8b,ofst_8x32b,round_8x32b;
__m256i wt1_8x32b, wt2_8x32b;
__m256i zero_32x8b;
wt1_8x32b = _mm256_set1_epi32(wt1);
wt2_8x32b = _mm256_set1_epi32(wt2);
round_8x32b = _mm256_set1_epi16(round_val);
ofst_8x32b = _mm256_set1_epi32(ofst_val_256);
zero_32x8b = _mm256_set1_epi8(0);
do
{
y1_0_32x8b = _mm256_loadu_si256((__m256i *)pu1_src1);
y2_0_32x8b = _mm256_loadu_si256((__m256i *)pu1_src2);
y1_0L_8x32b = _mm256_unpacklo_epi8(y1_0_32x8b, zero_32x8b);
y1_0H_8x32b = _mm256_unpackhi_epi8(y1_0_32x8b, zero_32x8b);
y2_0L_8x32b = _mm256_unpacklo_epi8(y2_0_32x8b, zero_32x8b);
y2_0H_8x32b = _mm256_unpackhi_epi8(y2_0_32x8b, zero_32x8b);
y1_0L_8x32b = _mm256_mullo_epi16(y1_0L_8x32b, wt1_8x32b);
y1_0H_8x32b = _mm256_mullo_epi16(y1_0H_8x32b, wt1_8x32b);
y2_0L_8x32b = _mm256_mullo_epi16(y2_0L_8x32b, wt2_8x32b);
y2_0H_8x32b = _mm256_mullo_epi16(y2_0H_8x32b, wt2_8x32b);
y1_0L_8x32b = _mm256_adds_epi16(y1_0L_8x32b, y2_0L_8x32b);
y1_0H_8x32b = _mm256_adds_epi16(y1_0H_8x32b, y2_0H_8x32b);
y1_0L_8x32b = _mm256_adds_epi16(round_8x32b, y1_0L_8x32b);
y1_0H_8x32b = _mm256_adds_epi16(round_8x32b, y1_0H_8x32b);
y1_0L_8x32b = _mm256_srai_epi16(y1_0L_8x32b, shft);
y1_0H_8x32b = _mm256_srai_epi16(y1_0H_8x32b, shft);
y1_0L_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_0L_8x32b);
y1_0H_8x32b = _mm256_adds_epi16(ofst_8x32b, y1_0H_8x32b);
y1_0_32x8b = _mm256_packus_epi16(y1_0L_8x32b, y1_0H_8x32b);
_mm256_storeu_si256((__m256i *)pu1_dst, y1_0_32x8b);
ht -= 2;
pu1_src1 += src_strd1 << 1;
pu1_src2 += src_strd2 << 1;
pu1_dst += dst_strd << 1;
}
while(ht > 0);
}
}

View file

@@ -67,5 +67,6 @@ void ih264d_init_function_ptr_sse42(dec_struct_t *ps_codec);
void ih264d_init_function_ptr_a9q(dec_struct_t *ps_codec);
void ih264d_init_function_ptr_av8(dec_struct_t *ps_codec);
void ih264d_init_function_ptr_avx2(dec_struct_t *ps_codec);
#endif /* _IH264D_FUNCTION_SELECTOR_H_ */

View file

@@ -46,7 +46,8 @@ else()
list(
APPEND LIBAVCDEC_SRCS "${AVC_ROOT}/decoder/x86/ih264d_function_selector.c"
"${AVC_ROOT}/decoder/x86/ih264d_function_selector_sse42.c"
"${AVC_ROOT}/decoder/x86/ih264d_function_selector_ssse3.c")
"${AVC_ROOT}/decoder/x86/ih264d_function_selector_ssse3.c"
"${AVC_ROOT}/decoder/x86/ih264d_function_selector_avx2.c")
endif()
add_library(libavcdec STATIC ${LIBAVC_COMMON_SRCS} ${LIBAVC_COMMON_ASMS}

View file

@@ -54,7 +54,8 @@ else()
list(
APPEND LIBMVCDEC_ASMS "${AVC_ROOT}/decoder/x86/ih264d_function_selector.c"
"${AVC_ROOT}/decoder/x86/ih264d_function_selector_sse42.c"
"${AVC_ROOT}/decoder/x86/ih264d_function_selector_ssse3.c")
"${AVC_ROOT}/decoder/x86/ih264d_function_selector_ssse3.c"
"${AVC_ROOT}/decoder/x86/ih264d_function_selector_avx2.c")
endif()
add_library(libmvcdec STATIC ${LIBAVC_COMMON_SRCS} ${LIBAVC_COMMON_ASMS}

View file

@@ -95,7 +95,8 @@ else()
"${AVC_ROOT}/decoder/x86/svc/isvcd_iquant_itrans_residual_sse42.c"
"${AVC_ROOT}/decoder/x86/svc/isvcd_iquant_itrans_sse42.c"
"${AVC_ROOT}/decoder/x86/svc/isvcd_pred_residual_recon_sse42.c"
"${AVC_ROOT}/decoder/x86/svc/isvcd_residual_resamp_sse42.c")
"${AVC_ROOT}/decoder/x86/svc/isvcd_residual_resamp_sse42.c"
"${AVC_ROOT}/decoder/x86/ih264d_function_selector_avx2.c")
endif()
add_library(libsvcdec STATIC ${LIBAVC_COMMON_SRCS} ${LIBAVC_COMMON_ASMS}

View file

@@ -52,6 +52,7 @@
#include "ih264_error.h"
#include "ih264_trans_quant_itrans_iquant.h"
#include "ih264_inter_pred_filters.h"
#include "ih264_deblk_edge_filters.h"
#include "ih264d_structs.h"
#include "ih264d_function_selector.h"
@@ -68,6 +69,11 @@ void ih264d_init_function_ptr(dec_struct_t *ps_codec)
case ARCH_X86_SSSE3:
ih264d_init_function_ptr_ssse3(ps_codec);
break;
case ARCH_X86_AVX2:
ih264d_init_function_ptr_ssse3(ps_codec);
ih264d_init_function_ptr_sse42(ps_codec);
ih264d_init_function_ptr_avx2(ps_codec);
break;
case ARCH_X86_SSE42:
default:
ih264d_init_function_ptr_ssse3(ps_codec);
@@ -83,7 +89,7 @@ void ih264d_init_arch(dec_struct_t *ps_codec)
#elif DEFAULT_ARCH == D_ARCH_X86_SSSE3
ps_codec->e_processor_arch = ARCH_X86_SSSE3;
#elif DEFAULT_ARCH == D_ARCH_X86_AVX2
ps_codec->e_processor_arch = D_ARCH_X86_AVX2;
ps_codec->e_processor_arch = ARCH_X86_AVX2;
#else
ps_codec->e_processor_arch = ARCH_X86_GENERIC;
#endif

View file

@@ -0,0 +1,102 @@
/******************************************************************************
*
* Copyright (C) 2015 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*****************************************************************************
* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/**
*******************************************************************************
* @file
* ih264d_function_selector_avx2.c
*
* @brief
* Contains functions to initialize function pointers of codec context
*
* @author
* Ittiam
*
* @par List of Functions:
* - ih264d_init_function_ptr_avx2
*
* @remarks
* None
*
*******************************************************************************
*/
/*****************************************************************************/
/* File Includes */
/*****************************************************************************/
/* System Include files */
#include <stdio.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
/* User Include files */
#include "ih264_typedefs.h"
#include "iv.h"
#include "ivd.h"
#include "ih264_defs.h"
#include "ih264_size_defs.h"
#include "ih264_error.h"
#include "ih264_trans_quant_itrans_iquant.h"
#include "ih264_inter_pred_filters.h"
#include "ih264_trans_quant_itrans_iquant.h"
#include "ih264_weighted_pred.h"
#include "ih264d_structs.h"
#include "ih264_deblk_edge_filters.h"
/**
*******************************************************************************
*
* @brief Initialize the intra/inter/transform/deblk function pointers of
* codec context
*
* @par Description: the current routine initializes the function pointers of
* codec context based on the architecture in use
*
* @param[in] ps_codec
* Codec context pointer
*
* @returns none
*
* @remarks none
*
*******************************************************************************
*/
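/* Note: for ARCH_X86_AVX2, ih264d_init_function_ptr() installs the SSSE3 and
 * SSE4.2 tables first and then calls this routine, so only the function
 * pointers assigned below are overridden with their AVX2 variants.
 */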
void ih264d_init_function_ptr_avx2(dec_struct_t *ps_codec)
{
ps_codec->pf_deblk_chroma_vert_bslt4 = ih264_deblk_chroma_vert_bslt4_avx2;
ps_codec->pf_deblk_chroma_horz_bslt4 = ih264_deblk_chroma_horz_bslt4_avx2;
ps_codec->pf_deblk_luma_horz_bslt4 = ih264_deblk_luma_horz_bslt4_avx2;
//ps_codec->pf_deblk_luma_vert_bslt4 = ih264_deblk_luma_vert_bslt4_avx2;
ps_codec->apf_inter_pred_luma[5] = ih264_inter_pred_luma_horz_qpel_vert_qpel_avx2;
ps_codec->apf_inter_pred_luma[7] = ih264_inter_pred_luma_horz_qpel_vert_qpel_avx2;
ps_codec->apf_inter_pred_luma[13] = ih264_inter_pred_luma_horz_qpel_vert_qpel_avx2;
ps_codec->apf_inter_pred_luma[15] = ih264_inter_pred_luma_horz_qpel_vert_qpel_avx2;
ps_codec->pf_ihadamard_scaling_4x4 = ih264_ihadamard_scaling_4x4_avx2;
ps_codec->pf_inter_pred_chroma = ih264_inter_pred_chroma_avx2;
ps_codec->apf_inter_pred_luma[0] = ih264_inter_pred_luma_copy_avx2;
ps_codec->pf_iquant_itrans_recon_luma_4x4 = ih264_iquant_itrans_recon_4x4_avx2;
ps_codec->pf_weighted_bi_pred_luma = ih264_weighted_bi_pred_luma_avx2;
ps_codec->pf_weighted_bi_pred_chroma = ih264_weighted_bi_pred_chroma_avx2;
return;
}